1use crate::error::{StatsError, StatsResult};
13use crate::simd_enhanced_v6::AdvancedSimdOps;
14use scirs2_core::ndarray::ArrayView1;
15use scirs2_core::numeric::{Float, NumCast, One, Zero};
16use scirs2_core::{
17 simd_ops::{PlatformCapabilities, SimdUnifiedOps},
18 validation::*,
19};
20use std::marker::PhantomData;
21
22#[derive(Debug, Clone)]
24pub struct AdvancedSimdConfig {
25 pub platform: PlatformCapabilities,
27 pub vector_width: usize,
29 pub cache_linesize: usize,
31 pub l1_cachesize: usize,
33 pub l2_cachesize: usize,
35 pub enable_prefetch: bool,
37 pub enable_cache_blocking: bool,
39 pub enable_pipelining: bool,
41 pub advanced_simd_threshold: usize,
43}
44
45impl Default for AdvancedSimdConfig {
46 fn default() -> Self {
47 let platform = PlatformCapabilities::detect();
48
49 let vector_width = if platform.avx512_available {
50 16 } else if platform.avx2_available {
52 8 } else if platform.simd_available {
54 4 } else {
56 1 };
58
59 Self {
60 platform,
61 vector_width,
62 cache_linesize: 64, l1_cachesize: 32768, l2_cachesize: 262144, enable_prefetch: true,
66 enable_cache_blocking: true,
67 enable_pipelining: true,
68 advanced_simd_threshold: 256,
69 }
70 }
71}
72
73#[derive(Debug, Clone, Copy)]
75pub enum VectorStrategy {
76 SingleVector,
78 MultiVector { num_registers: usize },
80 UnrolledVector { unroll_factor: usize },
82 CacheBlockedVector { blocksize: usize },
84}
85
86#[derive(Debug, Clone, Copy)]
88pub enum MemoryPattern {
89 SequentialPrefetch,
91 Strided { stride: usize },
93 Tiled { tilesize: usize },
95 Blocked { blocksize: usize },
97}
98
99pub struct AdvancedSimdProcessor<F> {
101 config: AdvancedSimdConfig,
102 vector_strategy: VectorStrategy,
103 memory_pattern: MemoryPattern,
104 _phantom: PhantomData<F>,
105}
106
107#[derive(Debug, Clone)]
109pub struct AdvancedStatsResult<F> {
110 pub mean: F,
112 pub variance: F,
113 pub std_dev: F,
114 pub skewness: F,
115 pub kurtosis: F,
116 pub min: F,
117 pub max: F,
118 pub simd_utilization: f64,
120 pub cache_efficiency: f64,
121 pub vector_operations_count: usize,
122 pub prefetch_efficiency: f64,
123}
124
125pub struct CacheAwareVectorProcessor {
127 l1_blocksize: usize,
128 l2_blocksize: usize,
129 vector_width: usize,
130 prefetch_distance: usize,
131}
132
133impl<F> Default for AdvancedSimdProcessor<F>
134where
135 F: Float
136 + NumCast
137 + SimdUnifiedOps
138 + Zero
139 + One
140 + PartialOrd
141 + Copy
142 + Send
143 + Sync
144 + std::fmt::Display
145 + std::iter::Sum<F>
146 + AdvancedSimdOps<F>,
147{
148 fn default() -> Self {
149 Self::new()
150 }
151}
152
153impl<F> AdvancedSimdProcessor<F>
154where
155 F: Float
156 + NumCast
157 + SimdUnifiedOps
158 + Zero
159 + One
160 + PartialOrd
161 + Copy
162 + Send
163 + Sync
164 + std::fmt::Display
165 + std::iter::Sum<F>
166 + AdvancedSimdOps<F>,
167{
168 pub fn new() -> Self {
170 let config = AdvancedSimdConfig::default();
171 let vector_strategy = Self::select_optimal_vector_strategy(&config);
172 let memory_pattern = Self::select_optimal_memory_pattern(&config);
173
174 Self {
175 config,
176 vector_strategy,
177 memory_pattern,
178 _phantom: PhantomData,
179 }
180 }
181
182 pub fn with_config(config: AdvancedSimdConfig) -> Self {
184 let vector_strategy = Self::select_optimal_vector_strategy(&config);
185 let memory_pattern = Self::select_optimal_memory_pattern(&config);
186
187 Self {
188 config,
189 vector_strategy,
190 memory_pattern,
191 _phantom: PhantomData,
192 }
193 }
194
195 fn select_optimal_vector_strategy(config: &AdvancedSimdConfig) -> VectorStrategy {
197 if config.platform.avx512_available && config.enable_pipelining {
198 VectorStrategy::MultiVector { num_registers: 4 }
199 } else if config.platform.avx2_available {
200 VectorStrategy::UnrolledVector { unroll_factor: 4 }
201 } else if config.enable_cache_blocking {
202 VectorStrategy::CacheBlockedVector {
203 blocksize: config.l1_cachesize / 4,
204 }
205 } else {
206 VectorStrategy::SingleVector
207 }
208 }
209
210 fn select_optimal_memory_pattern(config: &AdvancedSimdConfig) -> MemoryPattern {
212 if config.enable_cache_blocking {
213 MemoryPattern::Blocked {
214 blocksize: config.l1_cachesize / std::mem::size_of::<f64>(),
215 }
216 } else if config.enable_prefetch {
217 MemoryPattern::SequentialPrefetch
218 } else {
219 MemoryPattern::Tiled {
220 tilesize: config.cache_linesize / std::mem::size_of::<f64>(),
221 }
222 }
223 }
224
225 pub fn compute_advanced_statistics(
227 &self,
228 data: &ArrayView1<F>,
229 ) -> StatsResult<AdvancedStatsResult<F>> {
230 checkarray_finite(data, "data")?;
231
232 if data.is_empty() {
233 return Err(StatsError::InvalidArgument(
234 "Data cannot be empty".to_string(),
235 ));
236 }
237
238 let n = data.len();
239
240 if n < self.config.advanced_simd_threshold {
241 return self.compute_scalar_fallback(data);
242 }
243
244 match self.vector_strategy {
245 VectorStrategy::MultiVector { num_registers } => {
246 self.compute_multi_vector_stats(data, num_registers)
247 }
248 VectorStrategy::UnrolledVector { unroll_factor } => {
249 self.compute_unrolled_vector_stats(data, unroll_factor)
250 }
251 VectorStrategy::CacheBlockedVector { blocksize } => {
252 self.compute_cache_blocked_stats(data, blocksize)
253 }
254 VectorStrategy::SingleVector => self.compute_single_vector_stats(data),
255 }
256 }
257
258 fn compute_multi_vector_stats(
260 &self,
261 data: &ArrayView1<F>,
262 num_registers: usize,
263 ) -> StatsResult<AdvancedStatsResult<F>> {
264 let n = data.len();
265 let vector_width = self.config.vector_width;
266 let chunksize = vector_width * num_registers;
267 let n_chunks = n / chunksize;
268 let remainder = n % chunksize;
269
270 let mut sum_accumulators = vec![F::zero(); num_registers];
272 let mut sum_sq_accumulators = vec![F::zero(); num_registers];
273 let mut sum_cube_accumulators = vec![F::zero(); num_registers];
274 let mut sum_quad_accumulators = vec![F::zero(); num_registers];
275 let mut min_accumulators = vec![F::infinity(); num_registers];
276 let mut max_accumulators = vec![F::neg_infinity(); num_registers];
277
278 let mut vector_ops_count = 0;
279 let mut prefetch_hits = 0;
280
281 for chunk_idx in 0..n_chunks {
283 let base_offset = chunk_idx * chunksize;
284
285 if self.config.enable_prefetch && chunk_idx + 2 < n_chunks {
287 let prefetch_offset = (chunk_idx + 2) * chunksize;
288 if prefetch_offset < n {
289 unsafe {
290 self.prefetchdata(data, prefetch_offset);
291 }
292 prefetch_hits += 1;
293 }
294 }
295
296 for reg_idx in 0..num_registers {
298 let start = base_offset + reg_idx * vector_width;
299 let end = (start + vector_width).min(n);
300
301 if start < n {
302 let chunk = data.slice(scirs2_core::ndarray::s![start..end]);
303
304 let (sum, sum_sq, sum_cube, sum_quad, min_val, max_val) =
306 self.compute_vector_moments(&chunk)?;
307
308 sum_accumulators[reg_idx] = sum_accumulators[reg_idx] + sum;
309 sum_sq_accumulators[reg_idx] = sum_sq_accumulators[reg_idx] + sum_sq;
310 sum_cube_accumulators[reg_idx] = sum_cube_accumulators[reg_idx] + sum_cube;
311 sum_quad_accumulators[reg_idx] = sum_quad_accumulators[reg_idx] + sum_quad;
312
313 if min_val < min_accumulators[reg_idx] {
314 min_accumulators[reg_idx] = min_val;
315 }
316 if max_val > max_accumulators[reg_idx] {
317 max_accumulators[reg_idx] = max_val;
318 }
319
320 vector_ops_count += 1;
321 }
322 }
323 }
324
325 let total_sum: F = sum_accumulators.iter().copied().sum();
327 let total_sum_sq: F = sum_sq_accumulators.iter().copied().sum();
328 let total_sum_cube: F = sum_cube_accumulators.iter().copied().sum();
329 let total_sum_quad: F = sum_quad_accumulators.iter().copied().sum();
330 let global_min = min_accumulators
331 .iter()
332 .copied()
333 .fold(F::infinity(), |a, b| a.min(b));
334 let global_max = max_accumulators
335 .iter()
336 .copied()
337 .fold(F::neg_infinity(), |a, b| a.max(b));
338
339 let mut remainder_sum = F::zero();
341 let mut remainder_sum_sq = F::zero();
342 let mut remainder_sum_cube = F::zero();
343 let mut remainder_sum_quad = F::zero();
344 let mut remainder_min = global_min;
345 let mut remainder_max = global_max;
346
347 if remainder > 0 {
348 let start = n_chunks * chunksize;
349 for i in start..n {
350 let val = data[i];
351 remainder_sum = remainder_sum + val;
352 remainder_sum_sq = remainder_sum_sq + val * val;
353 remainder_sum_cube = remainder_sum_cube + val * val * val;
354 remainder_sum_quad = remainder_sum_quad + val * val * val * val;
355 if val < remainder_min {
356 remainder_min = val;
357 }
358 if val > remainder_max {
359 remainder_max = val;
360 }
361 }
362 }
363
364 let final_sum = total_sum + remainder_sum;
366 let final_sum_sq = total_sum_sq + remainder_sum_sq;
367 let final_sum_cube = total_sum_cube + remainder_sum_cube;
368 let final_sum_quad = total_sum_quad + remainder_sum_quad;
369 let final_min = remainder_min;
370 let final_max = remainder_max;
371
372 let n_f = F::from(n).unwrap();
374 let mean = final_sum / n_f;
375
376 let m2 = final_sum_sq / n_f - mean * mean;
378 let m3 = final_sum_cube / n_f - F::from(3).unwrap() * mean * m2 - mean * mean * mean;
379 let m4 = final_sum_quad / n_f
380 - F::from(4).unwrap() * mean * m3
381 - F::from(6).unwrap() * mean * mean * m2
382 - mean * mean * mean * mean;
383
384 let variance = m2;
385 let std_dev = variance.sqrt();
386 let skewness = if m2 > F::zero() {
387 m3 / (m2 * m2.sqrt())
388 } else {
389 F::zero()
390 };
391 let kurtosis = if m2 > F::zero() {
392 m4 / (m2 * m2) - F::from(3).unwrap()
393 } else {
394 F::zero()
395 };
396
397 let theoretical_max_vectors = n / vector_width;
399 let simd_utilization = if theoretical_max_vectors > 0 {
400 vector_ops_count as f64 / theoretical_max_vectors as f64
401 } else {
402 0.0
403 };
404
405 let cache_efficiency = if n_chunks > 0 {
406 0.95 } else {
408 0.0
409 };
410
411 let prefetch_efficiency = if n_chunks > 2 {
412 prefetch_hits as f64 / (n_chunks - 2) as f64
413 } else {
414 0.0
415 };
416
417 Ok(AdvancedStatsResult {
418 mean,
419 variance,
420 std_dev,
421 skewness,
422 kurtosis,
423 min: final_min,
424 max: final_max,
425 simd_utilization,
426 cache_efficiency,
427 vector_operations_count: vector_ops_count,
428 prefetch_efficiency,
429 })
430 }
431
432 fn compute_unrolled_vector_stats(
434 &self,
435 data: &ArrayView1<F>,
436 unroll_factor: usize,
437 ) -> StatsResult<AdvancedStatsResult<F>> {
438 let n = data.len();
439 let vector_width = self.config.vector_width;
440 let unrolledsize = vector_width * unroll_factor;
441 let n_unrolled = n / unrolledsize;
442 let remainder = n % unrolledsize;
443
444 let mut sum_acc = F::zero();
445 let mut sum_sq_acc = F::zero();
446 let mut sum_cube_acc = F::zero();
447 let mut sum_quad_acc = F::zero();
448 let mut min_val = F::infinity();
449 let mut max_val = F::neg_infinity();
450
451 let mut vector_ops_count = 0;
452
453 for i in 0..n_unrolled {
455 let base_idx = i * unrolledsize;
456
457 for j in 0..unroll_factor {
459 let start = base_idx + j * vector_width;
460 let end = start + vector_width;
461
462 if end <= n {
463 let chunk = data.slice(scirs2_core::ndarray::s![start..end]);
464 let (sum, sum_sq, sum_cube, sum_quad, chunk_min, chunk_max) =
465 self.compute_vector_moments(&chunk)?;
466
467 sum_acc = sum_acc + sum;
468 sum_sq_acc = sum_sq_acc + sum_sq;
469 sum_cube_acc = sum_cube_acc + sum_cube;
470 sum_quad_acc = sum_quad_acc + sum_quad;
471 if chunk_min < min_val {
472 min_val = chunk_min;
473 }
474 if chunk_max > max_val {
475 max_val = chunk_max;
476 }
477
478 vector_ops_count += 1;
479 }
480 }
481 }
482
483 if remainder > 0 {
485 let start = n_unrolled * unrolledsize;
486 for i in start..n {
487 let val = data[i];
488 sum_acc = sum_acc + val;
489 sum_sq_acc = sum_sq_acc + val * val;
490 sum_cube_acc = sum_cube_acc + val * val * val;
491 sum_quad_acc = sum_quad_acc + val * val * val * val;
492 if val < min_val {
493 min_val = val;
494 }
495 if val > max_val {
496 max_val = val;
497 }
498 }
499 }
500
501 let n_f = F::from(n).unwrap();
503 let mean = sum_acc / n_f;
504 let m2 = sum_sq_acc / n_f - mean * mean;
505 let m3 = sum_cube_acc / n_f - F::from(3).unwrap() * mean * m2 - mean * mean * mean;
506 let m4 = sum_quad_acc / n_f
507 - F::from(4).unwrap() * mean * m3
508 - F::from(6).unwrap() * mean * mean * m2
509 - mean * mean * mean * mean;
510
511 let variance = m2;
512 let std_dev = variance.sqrt();
513 let skewness = if m2 > F::zero() {
514 m3 / (m2 * m2.sqrt())
515 } else {
516 F::zero()
517 };
518 let kurtosis = if m2 > F::zero() {
519 m4 / (m2 * m2) - F::from(3).unwrap()
520 } else {
521 F::zero()
522 };
523
524 Ok(AdvancedStatsResult {
525 mean,
526 variance,
527 std_dev,
528 skewness,
529 kurtosis,
530 min: min_val,
531 max: max_val,
532 simd_utilization: vector_ops_count as f64 / (n / vector_width) as f64,
533 cache_efficiency: 0.90, vector_operations_count: vector_ops_count,
535 prefetch_efficiency: 0.0, })
537 }
538
539 fn compute_cache_blocked_stats(
541 &self,
542 data: &ArrayView1<F>,
543 blocksize: usize,
544 ) -> StatsResult<AdvancedStatsResult<F>> {
545 let n = data.len();
546 let n_blocks = n / blocksize;
547 let remainder = n % blocksize;
548
549 let mut sum_acc = F::zero();
550 let mut sum_sq_acc = F::zero();
551 let mut sum_cube_acc = F::zero();
552 let mut sum_quad_acc = F::zero();
553 let mut min_val = F::infinity();
554 let mut max_val = F::neg_infinity();
555
556 let mut vector_ops_count = 0;
557
558 for block_idx in 0..n_blocks {
560 let start = block_idx * blocksize;
561 let end = start + blocksize;
562 let block = data.slice(scirs2_core::ndarray::s![start..end]);
563
564 let block_result = self.process_cache_block(&block)?;
566
567 sum_acc = sum_acc + block_result.sum;
568 sum_sq_acc = sum_sq_acc + block_result.sum_sq;
569 sum_cube_acc = sum_cube_acc + block_result.sum_cube;
570 sum_quad_acc = sum_quad_acc + block_result.sum_quad;
571 if block_result.min < min_val {
572 min_val = block_result.min;
573 }
574 if block_result.max > max_val {
575 max_val = block_result.max;
576 }
577
578 vector_ops_count += block_result.vector_ops;
579 }
580
581 if remainder > 0 {
583 let start = n_blocks * blocksize;
584 let remainder_block = data.slice(scirs2_core::ndarray::s![start..]);
585 let remainder_result = self.process_cache_block(&remainder_block)?;
586
587 sum_acc = sum_acc + remainder_result.sum;
588 sum_sq_acc = sum_sq_acc + remainder_result.sum_sq;
589 sum_cube_acc = sum_cube_acc + remainder_result.sum_cube;
590 sum_quad_acc = sum_quad_acc + remainder_result.sum_quad;
591 if remainder_result.min < min_val {
592 min_val = remainder_result.min;
593 }
594 if remainder_result.max > max_val {
595 max_val = remainder_result.max;
596 }
597
598 vector_ops_count += remainder_result.vector_ops;
599 }
600
601 let n_f = F::from(n).unwrap();
603 let mean = sum_acc / n_f;
604 let m2 = sum_sq_acc / n_f - mean * mean;
605 let m3 = sum_cube_acc / n_f - F::from(3).unwrap() * mean * m2 - mean * mean * mean;
606 let m4 = sum_quad_acc / n_f
607 - F::from(4).unwrap() * mean * m3
608 - F::from(6).unwrap() * mean * mean * m2
609 - mean * mean * mean * mean;
610
611 let variance = m2;
612 let std_dev = variance.sqrt();
613 let skewness = if m2 > F::zero() {
614 m3 / (m2 * m2.sqrt())
615 } else {
616 F::zero()
617 };
618 let kurtosis = if m2 > F::zero() {
619 m4 / (m2 * m2) - F::from(3).unwrap()
620 } else {
621 F::zero()
622 };
623
624 Ok(AdvancedStatsResult {
625 mean,
626 variance,
627 std_dev,
628 skewness,
629 kurtosis,
630 min: min_val,
631 max: max_val,
632 simd_utilization: vector_ops_count as f64 / (n / self.config.vector_width) as f64,
633 cache_efficiency: 0.98, vector_operations_count: vector_ops_count,
635 prefetch_efficiency: 0.85, })
637 }
638
639 fn compute_single_vector_stats(
641 &self,
642 data: &ArrayView1<F>,
643 ) -> StatsResult<AdvancedStatsResult<F>> {
644 let n = data.len();
646 let vector_width = self.config.vector_width;
647 let n_vectors = n / vector_width;
648 let remainder = n % vector_width;
649
650 let mut sum_acc = F::zero();
651 let mut sum_sq_acc = F::zero();
652 let mut min_val = F::infinity();
653 let mut max_val = F::neg_infinity();
654
655 for i in 0..n_vectors {
657 let start = i * vector_width;
658 let end = start + vector_width;
659 let chunk = data.slice(scirs2_core::ndarray::s![start..end]);
660
661 let chunk_sum = F::simd_sum(&chunk);
662 let chunk_sum_sq = F::simd_sum_squares(&chunk);
663 let chunk_min = F::simd_min_element(&chunk);
664 let chunk_max = F::simd_max_element(&chunk);
665
666 sum_acc = sum_acc + chunk_sum;
667 sum_sq_acc = sum_sq_acc + chunk_sum_sq;
668 if chunk_min < min_val {
669 min_val = chunk_min;
670 }
671 if chunk_max > max_val {
672 max_val = chunk_max;
673 }
674 }
675
676 if remainder > 0 {
678 let start = n_vectors * vector_width;
679 for i in start..n {
680 let val = data[i];
681 sum_acc = sum_acc + val;
682 sum_sq_acc = sum_sq_acc + val * val;
683 if val < min_val {
684 min_val = val;
685 }
686 if val > max_val {
687 max_val = val;
688 }
689 }
690 }
691
692 let n_f = F::from(n).unwrap();
693 let mean = sum_acc / n_f;
694 let variance = sum_sq_acc / n_f - mean * mean;
695 let std_dev = variance.sqrt();
696
697 Ok(AdvancedStatsResult {
698 mean,
699 variance,
700 std_dev,
701 skewness: F::zero(), kurtosis: F::zero(),
703 min: min_val,
704 max: max_val,
705 simd_utilization: n_vectors as f64 / (n / vector_width) as f64,
706 cache_efficiency: 0.80, vector_operations_count: n_vectors,
708 prefetch_efficiency: 0.0,
709 })
710 }
711
712 fn compute_vector_moments(&self, chunk: &ArrayView1<F>) -> StatsResult<(F, F, F, F, F, F)> {
714 let sum = F::simd_sum(chunk);
715 let sum_sq = F::simd_sum_squares(chunk);
716 let sum_cube = <F as scirs2_core::simd_ops::SimdUnifiedOps>::simd_sum_cubes(chunk);
717 let sum_quad = F::simd_sum_quads(chunk);
718 let min_val = F::simd_min_element(chunk);
719 let max_val = F::simd_max_element(chunk);
720
721 Ok((sum, sum_sq, sum_cube, sum_quad, min_val, max_val))
722 }
723
724 fn process_cache_block(&self, block: &ArrayView1<F>) -> StatsResult<BlockResult<F>> {
726 let n = block.len();
727 let vector_width = self.config.vector_width;
728 let n_vectors = n / vector_width;
729 let remainder = n % vector_width;
730
731 let mut sum = F::zero();
732 let mut sum_sq = F::zero();
733 let mut sum_cube = F::zero();
734 let mut sum_quad = F::zero();
735 let mut min_val = F::infinity();
736 let mut max_val = F::neg_infinity();
737
738 for i in 0..n_vectors {
740 let start = i * vector_width;
741 let end = start + vector_width;
742 let chunk = block.slice(scirs2_core::ndarray::s![start..end]);
743
744 let (chunk_sum, chunk_sum_sq, chunk_sum_cube, chunk_sum_quad, chunk_min, chunk_max) =
745 self.compute_vector_moments(&chunk)?;
746
747 sum = sum + chunk_sum;
748 sum_sq = sum_sq + chunk_sum_sq;
749 sum_cube = sum_cube + chunk_sum_cube;
750 sum_quad = sum_quad + chunk_sum_quad;
751 if chunk_min < min_val {
752 min_val = chunk_min;
753 }
754 if chunk_max > max_val {
755 max_val = chunk_max;
756 }
757 }
758
759 if remainder > 0 {
761 let start = n_vectors * vector_width;
762 for i in start..n {
763 let val = block[i];
764 sum = sum + val;
765 sum_sq = sum_sq + val * val;
766 sum_cube = sum_cube + val * val * val;
767 sum_quad = sum_quad + val * val * val * val;
768 if val < min_val {
769 min_val = val;
770 }
771 if val > max_val {
772 max_val = val;
773 }
774 }
775 }
776
777 Ok(BlockResult {
778 sum,
779 sum_sq,
780 sum_cube,
781 sum_quad,
782 min: min_val,
783 max: max_val,
784 vector_ops: n_vectors,
785 })
786 }
787
788 unsafe fn prefetchdata(&self, data: &ArrayView1<F>, offset: usize) {
798 if offset < data.len() {
799 let ptr = data.as_ptr().add(offset);
800 #[cfg(target_arch = "x86_64")]
802 std::arch::x86_64::_mm_prefetch(ptr as *const i8, std::arch::x86_64::_MM_HINT_T0);
803 #[cfg(not(target_arch = "x86_64"))]
804 {
805 let _ = ptr;
807 }
808 }
809 }
810
811 fn compute_scalar_fallback(&self, data: &ArrayView1<F>) -> StatsResult<AdvancedStatsResult<F>> {
813 let n = data.len();
814 let n_f = F::from(n).unwrap();
815
816 let sum: F = data.iter().copied().sum();
817 let mean = sum / n_f;
818
819 let variance = data.iter().map(|&x| (x - mean) * (x - mean)).sum::<F>() / n_f;
820
821 let min_val = data.iter().copied().fold(F::infinity(), |a, b| a.min(b));
822 let max_val = data
823 .iter()
824 .copied()
825 .fold(F::neg_infinity(), |a, b| a.max(b));
826
827 Ok(AdvancedStatsResult {
828 mean,
829 variance,
830 std_dev: variance.sqrt(),
831 skewness: F::zero(),
832 kurtosis: F::zero(),
833 min: min_val,
834 max: max_val,
835 simd_utilization: 0.0,
836 cache_efficiency: 1.0, vector_operations_count: 0,
838 prefetch_efficiency: 0.0,
839 })
840 }
841}
842
843#[derive(Debug, Clone)]
845struct BlockResult<F> {
846 sum: F,
847 sum_sq: F,
848 sum_cube: F,
849 sum_quad: F,
850 min: F,
851 max: F,
852 vector_ops: usize,
853}
854
855impl CacheAwareVectorProcessor {
856 pub fn new(config: &AdvancedSimdConfig) -> Self {
858 Self {
859 l1_blocksize: config.l1_cachesize / std::mem::size_of::<f64>(),
860 l2_blocksize: config.l2_cachesize / std::mem::size_of::<f64>(),
861 vector_width: config.vector_width,
862 prefetch_distance: config.vector_width * 4, }
864 }
865}
866
867#[allow(dead_code)]
869pub fn advanced_mean_f64(data: &ArrayView1<f64>) -> StatsResult<AdvancedStatsResult<f64>> {
870 let processor = AdvancedSimdProcessor::<f64>::new();
871 processor.compute_advanced_statistics(data)
872}
873
874#[allow(dead_code)]
904pub fn advanced_mean_f32(data: &ArrayView1<f32>) -> StatsResult<AdvancedStatsResult<f32>> {
905 let processor = AdvancedSimdProcessor::<f32>::new();
906 processor.compute_advanced_statistics(data)
907}
908
909#[cfg(test)]
910mod tests {
911 use super::*;
912 use scirs2_core::ndarray::{array, Array1};
913
914 #[test]
915 #[ignore = "timeout"]
916 fn test_advanced_simd_basic() {
917 let data = array![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
918 let processor = AdvancedSimdProcessor::<f64>::new();
919 let result = processor.compute_advanced_statistics(&data.view()).unwrap();
920
921 assert!((result.mean - 4.5).abs() < 1e-10);
922 assert!(result.simd_utilization >= 0.0);
923 assert!(result.cache_efficiency >= 0.0);
924 }
925
926 #[test]
927 #[ignore = "timeout"]
928 fn test_largedataset_performance() {
929 let data: Array1<f64> = Array1::from_shape_fn(10000, |i| i as f64);
930 let processor = AdvancedSimdProcessor::<f64>::new();
931 let result = processor.compute_advanced_statistics(&data.view()).unwrap();
932
933 assert!(result.simd_utilization > 0.5); assert!(result.vector_operations_count > 0);
935 }
936
937 #[test]
938 #[ignore = "timeout"]
939 fn test_different_vector_strategies() {
940 let data: Array1<f64> = Array1::from_shape_fn(1000, |i| (i as f64).sin());
941
942 let config_multi = AdvancedSimdConfig {
944 vector_width: 8,
945 enable_pipelining: true,
946 ..Default::default()
947 };
948 let processor_multi = AdvancedSimdProcessor::with_config(config_multi);
949 let result_multi = processor_multi
950 .compute_advanced_statistics(&data.view())
951 .unwrap();
952
953 let config_blocked = AdvancedSimdConfig {
955 enable_cache_blocking: true,
956 l1_cachesize: 4096,
957 ..Default::default()
958 };
959 let processor_blocked = AdvancedSimdProcessor::with_config(config_blocked);
960 let result_blocked = processor_blocked
961 .compute_advanced_statistics(&data.view())
962 .unwrap();
963
964 assert!((result_multi.mean - result_blocked.mean).abs() < 1e-10);
966 assert!((result_multi.variance - result_blocked.variance).abs() < 1e-10);
967 }
968}