Skip to main content

scirs2_transform/
normalize_simd.rs

1//! SIMD-accelerated normalization operations
2//!
3//! This module provides SIMD-optimized implementations of normalization operations
4//! using the unified SIMD operations from scirs2-core.
5//!
6//! Features:
7//! - Adaptive block sizing based on data dimensions and cache sizes
8//! - Memory-aligned processing for maximum SIMD efficiency
9//! - Cache-optimal memory access patterns
10//! - Advanced prefetching strategies for large datasets
11
12use scirs2_core::ndarray::{Array1, Array2, ArrayBase, Data, Ix2};
13use scirs2_core::numeric::{Float, NumCast};
14use scirs2_core::simd_ops::SimdUnifiedOps;
15
16use crate::error::{Result, TransformError};
17use crate::normalize::{NormalizationMethod, EPSILON};
18
/// Cache line size in bytes, used to round block sizes up to aligned boundaries
const CACHE_LINE_SIZE: usize = 64;
/// L1 data-cache size estimate (32 KiB) used to pick the largest block size tier
const L1_CACHE_SIZE: usize = 32_768;
/// L2 cache size estimate (256 KiB) used to decide chunked vs. in-cache processing
const L2_CACHE_SIZE: usize = 262_144;
25
26/// Adaptive block sizing strategy for cache-optimal processing
27#[derive(Debug, Clone)]
28pub struct AdaptiveBlockSizer {
29    /// Optimal block size for current data dimensions
30    pub optimal_block_size: usize,
31    /// Whether to use cache-aligned processing
32    pub use_cache_alignment: bool,
33    /// Prefetch distance for memory optimization
34    pub prefetch_distance: usize,
35}
36
37impl AdaptiveBlockSizer {
38    /// Create adaptive block sizer based on data characteristics
39    pub fn new<F>(datashape: &[usize]) -> Self
40    where
41        F: Float + NumCast,
42    {
43        let element_size = std::mem::size_of::<F>();
44        let data_size = datashape.iter().product::<usize>() * element_size;
45
46        // Adaptive block sizing based on data size and cache characteristics
47        let optimal_block_size = if data_size <= L1_CACHE_SIZE / 4 {
48            // Small data: use larger blocks for better vectorization
49            256
50        } else if data_size <= L2_CACHE_SIZE / 2 {
51            // Medium data: balance cache efficiency and parallelism
52            128
53        } else {
54            // Large data: smaller blocks to maintain cache locality
55            64
56        };
57
58        let use_cache_alignment = data_size > L1_CACHE_SIZE;
59        let prefetch_distance = if data_size > L2_CACHE_SIZE { 8 } else { 4 };
60
61        AdaptiveBlockSizer {
62            optimal_block_size,
63            use_cache_alignment,
64            prefetch_distance,
65        }
66    }
67
68    /// Get cache-aligned block size
69    pub fn get_aligned_block_size(&self, dimensionsize: usize) -> usize {
70        if self.use_cache_alignment {
71            // Align to cache line boundaries
72            let cache_aligned =
73                (self.optimal_block_size + CACHE_LINE_SIZE - 1) & !(CACHE_LINE_SIZE - 1);
74            cache_aligned.min(dimensionsize)
75        } else {
76            self.optimal_block_size.min(dimensionsize)
77        }
78    }
79}
80
81/// SIMD-accelerated min-max normalization for 1D arrays
82#[allow(dead_code)]
83pub fn simd_minmax_normalize_1d<F>(array: &Array1<F>) -> Result<Array1<F>>
84where
85    F: Float + NumCast + SimdUnifiedOps,
86{
87    if array.is_empty() {
88        return Err(TransformError::InvalidInput(
89            "Input array is empty".to_string(),
90        ));
91    }
92
93    let min = F::simd_min_element(&array.view());
94    let max = F::simd_max_element(&array.view());
95    let range = max - min;
96
97    if range.abs() <= F::from(EPSILON).expect("Failed to convert to float") {
98        // Constant feature, return array of 0.5
99        return Ok(Array1::from_elem(
100            array.len(),
101            F::from(0.5).expect("Failed to convert constant to float"),
102        ));
103    }
104
105    // Normalize: (x - min) / range
106    let minarray = Array1::from_elem(array.len(), min);
107    let normalized = F::simd_sub(&array.view(), &minarray.view());
108    let rangearray = Array1::from_elem(array.len(), range);
109    let result = F::simd_div(&normalized.view(), &rangearray.view());
110
111    Ok(result)
112}
113
114/// SIMD-accelerated Z-score normalization for 1D arrays
115#[allow(dead_code)]
116pub fn simd_zscore_normalize_1d<F>(array: &Array1<F>) -> Result<Array1<F>>
117where
118    F: Float + NumCast + SimdUnifiedOps,
119{
120    if array.is_empty() {
121        return Err(TransformError::InvalidInput(
122            "Input array is empty".to_string(),
123        ));
124    }
125
126    let mean = F::simd_mean(&array.view());
127    let n = F::from(array.len()).expect("Operation failed");
128
129    // Compute variance
130    let meanarray = Array1::from_elem(array.len(), mean);
131    let centered = F::simd_sub(&array.view(), &meanarray.view());
132    let squared = F::simd_mul(&centered.view(), &centered.view());
133    let variance = F::simd_sum(&squared.view()) / n;
134    let std_dev = variance.sqrt();
135
136    if std_dev <= F::from(EPSILON).expect("Failed to convert to float") {
137        // Constant feature, return zeros
138        return Ok(Array1::zeros(array.len()));
139    }
140
141    // Normalize: (x - mean) / std_dev
142    let stdarray = Array1::from_elem(array.len(), std_dev);
143    let result = F::simd_div(&centered.view(), &stdarray.view());
144
145    Ok(result)
146}
147
148/// SIMD-accelerated L2 normalization for 1D arrays
149#[allow(dead_code)]
150pub fn simd_l2_normalize_1d<F>(array: &Array1<F>) -> Result<Array1<F>>
151where
152    F: Float + NumCast + SimdUnifiedOps,
153{
154    if array.is_empty() {
155        return Err(TransformError::InvalidInput(
156            "Input array is empty".to_string(),
157        ));
158    }
159
160    let l2_norm = F::simd_norm(&array.view());
161
162    if l2_norm <= F::from(EPSILON).expect("Failed to convert to float") {
163        // Zero vector, return zeros
164        return Ok(Array1::zeros(array.len()));
165    }
166
167    // Normalize: x / l2_norm
168    let normarray = Array1::from_elem(array.len(), l2_norm);
169    let result = F::simd_div(&array.view(), &normarray.view());
170
171    Ok(result)
172}
173
174/// SIMD-accelerated max absolute scaling for 1D arrays
175#[allow(dead_code)]
176pub fn simd_maxabs_normalize_1d<F>(array: &Array1<F>) -> Result<Array1<F>>
177where
178    F: Float + NumCast + SimdUnifiedOps,
179{
180    if array.is_empty() {
181        return Err(TransformError::InvalidInput(
182            "Input array is empty".to_string(),
183        ));
184    }
185
186    let absarray = F::simd_abs(&array.view());
187    let max_abs = F::simd_max_element(&absarray.view());
188
189    if max_abs <= F::from(EPSILON).expect("Failed to convert to float") {
190        // All zeros, return zeros
191        return Ok(Array1::zeros(array.len()));
192    }
193
194    // Normalize: x / max_abs
195    let max_absarray = Array1::from_elem(array.len(), max_abs);
196    let result = F::simd_div(&array.view(), &max_absarray.view());
197
198    Ok(result)
199}
200
201/// Advanced SIMD-accelerated normalization for 2D arrays with optimized memory access patterns
202#[allow(dead_code)]
203pub fn simd_normalizearray<S, F>(
204    array: &ArrayBase<S, Ix2>,
205    method: NormalizationMethod,
206    axis: usize,
207) -> Result<Array2<F>>
208where
209    S: Data<Elem = F>,
210    F: Float + NumCast + SimdUnifiedOps,
211{
212    if !array.is_standard_layout() {
213        return Err(TransformError::InvalidInput(
214            "Input array must be in standard memory layout".to_string(),
215        ));
216    }
217
218    if array.ndim() != 2 {
219        return Err(TransformError::InvalidInput(
220            "Only 2D arrays are supported".to_string(),
221        ));
222    }
223
224    if axis >= array.ndim() {
225        return Err(TransformError::InvalidInput(format!(
226            "Invalid axis {} for array with {} dimensions",
227            axis,
228            array.ndim()
229        )));
230    }
231
232    let shape = array.shape();
233    let mut normalized = Array2::zeros((shape[0], shape[1]));
234
235    match method {
236        NormalizationMethod::MinMax => simd_normalize_block_minmax(array, &mut normalized, axis)?,
237        NormalizationMethod::ZScore => simd_normalize_block_zscore(array, &mut normalized, axis)?,
238        NormalizationMethod::L2 => simd_normalize_block_l2(array, &mut normalized, axis)?,
239        NormalizationMethod::MaxAbs => simd_normalize_block_maxabs(array, &mut normalized, axis)?,
240        _ => {
241            // Fall back to non-SIMD implementation for other methods
242            return Err(TransformError::InvalidInput(
243                "SIMD implementation not available for this normalization method".to_string(),
244            ));
245        }
246    }
247
248    Ok(normalized)
249}
250
/// Block-wise SIMD min-max normalization with optimized memory access
///
/// For `axis == 0` each column is scaled into [0, 1] with a two-pass
/// scheme: pass one records every column's min and max, pass two applies
/// `(x - min) / range`. Columns with a (near) zero range are treated as
/// constant features and filled with 0.5, matching
/// `simd_minmax_normalize_1d`. For `axis == 1` each row is delegated to
/// `simd_minmax_normalize_1d` directly.
#[allow(dead_code)]
fn simd_normalize_block_minmax<S, F>(
    array: &ArrayBase<S, Ix2>,
    normalized: &mut Array2<F>,
    axis: usize,
) -> Result<()>
where
    S: Data<Elem = F>,
    F: Float + NumCast + SimdUnifiedOps,
{
    let shape = array.shape();
    // Block size adapts to the data footprint and cache-line alignment.
    let block_sizer = AdaptiveBlockSizer::new::<F>(&[shape[0], shape[1]]);
    let block_size =
        block_sizer.get_aligned_block_size(if axis == 0 { shape[1] } else { shape[0] });

    if axis == 0 {
        // Column-wise normalization with block processing
        let mut global_mins = Array1::zeros(shape[1]);
        let mut global_maxs = Array1::zeros(shape[1]);

        // First pass: compute global min/max for each column in blocks
        for block_start in (0..shape[1]).step_by(block_size) {
            let block_end = (block_start + block_size).min(shape[1]);

            for j in block_start..block_end {
                let col = array.column(j);
                // Copy the column so the SIMD reductions see contiguous data.
                let colarray = col.to_owned();
                global_mins[j] = F::simd_min_element(&colarray.view());
                global_maxs[j] = F::simd_max_element(&colarray.view());
            }
        }

        // Second pass: normalize using pre-computed min/max
        for block_start in (0..shape[1]).step_by(block_size) {
            let block_end = (block_start + block_size).min(shape[1]);

            for j in block_start..block_end {
                let col = array.column(j);
                let colarray = col.to_owned();
                let range = global_maxs[j] - global_mins[j];

                if range.abs() <= F::from(EPSILON).expect("Failed to convert to float") {
                    // Constant feature: every value maps to the midpoint 0.5
                    for i in 0..shape[0] {
                        normalized[[i, j]] =
                            F::from(0.5).expect("Failed to convert constant to float");
                    }
                } else {
                    // Vectorized normalization: (x - min) / range
                    let minarray = Array1::from_elem(shape[0], global_mins[j]);
                    let rangearray = Array1::from_elem(shape[0], range);
                    let centered = F::simd_sub(&colarray.view(), &minarray.view());
                    let norm_col = F::simd_div(&centered.view(), &rangearray.view());

                    for i in 0..shape[0] {
                        normalized[[i, j]] = norm_col[i];
                    }
                }
            }
        }
    } else {
        // Row-wise normalization with block processing
        for block_start in (0..shape[0]).step_by(block_size) {
            let block_end = (block_start + block_size).min(shape[0]);

            for i in block_start..block_end {
                let row = array.row(i);
                let rowarray = row.to_owned();
                // Reuse the 1D SIMD kernel for each row.
                let norm_row = simd_minmax_normalize_1d(&rowarray)?;

                for j in 0..shape[1] {
                    normalized[[i, j]] = norm_row[j];
                }
            }
        }
    }
    Ok(())
}
330
331/// Block-wise SIMD Z-score normalization with optimized memory access
332#[allow(dead_code)]
333fn simd_normalize_block_zscore<S, F>(
334    array: &ArrayBase<S, Ix2>,
335    normalized: &mut Array2<F>,
336    axis: usize,
337) -> Result<()>
338where
339    S: Data<Elem = F>,
340    F: Float + NumCast + SimdUnifiedOps,
341{
342    let shape = array.shape();
343    let block_sizer = AdaptiveBlockSizer::new::<F>(&[shape[0], shape[1]]);
344    let block_size =
345        block_sizer.get_aligned_block_size(if axis == 0 { shape[1] } else { shape[0] });
346
347    if axis == 0 {
348        // Column-wise normalization
349        let mut global_means = Array1::zeros(shape[1]);
350        let mut global_stds = Array1::zeros(shape[1]);
351        let n_samples_f = F::from(shape[0]).expect("Failed to convert to float");
352
353        // First pass: compute means and standard deviations
354        for block_start in (0..shape[1]).step_by(block_size) {
355            let block_end = (block_start + block_size).min(shape[1]);
356
357            for j in block_start..block_end {
358                let col = array.column(j);
359                let colarray = col.to_owned();
360
361                // Compute mean
362                global_means[j] = F::simd_sum(&colarray.view()) / n_samples_f;
363
364                // Compute standard deviation
365                let meanarray = Array1::from_elem(shape[0], global_means[j]);
366                let centered = F::simd_sub(&colarray.view(), &meanarray.view());
367                let squared = F::simd_mul(&centered.view(), &centered.view());
368                let variance = F::simd_sum(&squared.view()) / n_samples_f;
369                global_stds[j] = variance.sqrt();
370
371                // Avoid division by zero
372                if global_stds[j] <= F::from(EPSILON).expect("Failed to convert to float") {
373                    global_stds[j] = F::one();
374                }
375            }
376        }
377
378        // Second pass: normalize using pre-computed statistics
379        for block_start in (0..shape[1]).step_by(block_size) {
380            let block_end = (block_start + block_size).min(shape[1]);
381
382            for j in block_start..block_end {
383                let col = array.column(j);
384                let colarray = col.to_owned();
385
386                if global_stds[j] <= F::from(EPSILON).expect("Failed to convert to float") {
387                    // Constant feature
388                    for i in 0..shape[0] {
389                        normalized[[i, j]] = F::zero();
390                    }
391                } else {
392                    // Vectorized normalization
393                    let meanarray = Array1::from_elem(shape[0], global_means[j]);
394                    let stdarray = Array1::from_elem(shape[0], global_stds[j]);
395                    let centered = F::simd_sub(&colarray.view(), &meanarray.view());
396                    let norm_col = F::simd_div(&centered.view(), &stdarray.view());
397
398                    for i in 0..shape[0] {
399                        normalized[[i, j]] = norm_col[i];
400                    }
401                }
402            }
403        }
404    } else {
405        // Row-wise normalization with block processing
406        for block_start in (0..shape[0]).step_by(block_size) {
407            let block_end = (block_start + block_size).min(shape[0]);
408
409            for i in block_start..block_end {
410                let row = array.row(i);
411                let rowarray = row.to_owned();
412                let norm_row = simd_zscore_normalize_1d(&rowarray)?;
413
414                for j in 0..shape[1] {
415                    normalized[[i, j]] = norm_row[j];
416                }
417            }
418        }
419    }
420    Ok(())
421}
422
423/// Block-wise SIMD L2 normalization with optimized memory access
424#[allow(dead_code)]
425fn simd_normalize_block_l2<S, F>(
426    array: &ArrayBase<S, Ix2>,
427    normalized: &mut Array2<F>,
428    axis: usize,
429) -> Result<()>
430where
431    S: Data<Elem = F>,
432    F: Float + NumCast + SimdUnifiedOps,
433{
434    let shape = array.shape();
435    let block_sizer = AdaptiveBlockSizer::new::<F>(&[shape[0], shape[1]]);
436    let block_size =
437        block_sizer.get_aligned_block_size(if axis == 0 { shape[1] } else { shape[0] });
438
439    if axis == 0 {
440        // Column-wise L2 normalization
441        for block_start in (0..shape[1]).step_by(block_size) {
442            let block_end = (block_start + block_size).min(shape[1]);
443
444            for j in block_start..block_end {
445                let col = array.column(j);
446                let colarray = col.to_owned();
447                let norm_col = simd_l2_normalize_1d(&colarray)?;
448
449                for i in 0..shape[0] {
450                    normalized[[i, j]] = norm_col[i];
451                }
452            }
453        }
454    } else {
455        // Row-wise L2 normalization with SIMD optimization
456        for block_start in (0..shape[0]).step_by(block_size) {
457            let block_end = (block_start + block_size).min(shape[0]);
458
459            for i in block_start..block_end {
460                let row = array.row(i);
461                let rowarray = row.to_owned();
462                let l2_norm = F::simd_norm(&rowarray.view());
463
464                if l2_norm <= F::from(EPSILON).expect("Failed to convert to float") {
465                    // Zero vector
466                    for j in 0..shape[1] {
467                        normalized[[i, j]] = F::zero();
468                    }
469                } else {
470                    // Vectorized division
471                    let normarray = Array1::from_elem(shape[1], l2_norm);
472                    let norm_row = F::simd_div(&rowarray.view(), &normarray.view());
473
474                    for j in 0..shape[1] {
475                        normalized[[i, j]] = norm_row[j];
476                    }
477                }
478            }
479        }
480    }
481    Ok(())
482}
483
484/// Block-wise SIMD max-absolute normalization with optimized memory access
485#[allow(dead_code)]
486fn simd_normalize_block_maxabs<S, F>(
487    array: &ArrayBase<S, Ix2>,
488    normalized: &mut Array2<F>,
489    axis: usize,
490) -> Result<()>
491where
492    S: Data<Elem = F>,
493    F: Float + NumCast + SimdUnifiedOps,
494{
495    let shape = array.shape();
496    let block_sizer = AdaptiveBlockSizer::new::<F>(&[shape[0], shape[1]]);
497    let block_size =
498        block_sizer.get_aligned_block_size(if axis == 0 { shape[1] } else { shape[0] });
499
500    if axis == 0 {
501        // Column-wise max-abs normalization
502        for block_start in (0..shape[1]).step_by(block_size) {
503            let block_end = (block_start + block_size).min(shape[1]);
504
505            for j in block_start..block_end {
506                let col = array.column(j);
507                let colarray = col.to_owned();
508                let norm_col = simd_maxabs_normalize_1d(&colarray)?;
509
510                for i in 0..shape[0] {
511                    normalized[[i, j]] = norm_col[i];
512                }
513            }
514        }
515    } else {
516        // Row-wise max-abs normalization with SIMD optimization
517        for block_start in (0..shape[0]).step_by(block_size) {
518            let block_end = (block_start + block_size).min(shape[0]);
519
520            for i in block_start..block_end {
521                let row = array.row(i);
522                let rowarray = row.to_owned();
523                let absarray = F::simd_abs(&rowarray.view());
524                let max_abs = F::simd_max_element(&absarray.view());
525
526                if max_abs <= F::from(EPSILON).expect("Failed to convert to float") {
527                    // All zeros
528                    for j in 0..shape[1] {
529                        normalized[[i, j]] = F::zero();
530                    }
531                } else {
532                    // Vectorized division
533                    let max_absarray = Array1::from_elem(shape[1], max_abs);
534                    let norm_row = F::simd_div(&rowarray.view(), &max_absarray.view());
535
536                    for j in 0..shape[1] {
537                        normalized[[i, j]] = norm_row[j];
538                    }
539                }
540            }
541        }
542    }
543    Ok(())
544}
545
546/// Advanced SIMD normalization with automatic optimization selection
547#[allow(dead_code)]
548pub fn simd_normalize_adaptive<S, F>(
549    array: &ArrayBase<S, Ix2>,
550    method: NormalizationMethod,
551    axis: usize,
552) -> Result<Array2<F>>
553where
554    S: Data<Elem = F>,
555    F: Float + NumCast + SimdUnifiedOps,
556{
557    let shape = array.shape();
558    let data_size = shape[0] * shape[1] * std::mem::size_of::<F>();
559
560    // Choose optimal strategy based on data characteristics
561    if data_size > L2_CACHE_SIZE {
562        // Large data: use chunked processing with memory optimization
563        simd_normalize_chunked(array, method, axis)
564    } else if shape[0] > shape[1] * 4 {
565        // Tall matrix: optimize for column-wise operations
566        simd_normalize_optimized_tall(array, method, axis)
567    } else if shape[1] > shape[0] * 4 {
568        // Wide matrix: optimize for row-wise operations
569        simd_normalize_optimized_wide(array, method, axis)
570    } else {
571        // Standard processing
572        simd_normalizearray(array, method, axis)
573    }
574}
575
576/// Memory-efficient batch processing for advanced-large datasets
577#[allow(dead_code)]
578pub fn simd_normalize_batch<S, F>(
579    array: &ArrayBase<S, Ix2>,
580    method: NormalizationMethod,
581    axis: usize,
582    batch_size_mb: usize,
583) -> Result<Array2<F>>
584where
585    S: Data<Elem = F>,
586    F: Float + NumCast + SimdUnifiedOps,
587{
588    let shape = array.shape();
589    let element_size = std::mem::size_of::<F>();
590    let max_elements_per_batch = (batch_size_mb * 1024 * 1024) / element_size;
591
592    if shape[0] * shape[1] <= max_elements_per_batch {
593        // Small enough for single batch
594        return simd_normalize_adaptive(array, method, axis);
595    }
596
597    let mut normalized = Array2::zeros((shape[0], shape[1]));
598
599    if axis == 0 {
600        // Column-wise: batch by columns
601        let cols_per_batch = max_elements_per_batch / shape[0];
602        for col_start in (0..shape[1]).step_by(cols_per_batch) {
603            let col_end = (col_start + cols_per_batch).min(shape[1]);
604            let batch_view = array.slice(scirs2_core::ndarray::s![.., col_start..col_end]);
605            let batch_normalized = simd_normalize_adaptive(&batch_view, method, axis)?;
606
607            for (j_local, j_global) in (col_start..col_end).enumerate() {
608                for i in 0..shape[0] {
609                    normalized[[i, j_global]] = batch_normalized[[i, j_local]];
610                }
611            }
612        }
613    } else {
614        // Row-wise: batch by rows
615        let rows_per_batch = max_elements_per_batch / shape[1];
616        for row_start in (0..shape[0]).step_by(rows_per_batch) {
617            let row_end = (row_start + rows_per_batch).min(shape[0]);
618            let batch_view = array.slice(scirs2_core::ndarray::s![row_start..row_end, ..]);
619            let batch_normalized = simd_normalize_adaptive(&batch_view, method, axis)?;
620
621            for (i_local, i_global) in (row_start..row_end).enumerate() {
622                for j in 0..shape[1] {
623                    normalized[[i_global, j]] = batch_normalized[[i_local, j]];
624                }
625            }
626        }
627    }
628
629    Ok(normalized)
630}
631
632/// Chunked SIMD normalization for large datasets
633#[allow(dead_code)]
634fn simd_normalize_chunked<S, F>(
635    array: &ArrayBase<S, Ix2>,
636    method: NormalizationMethod,
637    axis: usize,
638) -> Result<Array2<F>>
639where
640    S: Data<Elem = F>,
641    F: Float + NumCast + SimdUnifiedOps,
642{
643    let shape = array.shape();
644    let mut normalized = Array2::zeros((shape[0], shape[1]));
645
646    let block_sizer = AdaptiveBlockSizer::new::<F>(&[shape[0], shape[1]]);
647    let chunk_size = block_sizer.optimal_block_size * 4; // Larger chunks for big data
648
649    if axis == 0 {
650        // Process in column chunks
651        for chunk_start in (0..shape[1]).step_by(chunk_size) {
652            let chunk_end = (chunk_start + chunk_size).min(shape[1]);
653            let chunk_view = array.slice(scirs2_core::ndarray::s![.., chunk_start..chunk_end]);
654            let chunk_normalized = simd_normalizearray(&chunk_view, method, axis)?;
655
656            for (j_local, j_global) in (chunk_start..chunk_end).enumerate() {
657                for i in 0..shape[0] {
658                    normalized[[i, j_global]] = chunk_normalized[[i, j_local]];
659                }
660            }
661        }
662    } else {
663        // Process in row chunks
664        for chunk_start in (0..shape[0]).step_by(chunk_size) {
665            let chunk_end = (chunk_start + chunk_size).min(shape[0]);
666            let chunk_view = array.slice(scirs2_core::ndarray::s![chunk_start..chunk_end, ..]);
667            let chunk_normalized = simd_normalizearray(&chunk_view, method, axis)?;
668
669            for (i_local, i_global) in (chunk_start..chunk_end).enumerate() {
670                for j in 0..shape[1] {
671                    normalized[[i_global, j]] = chunk_normalized[[i_local, j]];
672                }
673            }
674        }
675    }
676
677    Ok(normalized)
678}
679
680/// Optimized SIMD normalization for tall matrices (many rows, few columns)
681#[allow(dead_code)]
682fn simd_normalize_optimized_tall<S, F>(
683    array: &ArrayBase<S, Ix2>,
684    method: NormalizationMethod,
685    axis: usize,
686) -> Result<Array2<F>>
687where
688    S: Data<Elem = F>,
689    F: Float + NumCast + SimdUnifiedOps,
690{
691    // For tall matrices, optimize memory access patterns
692    if axis == 0 {
693        // Column-wise normalization: pre-compute all statistics
694        simd_normalizearray(array, method, axis)
695    } else {
696        // Row-wise normalization: use smaller blocks for better cache usage
697        let shape = array.shape();
698        let mut normalized = Array2::zeros((shape[0], shape[1]));
699        let small_block_size = 32; // Smaller blocks for tall matrices
700
701        for block_start in (0..shape[0]).step_by(small_block_size) {
702            let block_end = (block_start + small_block_size).min(shape[0]);
703
704            for i in block_start..block_end {
705                let row = array.row(i);
706                let rowarray = row.to_owned();
707                let norm_row = match method {
708                    NormalizationMethod::MinMax => simd_minmax_normalize_1d(&rowarray)?,
709                    NormalizationMethod::ZScore => simd_zscore_normalize_1d(&rowarray)?,
710                    NormalizationMethod::L2 => simd_l2_normalize_1d(&rowarray)?,
711                    NormalizationMethod::MaxAbs => simd_maxabs_normalize_1d(&rowarray)?,
712                    _ => {
713                        return Err(TransformError::InvalidInput(
714                            "Unsupported normalization method for tall matrix optimization"
715                                .to_string(),
716                        ))
717                    }
718                };
719
720                for j in 0..shape[1] {
721                    normalized[[i, j]] = norm_row[j];
722                }
723            }
724        }
725
726        Ok(normalized)
727    }
728}
729
730/// Optimized SIMD normalization for wide matrices (few rows, many columns)
731#[allow(dead_code)]
732fn simd_normalize_optimized_wide<S, F>(
733    array: &ArrayBase<S, Ix2>,
734    method: NormalizationMethod,
735    axis: usize,
736) -> Result<Array2<F>>
737where
738    S: Data<Elem = F>,
739    F: Float + NumCast + SimdUnifiedOps,
740{
741    // For wide matrices, optimize for column-wise operations
742    if axis == 1 {
743        // Row-wise normalization: straightforward processing
744        simd_normalizearray(array, method, axis)
745    } else {
746        // Column-wise normalization: use vectorized column processing
747        let shape = array.shape();
748        let mut normalized = Array2::zeros((shape[0], shape[1]));
749        let wide_block_size = 128; // Larger blocks for wide matrices
750
751        for block_start in (0..shape[1]).step_by(wide_block_size) {
752            let block_end = (block_start + wide_block_size).min(shape[1]);
753
754            for j in block_start..block_end {
755                let col = array.column(j);
756                let colarray = col.to_owned();
757                let norm_col = match method {
758                    NormalizationMethod::MinMax => simd_minmax_normalize_1d(&colarray)?,
759                    NormalizationMethod::ZScore => simd_zscore_normalize_1d(&colarray)?,
760                    NormalizationMethod::L2 => simd_l2_normalize_1d(&colarray)?,
761                    NormalizationMethod::MaxAbs => simd_maxabs_normalize_1d(&colarray)?,
762                    _ => {
763                        return Err(TransformError::InvalidInput(
764                            "Unsupported normalization method for wide matrix optimization"
765                                .to_string(),
766                        ))
767                    }
768                };
769
770                for i in 0..shape[0] {
771                    normalized[[i, j]] = norm_col[i];
772                }
773            }
774        }
775
776        Ok(normalized)
777    }
778}