scirs2_transform/features.rs

//! Feature engineering utilities
//!
//! This module provides tools for feature engineering, which is the process of
//! transforming raw data into features that better represent the underlying problem
//! to predictive models.

use scirs2_core::ndarray::{Array1, Array2, ArrayBase, Data, Ix2};
use scirs2_core::numeric::{Float, NumCast};
use scirs2_core::parallel_ops::*;

use crate::error::{Result, TransformError};

// Define a small value to use for comparison with zero
const EPSILON: f64 = 1e-10;

/// Polynomial feature transformation
///
/// Generates polynomial features up to the specified degree
pub struct PolynomialFeatures {
    /// The maximum degree of the polynomial features
    degree: usize,
    /// Whether to include interaction terms only (no powers)
    interaction_only: bool,
    /// Whether to include a bias term (constant feature equal to 1)
    include_bias: bool,
}

impl PolynomialFeatures {
    /// Creates a new PolynomialFeatures transformer
    ///
    /// # Arguments
    /// * `degree` - The degree of the polynomial features to generate
    /// * `interaction_only` - If true, only interaction features are produced (no powers)
    /// * `include_bias` - If true, include a bias term (constant feature equal to 1)
    ///
    /// # Returns
    /// * A new PolynomialFeatures instance
    pub fn new(degree: usize, interaction_only: bool, include_bias: bool) -> Self {
        PolynomialFeatures {
            degree,
            interaction_only,
            include_bias,
        }
    }

    /// Calculates the number of output features that will be generated
    ///
    /// # Arguments
    /// * `n_features` - The number of input features
    ///
    /// # Returns
    /// * The number of features that will be generated
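    ///
    /// For example, with 2 input features, degree 2, and a bias term, the
    /// expansion [1, x1, x2, x1^2, x1*x2, x2^2] yields 6 output features.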
    pub fn n_output_features(&self, n_features: usize) -> usize {
        if n_features == 0 {
            return 0;
        }

        if self.interaction_only {
            let mut n = if self.include_bias { 1 } else { 0 };
            for d in 1..=self.degree.min(n_features) {
                // Binomial coefficient C(n_features, d)
                let mut b = 1;
                for i in 0..d {
                    b = b * (n_features - i) / (i + 1);
                }
                n += b;
            }
            n
        } else {
            // The number of polynomial features equals the number of monomials
            // of degree 1..=degree in `n_features` variables, plus an optional
            // bias term
            let n = if self.include_bias { 1 } else { 0 };
            n + (1..=self.degree)
                .map(|d| {
                    // Binomial coefficient C(n_features + d - 1, d): the number
                    // of monomials of total degree exactly d
                    let mut b = 1;
                    for i in 0..d {
                        b = b * (n_features + d - 1 - i) / (i + 1);
                    }
                    b
                })
                .sum::<usize>()
        }
    }

    /// Transforms the input data into polynomial features
    ///
    /// # Arguments
    /// * `array` - The input array to transform
    ///
    /// # Returns
    /// * `Result<Array2<f64>>` - The transformed array with polynomial features
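    ///
    /// # Examples
    ///
    /// A minimal sketch of a degree-2 expansion with a bias term (the exact
    /// column order of the higher-degree terms follows the internal
    /// combination generator):
    ///
    /// ```
    /// use scirs2_core::ndarray::array;
    /// use scirs2_transform::features::PolynomialFeatures;
    ///
    /// let data = array![[2.0, 3.0]];
    /// let poly = PolynomialFeatures::new(2, false, true);
    /// let expanded = poly.transform(&data).unwrap();
    /// // Columns cover [1, x1, x2, x1^2, x1*x2, x2^2]
    /// assert_eq!(expanded.shape(), &[1, 6]);
    /// ```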
    pub fn transform<S>(&self, array: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
    where
        S: Data,
        S::Elem: Float + NumCast,
    {
        // Enhanced input validation
        if array.is_empty() {
            return Err(TransformError::InvalidInput(
                "Input array cannot be empty".to_string(),
            ));
        }

        let array_f64 = array.mapv(|x| NumCast::from(x).unwrap_or(0.0));

        // Validate for NaN or infinite values
        for &val in array_f64.iter() {
            if !val.is_finite() {
                return Err(TransformError::DataValidationError(
                    "Input array contains non-finite values (NaN or infinity)".to_string(),
                ));
            }
        }

        if !array_f64.is_standard_layout() {
            return Err(TransformError::InvalidInput(
                "Input array must be in standard memory layout".to_string(),
            ));
        }

        let shape = array_f64.shape();
        let n_samples = shape[0];
        let n_features = shape[1];

        // Additional validation
        if self.degree == 0 {
            return Err(TransformError::InvalidInput(
                "Degree must be at least 1".to_string(),
            ));
        }

        if self.degree > 10 {
            return Err(TransformError::InvalidInput(
                "Degree > 10 may cause numerical overflow. Please use a smaller degree."
                    .to_string(),
            ));
        }

        // Check for potential overflow before computing output features
        let n_output_features = self.n_output_features(n_features);

        // Prevent excessive memory usage
        if n_output_features > 100_000 {
            return Err(TransformError::MemoryError(format!(
                "Output would have {n_output_features} features, which exceeds the limit of 100,000. Consider reducing degree or using interaction_only=true"
            )));
        }

        // Check for potential numerical overflow based on input data magnitude
        let max_abs_value = array_f64.iter().map(|&x| x.abs()).fold(0.0, f64::max);
        if max_abs_value > 100.0 && self.degree > 3 {
            return Err(TransformError::DataValidationError(format!(
                "Large input values (max: {max_abs_value:.2}) with high degree ({}) may cause numerical overflow. Consider scaling input data first.",
                self.degree
            )));
        }

        let mut result = Array2::zeros((n_samples, n_output_features));

        // Exponent vector used when generating higher-degree combinations
        let mut powers = vec![0; n_features];
        let mut col_idx = 0;

        // Add bias term (constant feature equal to 1)
        if self.include_bias {
            for i in 0..n_samples {
                result[[i, col_idx]] = 1.0;
            }
            col_idx += 1;
        }

        // Add individual features
        for i in 0..n_samples {
            for j in 0..n_features {
                result[[i, col_idx + j]] = array_f64[[i, j]];
            }
        }
        col_idx += n_features;

        // Generate higher-degree features
        if self.degree >= 2 {
            // Recursively generates all exponent combinations with total
            // degree exactly `degree`, appending one output column per
            // combination
            #[allow(clippy::too_many_arguments)]
            fn generate_combinations(
                powers: &mut Vec<usize>,
                start: usize,
                degree: usize,
                max_degree: usize,
                interaction_only: bool,
                array: &Array2<f64>,
                result: &mut Array2<f64>,
                col_idx: &mut usize,
            ) -> Result<()> {
                if degree == 0 {
                    // Skip the bias term and individual features
                    let sum: usize = powers.iter().sum();
                    if sum >= 2 && sum <= max_degree {
                        // Compute the feature values with overflow detection
                        for i in 0..array.shape()[0] {
                            let mut val = 1.0;
                            for (j, &p) in powers.iter().enumerate() {
                                if p > 0 {
                                    let base = array[[i, j]];
                                    let powered = base.powi(p as i32);

                                    // Check for overflow/underflow
                                    if !powered.is_finite() {
                                        return Err(TransformError::ComputationError(format!(
                                            "Numerical overflow detected when computing {base}^{p} at sample {i}, feature {j}"
                                        )));
                                    }

                                    val *= powered;

                                    // Additional overflow check after multiplication
                                    if !val.is_finite() {
                                        return Err(TransformError::ComputationError(format!(
                                            "Numerical overflow detected during polynomial feature computation at sample {i}"
                                        )));
                                    }
                                }
                            }
                            result[[i, *col_idx]] = val;
                        }
                        *col_idx += 1;
                    }
                    return Ok(());
                }

                for j in start..powers.len() {
                    // When interaction_only=true, only consider powers of 0 or 1
                    let max_power = if interaction_only { 1 } else { degree };
                    for p in 1..=max_power {
                        powers[j] += p;
                        generate_combinations(
                            powers,
                            j + 1,
                            degree - p,
                            max_degree,
                            interaction_only,
                            array,
                            result,
                            col_idx,
                        )?;
                        powers[j] -= p;
                    }
                }
                Ok(())
            }

            // Generate the degree-d features for each d from 2 up to the
            // requested degree; the recursion emits only combinations whose
            // total degree is exactly d
            let mut current_col_idx = col_idx;
            for d in 2..=self.degree {
                generate_combinations(
                    &mut powers,
                    0,
                    d,
                    self.degree,
                    self.interaction_only,
                    &array_f64,
                    &mut result,
                    &mut current_col_idx,
                )?;
            }
        }

        // Final validation of the output
        for &val in result.iter() {
            if !val.is_finite() {
                return Err(TransformError::ComputationError(
                    "Output contains non-finite values. This may be due to numerical overflow."
                        .to_string(),
                ));
            }
        }

        // Check for extremely large values that might cause issues downstream
        let max_output_value = result.iter().map(|&x| x.abs()).fold(0.0, f64::max);
        if max_output_value > 1e15 {
            return Err(TransformError::DataValidationError(format!(
                "Output contains extremely large values (max: {max_output_value:.2e}). Consider scaling input data or reducing polynomial degree."
            )));
        }

        Ok(result)
    }
}

/// Binarizes data (sets feature values to 0 or 1) according to a threshold
///
/// # Arguments
/// * `array` - The input array to binarize
/// * `threshold` - The threshold used to binarize. Values above the threshold map to 1, others to 0.
///
/// # Returns
/// * `Result<Array2<f64>>` - The binarized array
///
/// # Examples
/// ```
/// use scirs2_core::ndarray::array;
/// use scirs2_transform::features::binarize;
///
/// let data = array![[1.0, -1.0, 2.0],
///                   [2.0, 0.0, 0.0],
///                   [-1.0, 1.0, -1.0]];
///
/// let binarized = binarize(&data, 0.0).unwrap();
/// ```
#[allow(dead_code)]
pub fn binarize<S>(array: &ArrayBase<S, Ix2>, threshold: f64) -> Result<Array2<f64>>
where
    S: Data,
    S::Elem: Float + NumCast,
{
    // Enhanced input validation
    if array.is_empty() {
        return Err(TransformError::InvalidInput(
            "Input array cannot be empty".to_string(),
        ));
    }

    if !threshold.is_finite() {
        return Err(TransformError::InvalidInput(
            "Threshold must be a finite number".to_string(),
        ));
    }

    let array_f64 = array.mapv(|x| NumCast::from(x).unwrap_or(0.0));

    // Validate for NaN or infinite values
    for &val in array_f64.iter() {
        if !val.is_finite() {
            return Err(TransformError::DataValidationError(
                "Input array contains non-finite values (NaN or infinity)".to_string(),
            ));
        }
    }

    if !array_f64.is_standard_layout() {
        return Err(TransformError::InvalidInput(
            "Input array must be in standard memory layout".to_string(),
        ));
    }

    let shape = array_f64.shape();
    let mut binarized = Array2::zeros((shape[0], shape[1]));

    for i in 0..shape[0] {
        for j in 0..shape[1] {
            binarized[[i, j]] = if array_f64[[i, j]] > threshold {
                1.0
            } else {
                0.0
            };
        }
    }

    Ok(binarized)
}

/// Computes quantiles for a given array along an axis
///
/// # Arguments
/// * `array` - The input array
/// * `n_quantiles` - Number of quantiles to compute
/// * `axis` - The axis along which to compute quantiles (0 for columns, 1 for rows)
///
/// # Returns
/// * `Result<Array2<f64>>` - Array of quantiles with shape (n_features, n_quantiles) if axis=0,
///   or (n_samples, n_quantiles) if axis=1
#[allow(dead_code)]
fn compute_quantiles<S>(
    array: &ArrayBase<S, Ix2>,
    n_quantiles: usize,
    axis: usize,
) -> Result<Array2<f64>>
where
    S: Data,
    S::Elem: Float + NumCast,
{
    let array_f64 = array.mapv(|x| NumCast::from(x).unwrap_or(0.0));

    if n_quantiles < 2 {
        return Err(TransformError::InvalidInput(
            "n_quantiles must be at least 2".to_string(),
        ));
    }

    if axis >= 2 {
        return Err(TransformError::InvalidInput(
            "axis must be 0 or 1".to_string(),
        ));
    }

    let shape = array_f64.shape();
    let n_features = if axis == 0 { shape[1] } else { shape[0] };

    let mut quantiles = Array2::zeros((n_features, n_quantiles));

    for i in 0..n_features {
        // Extract and sort the data along the given axis
        let mut sorted_data: Vec<f64> = if axis == 0 {
            array_f64.column(i).to_vec()
        } else {
            array_f64.row(i).to_vec()
        };
        sorted_data.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));

        // Compute quantiles
        for j in 0..n_quantiles {
            let q = j as f64 / (n_quantiles - 1) as f64;
            let idx = (q * (sorted_data.len() - 1) as f64).round() as usize;
            quantiles[[i, j]] = sorted_data[idx];
        }
    }

    Ok(quantiles)
}

/// Discretizes features using equal-width bins
///
/// # Arguments
/// * `array` - The input array to discretize
/// * `n_bins` - The number of bins to create
/// * `encode` - The encoding method ('onehot' or 'ordinal')
/// * `axis` - The axis along which bins are computed (0 for per-column bins, 1 for per-row bins)
///
/// # Returns
/// * `Result<Array2<f64>>` - The discretized array
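///
/// # Examples
///
/// A minimal sketch using per-column bins and ordinal encoding:
///
/// ```
/// use scirs2_core::ndarray::array;
/// use scirs2_transform::features::discretize_equal_width;
///
/// let data = array![[0.0, 10.0], [5.0, 20.0], [10.0, 30.0]];
/// // Two equal-width bins per column; each value is replaced by its bin index
/// let binned = discretize_equal_width(&data, 2, "ordinal", 0).unwrap();
/// assert_eq!(binned.shape(), &[3, 2]);
/// ```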
#[allow(dead_code)]
pub fn discretize_equal_width<S>(
    array: &ArrayBase<S, Ix2>,
    n_bins: usize,
    encode: &str,
    axis: usize,
) -> Result<Array2<f64>>
where
    S: Data,
    S::Elem: Float + NumCast,
{
    let array_f64 = array.mapv(|x| NumCast::from(x).unwrap_or(0.0));

    if !array_f64.is_standard_layout() {
        return Err(TransformError::InvalidInput(
            "Input array must be in standard memory layout".to_string(),
        ));
    }

    if n_bins < 2 {
        return Err(TransformError::InvalidInput(
            "n_bins must be at least 2".to_string(),
        ));
    }

    if encode != "onehot" && encode != "ordinal" {
        return Err(TransformError::InvalidInput(
            "encode must be 'onehot' or 'ordinal'".to_string(),
        ));
    }

    if axis >= 2 {
        return Err(TransformError::InvalidInput(
            "axis must be 0 or 1".to_string(),
        ));
    }

    let shape = array_f64.shape();
    let n_samples = shape[0];
    let n_features = shape[1];

    // One-hot encoding expands each feature into n_bins binary columns;
    // ordinal encoding keeps one column per feature
    let n_output_features = if encode == "onehot" {
        n_features * n_bins
    } else {
        n_features
    };

    // Number of bin sets: one per column (axis=0) or one per row (axis=1)
    let n_bin_sets = if axis == 0 { n_features } else { n_samples };

    let mut min_values = Array1::zeros(n_bin_sets);
    let mut max_values = Array1::zeros(n_bin_sets);

    // Compute min and max values along the specified axis
    for i in 0..n_bin_sets {
        let data = if axis == 0 {
            array_f64.column(i)
        } else {
            array_f64.row(i)
        };

        min_values[i] = data.fold(f64::INFINITY, |acc, &x| acc.min(x));
        max_values[i] = data.fold(f64::NEG_INFINITY, |acc, &x| acc.max(x));
    }

    // Create the bin edges
    let mut bin_edges = Array2::zeros((n_bin_sets, n_bins + 1));
    for i in 0..n_bin_sets {
        let min_val = min_values[i];
        let max_val = max_values[i];
        let bin_width = if (max_val - min_val).abs() < EPSILON {
            // Handle the case where all values are the same
            1.0
        } else {
            (max_val - min_val) / n_bins as f64
        };

        for j in 0..=n_bins {
            bin_edges[[i, j]] = min_val + bin_width * j as f64;
        }

        // Ensure the last bin edge includes the maximum value
        bin_edges[[i, n_bins]] = bin_edges[[i, n_bins]].max(max_val + EPSILON);
    }

    // Discretize the data
    let mut discretized = Array2::zeros((n_samples, n_output_features));

    if encode == "ordinal" {
        // Ordinal encoding: assign each value to its bin index (0 to n_bins - 1)
        for i in 0..n_samples {
            for j in 0..n_features {
                let value = array_f64[[i, j]];
                let bin_set = if axis == 0 { j } else { i };

                // Find the bin index
                let mut bin_idx = 0;
                while bin_idx < n_bins && value > bin_edges[[bin_set, bin_idx + 1]] {
                    bin_idx += 1;
                }

                discretized[[i, j]] = bin_idx as f64;
            }
        }
    } else {
        // One-hot encoding: create a binary feature for each bin
        for i in 0..n_samples {
            for j in 0..n_features {
                let value = array_f64[[i, j]];
                let bin_set = if axis == 0 { j } else { i };

                // Find the bin index
                let mut bin_idx = 0;
                while bin_idx < n_bins && value > bin_edges[[bin_set, bin_idx + 1]] {
                    bin_idx += 1;
                }

                discretized[[i, j * n_bins + bin_idx]] = 1.0;
            }
        }
    }

    Ok(discretized)
}

/// Discretizes features using equal-frequency bins (quantiles)
///
/// # Arguments
/// * `array` - The input array to discretize
/// * `n_bins` - The number of bins to create
/// * `encode` - The encoding method ('onehot' or 'ordinal')
/// * `axis` - The axis along which bins are computed (0 for per-column bins, 1 for per-row bins)
///
/// # Returns
/// * `Result<Array2<f64>>` - The discretized array
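///
/// # Examples
///
/// A minimal sketch using per-column quantile bins and ordinal encoding:
///
/// ```
/// use scirs2_core::ndarray::array;
/// use scirs2_transform::features::discretize_equal_frequency;
///
/// let data = array![[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]];
/// // Two equal-frequency bins per column (split at the median)
/// let binned = discretize_equal_frequency(&data, 2, "ordinal", 0).unwrap();
/// assert_eq!(binned.shape(), &[3, 2]);
/// ```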
#[allow(dead_code)]
pub fn discretize_equal_frequency<S>(
    array: &ArrayBase<S, Ix2>,
    n_bins: usize,
    encode: &str,
    axis: usize,
) -> Result<Array2<f64>>
where
    S: Data,
    S::Elem: Float + NumCast,
{
    let array_f64 = array.mapv(|x| NumCast::from(x).unwrap_or(0.0));

    if !array_f64.is_standard_layout() {
        return Err(TransformError::InvalidInput(
            "Input array must be in standard memory layout".to_string(),
        ));
    }

    if n_bins < 2 {
        return Err(TransformError::InvalidInput(
            "n_bins must be at least 2".to_string(),
        ));
    }

    if encode != "onehot" && encode != "ordinal" {
        return Err(TransformError::InvalidInput(
            "encode must be 'onehot' or 'ordinal'".to_string(),
        ));
    }

    if axis >= 2 {
        return Err(TransformError::InvalidInput(
            "axis must be 0 or 1".to_string(),
        ));
    }

    let shape = array_f64.shape();
    let n_samples = shape[0];
    let n_features = shape[1];

    // One-hot encoding expands each feature into n_bins binary columns;
    // ordinal encoding keeps one column per feature
    let n_output_features = if encode == "onehot" {
        n_features * n_bins
    } else {
        n_features
    };

    // Compute the bin edges as quantiles (n_bins + 1 edges per bin set)
    let quantiles = compute_quantiles(&array_f64, n_bins + 1, axis)?;

    // Discretize the data
    let mut discretized = Array2::zeros((n_samples, n_output_features));

    if encode == "ordinal" {
        // Ordinal encoding: assign each value to its bin index (0 to n_bins - 1)
        for i in 0..n_samples {
            for j in 0..n_features {
                let value = array_f64[[i, j]];
                let bin_set = if axis == 0 { j } else { i };

                // Find the bin index
                let mut bin_idx = 0;
                while bin_idx < n_bins && value > quantiles[[bin_set, bin_idx + 1]] {
                    bin_idx += 1;
                }

                discretized[[i, j]] = bin_idx as f64;
            }
        }
    } else {
        // One-hot encoding: create a binary feature for each bin
        for i in 0..n_samples {
            for j in 0..n_features {
                let value = array_f64[[i, j]];
                let bin_set = if axis == 0 { j } else { i };

                // Find the bin index
                let mut bin_idx = 0;
                while bin_idx < n_bins && value > quantiles[[bin_set, bin_idx + 1]] {
                    bin_idx += 1;
                }

                discretized[[i, j * n_bins + bin_idx]] = 1.0;
            }
        }
    }

    Ok(discretized)
}

/// Applies various power transformations to make data more Gaussian-like
///
/// # Arguments
/// * `array` - The input array to transform
/// * `method` - The transformation method ('yeo-johnson' or 'box-cox')
/// * `standardize` - Whether to standardize the output to have zero mean and unit variance
///
/// # Returns
/// * `Result<Array2<f64>>` - The transformed array
///
/// # Examples
///
/// ```
/// use scirs2_core::ndarray::array;
/// use scirs2_transform::features::power_transform;
///
/// let data = array![[1.0, 2.0, 3.0],
///                   [4.0, 5.0, 6.0],
///                   [7.0, 8.0, 9.0]];
///
/// let transformed = power_transform(&data, "yeo-johnson", true).unwrap();
/// ```
#[allow(dead_code)]
pub fn power_transform<S>(
    array: &ArrayBase<S, Ix2>,
    method: &str,
    standardize: bool,
) -> Result<Array2<f64>>
where
    S: Data,
    S::Elem: Float + NumCast,
{
    let array_f64 = array.mapv(|x| NumCast::from(x).unwrap_or(0.0));

    if !array_f64.is_standard_layout() {
        return Err(TransformError::InvalidInput(
            "Input array must be in standard memory layout".to_string(),
        ));
    }

    if method != "box-cox" && method != "yeo-johnson" {
        return Err(TransformError::InvalidInput(format!(
            "Unknown method: {method}. Supported methods are 'box-cox' and 'yeo-johnson'"
        )));
    }

    let shape = array_f64.shape();
    let n_samples = shape[0];
    let n_features = shape[1];

    let mut transformed = Array2::zeros((n_samples, n_features));

    // For each feature, find the optimal lambda and apply the transformation
    for j in 0..n_features {
        let feature = array_f64.column(j).to_vec();

        // Maximum likelihood estimation of lambda
        let lambda = estimate_optimal_lambda(&feature, method)?;

        // Apply the transformation
        for i in 0..n_samples {
            let x = array_f64[[i, j]];

            let transformed_value = match method {
                "box-cox" => {
                    if lambda.abs() < 1e-10 {
                        x.ln()
                    } else {
                        (x.powf(lambda) - 1.0) / lambda
                    }
                }
                "yeo-johnson" => {
                    if x >= 0.0 {
                        if lambda.abs() < 1e-10 {
                            x.ln_1p()
                        } else {
                            ((1.0 + x).powf(lambda) - 1.0) / lambda
                        }
                    } else if (2.0 - lambda).abs() < 1e-10 {
                        -((-x + 1.0).ln())
                    } else {
                        -((-x + 1.0).powf(2.0 - lambda) - 1.0) / (2.0 - lambda)
                    }
                }
                _ => unreachable!(), // Already validated above
            };

            transformed[[i, j]] = transformed_value;
        }
    }

    // Optionally standardize the transformed data
    if standardize {
        for j in 0..n_features {
            let column_data: Vec<f64> = transformed.column(j).to_vec();
            let mean = column_data.iter().sum::<f64>() / column_data.len() as f64;
            let variance = column_data.iter().map(|x| (x - mean).powi(2)).sum::<f64>()
                / column_data.len() as f64;
            let std_dev = variance.sqrt();

            if std_dev > 1e-10 {
                for i in 0..n_samples {
                    transformed[[i, j]] = (transformed[[i, j]] - mean) / std_dev;
                }
            }
        }
    }

    Ok(transformed)
}

/// Estimate optimal lambda parameter using maximum likelihood estimation
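///
/// A coarse grid search over lambda in [-2, 2] (step 0.5) picks a starting
/// point, which is then refined by a golden-section search in a window of
/// +/- 0.5 around it.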
#[allow(dead_code)]
fn estimate_optimal_lambda(data: &[f64], method: &str) -> Result<f64> {
    if data.is_empty() {
        return Err(TransformError::InvalidInput(
            "Empty data for lambda estimation".to_string(),
        ));
    }

    // Check data constraints based on transformation method
    if method == "box-cox" {
        // Box-Cox requires all positive values
        if data.iter().any(|&x| x <= 0.0) {
            return Err(TransformError::InvalidInput(
                "Box-Cox transformation requires all positive values".to_string(),
            ));
        }
    }

    // Define the search grid for lambda (same for both box-cox and yeo-johnson)
    let lambda_range = [-2.0, -1.5, -1.0, -0.5, 0.0, 0.5, 1.0, 1.5, 2.0];

    let mut best_lambda = 0.0;
    let mut best_log_likelihood = f64::NEG_INFINITY;

    // Grid search for optimal lambda
    for &lambda in &lambda_range {
        let log_likelihood = compute_log_likelihood(data, lambda, method)?;

        if log_likelihood > best_log_likelihood {
            best_log_likelihood = log_likelihood;
            best_lambda = lambda;
        }
    }

    // Refine with golden section search around the best point
    let tolerance = 1e-6;
    let refined_lambda = golden_section_search(
        data,
        method,
        best_lambda - 0.5,
        best_lambda + 0.5,
        tolerance,
    )?;

    Ok(refined_lambda)
}

/// Compute log-likelihood for given lambda parameter
#[allow(dead_code)]
fn compute_log_likelihood(data: &[f64], lambda: f64, method: &str) -> Result<f64> {
    let n = data.len() as f64;
    let mut transformed_data = Vec::with_capacity(data.len());
    let mut jacobian_sum = 0.0;

    // Apply transformation and compute Jacobian
    for &x in data {
        let (y, jacobian) = match method {
            "box-cox" => {
                if x <= 0.0 {
                    return Ok(f64::NEG_INFINITY); // Invalid for Box-Cox
                }
                let y = if lambda.abs() < 1e-10 {
                    x.ln()
                } else {
                    (x.powf(lambda) - 1.0) / lambda
                };
                let jacobian = x.ln();
                (y, jacobian)
            }
            "yeo-johnson" => {
                let (y, jacobian) = if x >= 0.0 {
                    if lambda.abs() < 1e-10 {
                        (x.ln_1p(), (1.0 + x).ln())
                    } else {
                        let y = ((1.0 + x).powf(lambda) - 1.0) / lambda;
                        let jacobian = (1.0 + x).ln();
                        (y, jacobian)
                    }
                } else if (2.0 - lambda).abs() < 1e-10 {
                    (-((-x + 1.0).ln()), -(1.0 - x).ln())
                } else {
                    let y = -((-x + 1.0).powf(2.0 - lambda) - 1.0) / (2.0 - lambda);
                    let jacobian = -(1.0 - x).ln();
                    (y, jacobian)
                };
                (y, jacobian)
            }
            _ => {
                return Err(TransformError::InvalidInput(format!(
                    "Unknown method: {method}"
                )))
            }
        };

        if !y.is_finite() {
            return Ok(f64::NEG_INFINITY);
        }

        transformed_data.push(y);
        jacobian_sum += jacobian;
    }

    // Compute sample variance of transformed data
    let mean = transformed_data.iter().sum::<f64>() / n;
    let variance = transformed_data
        .iter()
        .map(|y| (y - mean).powi(2))
        .sum::<f64>()
        / n;

    if variance <= 0.0 {
        return Ok(f64::NEG_INFINITY);
    }

    // Log-likelihood of the normal model plus the Jacobian term
    let log_likelihood =
        -0.5 * n * (2.0 * std::f64::consts::PI).ln() - 0.5 * n * variance.ln() - 0.5 * n
            + (lambda - 1.0) * jacobian_sum;

    Ok(log_likelihood)
}

/// Golden section search for optimal lambda
#[allow(dead_code)]
fn golden_section_search(
    data: &[f64],
    method: &str,
    mut a: f64,
    mut b: f64,
    tolerance: f64,
) -> Result<f64> {
    let phi = (1.0 + 5.0_f64.sqrt()) / 2.0; // Golden ratio
    let resphi = 2.0 - phi;

    // Initial points
    let mut x1 = a + resphi * (b - a);
    let mut x2 = a + (1.0 - resphi) * (b - a);

    let mut f1 = compute_log_likelihood(data, x1, method)?;
    let mut f2 = compute_log_likelihood(data, x2, method)?;

    for _ in 0..100 {
        // Maximum iterations
        if (b - a).abs() < tolerance {
            break;
        }

        if f1 > f2 {
            b = x2;
            x2 = x1;
            f2 = f1;
            x1 = a + resphi * (b - a);
            f1 = compute_log_likelihood(data, x1, method)?;
        } else {
            a = x1;
            x1 = x2;
            f1 = f2;
            x2 = a + (1.0 - resphi) * (b - a);
            f2 = compute_log_likelihood(data, x2, method)?;
        }
    }

    Ok((a + b) / 2.0)
}

/// Apply Yeo-Johnson transformation to a single value
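///
/// Piecewise definition (writing l for `lambda`):
///   x >= 0: ((x + 1)^l - 1) / l, reducing to ln(x + 1) as l -> 0
///   x <  0: -((1 - x)^(2 - l) - 1) / (2 - l), reducing to -ln(1 - x) as l -> 2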
#[allow(dead_code)]
fn yeo_johnson_transform(x: f64, lambda: f64) -> f64 {
    if x >= 0.0 {
        if (lambda - 0.0).abs() < EPSILON {
            (x + 1.0).ln()
        } else {
            ((x + 1.0).powf(lambda) - 1.0) / lambda
        }
    } else if (lambda - 2.0).abs() < EPSILON {
        -((-x + 1.0).ln())
    } else {
        -(((-x + 1.0).powf(2.0 - lambda) - 1.0) / (2.0 - lambda))
    }
}

/// Apply Box-Cox transformation to a single value
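///
/// Piecewise definition for x > 0 (writing l for `lambda`):
///   (x^l - 1) / l, reducing to ln(x) as l -> 0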
#[allow(dead_code)]
fn box_cox_transform(x: f64, lambda: f64) -> f64 {
    if (lambda - 0.0).abs() < EPSILON {
        x.ln()
    } else {
        (x.powf(lambda) - 1.0) / lambda
    }
}

/// Optimized PowerTransformer for making data more Gaussian-like
///
/// This is an enhanced version of the power transformation that includes:
/// - Optimal lambda parameter estimation using maximum likelihood
/// - Vectorized operations for better performance
/// - Parallel processing for multiple features
/// - Fit/transform API for reusable transformations
/// - Inverse transformation capability
/// - Enhanced numerical stability
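///
/// # Examples
///
/// A minimal sketch of the fit/transform round trip:
///
/// ```
/// use scirs2_core::ndarray::array;
/// use scirs2_transform::features::PowerTransformer;
///
/// let data = array![[1.0, -0.5], [2.0, 0.5], [3.0, 1.5]];
/// let mut pt = PowerTransformer::yeo_johnson(true);
/// let transformed = pt.fit_transform(&data).unwrap();
/// // The inverse transformation recovers the original values
/// // (up to numerical precision)
/// let recovered = pt.inverse_transform(&transformed).unwrap();
/// assert_eq!(recovered.shape(), data.shape());
/// ```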
#[derive(Debug, Clone)]
pub struct PowerTransformer {
    /// Transformation method ('yeo-johnson' or 'box-cox')
    method: String,
    /// Whether to standardize the output
    standardize: bool,
    /// Optimal lambda parameters for each feature (computed during fit)
    lambdas_: Option<Array1<f64>>,
    /// Means for standardization (computed during fit)
    means_: Option<Array1<f64>>,
    /// Standard deviations for standardization (computed during fit)
    stds_: Option<Array1<f64>>,
    /// Whether the transformer has been fitted
    is_fitted: bool,
}

impl PowerTransformer {
    /// Creates a new PowerTransformer
    ///
    /// # Arguments
    /// * `method` - The transformation method ('yeo-johnson' or 'box-cox')
    /// * `standardize` - Whether to standardize the output to have zero mean and unit variance
    ///
    /// # Returns
    /// * A new PowerTransformer instance
    pub fn new(method: &str, standardize: bool) -> Result<Self> {
        if method != "yeo-johnson" && method != "box-cox" {
            return Err(TransformError::InvalidInput(
                "method must be 'yeo-johnson' or 'box-cox'".to_string(),
            ));
        }

        Ok(PowerTransformer {
            method: method.to_string(),
            standardize,
            lambdas_: None,
            means_: None,
            stds_: None,
            is_fitted: false,
        })
    }

    /// Creates a new PowerTransformer with the Yeo-Johnson method
    #[allow(dead_code)]
    pub fn yeo_johnson(standardize: bool) -> Self {
        PowerTransformer {
            method: "yeo-johnson".to_string(),
            standardize,
            lambdas_: None,
            means_: None,
            stds_: None,
            is_fitted: false,
        }
    }

    /// Creates a new PowerTransformer with the Box-Cox method
    #[allow(dead_code)]
    pub fn box_cox(standardize: bool) -> Self {
        PowerTransformer {
            method: "box-cox".to_string(),
            standardize,
            lambdas_: None,
            means_: None,
            stds_: None,
            is_fitted: false,
        }
    }

    /// Fits the PowerTransformer to the input data
    ///
    /// This computes the optimal lambda parameters for each feature using maximum likelihood estimation
    ///
    /// # Arguments
    /// * `x` - The input data, shape (n_samples, n_features)
    ///
    /// # Returns
    /// * `Result<()>` - Ok if successful, Err otherwise
    pub fn fit<S>(&mut self, x: &ArrayBase<S, Ix2>) -> Result<()>
    where
        S: Data,
        S::Elem: Float + NumCast,
    {
        let x_f64 = x.mapv(|x| NumCast::from(x).unwrap_or(0.0));

        if !x_f64.is_standard_layout() {
            return Err(TransformError::InvalidInput(
                "Input array must be in standard memory layout".to_string(),
            ));
        }

        let shape = x_f64.shape();
        let n_samples = shape[0];
        let n_features = shape[1];

        if n_samples == 0 || n_features == 0 {
            return Err(TransformError::InvalidInput("Empty input data".to_string()));
        }

        if self.method == "box-cox" {
            // Box-Cox requires strictly positive data
            if x_f64.iter().any(|&x| x <= 0.0) {
                return Err(TransformError::InvalidInput(
                    "Box-Cox transformation requires strictly positive data".to_string(),
                ));
            }
        }

        // Compute optimal lambda for each feature in parallel
        let lambdas: Vec<f64> = (0..n_features)
            .into_par_iter()
            .map(|j| {
                let feature = x_f64.column(j).to_vec();
                self.optimize_lambda(&feature)
            })
            .collect();

        self.lambdas_ = Some(Array1::from_vec(lambdas));

        // If standardization is requested, we need to compute means and stds after transformation
        if self.standardize {
            let mut means = Array1::zeros(n_features);
            let mut stds = Array1::zeros(n_features);

            // Transform data with optimal lambdas and compute statistics
            for j in 0..n_features {
                let lambda = self.lambdas_.as_ref().unwrap()[j];
                let mut transformed_feature = Array1::zeros(n_samples);

                // Apply transformation to each sample in the feature
                for i in 0..n_samples {
                    let x = x_f64[[i, j]];
                    transformed_feature[i] = if self.method == "yeo-johnson" {
                        yeo_johnson_transform(x, lambda)
                    } else {
                        box_cox_transform(x, lambda)
                    };
                }

                // Compute mean and standard deviation
                let mean = transformed_feature.sum() / n_samples as f64;
                let variance = transformed_feature
                    .iter()
                    .map(|&x| (x - mean).powi(2))
                    .sum::<f64>()
                    / n_samples as f64;
                let std_dev = variance.sqrt();

                means[j] = mean;
                stds[j] = if std_dev > EPSILON { std_dev } else { 1.0 };
            }

            self.means_ = Some(means);
            self.stds_ = Some(stds);
        }

        self.is_fitted = true;
        Ok(())
    }

    /// Transforms the input data using the fitted parameters
    ///
    /// # Arguments
    /// * `x` - The input data to transform
    ///
    /// # Returns
    /// * `Result<Array2<f64>>` - The transformed data
    pub fn transform<S>(&self, x: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
    where
        S: Data,
        S::Elem: Float + NumCast,
    {
        if !self.is_fitted {
            return Err(TransformError::InvalidInput(
                "PowerTransformer must be fitted before transform".to_string(),
            ));
        }

        let x_f64 = x.mapv(|x| NumCast::from(x).unwrap_or(0.0));

        if !x_f64.is_standard_layout() {
            return Err(TransformError::InvalidInput(
                "Input array must be in standard memory layout".to_string(),
            ));
        }

        let shape = x_f64.shape();
        let n_samples = shape[0];
        let n_features = shape[1];

        let lambdas = self.lambdas_.as_ref().unwrap();

        if n_features != lambdas.len() {
            return Err(TransformError::InvalidInput(
                "Number of features in transform data does not match fitted data".to_string(),
            ));
        }

        let mut transformed = Array2::zeros((n_samples, n_features));

        // Apply transformation in parallel for each feature
        let transformed_data: Vec<Vec<f64>> = (0..n_features)
            .into_par_iter()
            .map(|j| {
                let lambda = lambdas[j];
                // Transform all samples for this feature
                (0..n_samples)
                    .map(|i| {
                        let x = x_f64[[i, j]];
                        if self.method == "yeo-johnson" {
                            yeo_johnson_transform(x, lambda)
                        } else {
                            box_cox_transform(x, lambda)
                        }
                    })
                    .collect()
            })
            .collect();

        // Copy results back to the array
        for j in 0..n_features {
            for i in 0..n_samples {
                transformed[[i, j]] = transformed_data[j][i];
            }
        }

        // Apply standardization if requested
        if self.standardize {
            let means = self.means_.as_ref().unwrap();
            let stds = self.stds_.as_ref().unwrap();

            for j in 0..n_features {
                let mean = means[j];
                let std = stds[j];

                for i in 0..n_samples {
                    transformed[[i, j]] = (transformed[[i, j]] - mean) / std;
                }
            }
        }

        Ok(transformed)
    }

    /// Fits the transformer and transforms the data in one step
    ///
    /// # Arguments
    /// * `x` - The input data to fit and transform
    ///
    /// # Returns
    /// * `Result<Array2<f64>>` - The transformed data
    pub fn fit_transform<S>(&mut self, x: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
    where
        S: Data,
        S::Elem: Float + NumCast,
    {
        self.fit(x)?;
        self.transform(x)
    }

    /// Applies the inverse transformation to recover the original data
    ///
    /// # Arguments
    /// * `x` - The transformed data to inverse transform
    ///
    /// # Returns
    /// * `Result<Array2<f64>>` - The inverse transformed data
    pub fn inverse_transform<S>(&self, x: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
    where
        S: Data,
        S::Elem: Float + NumCast,
    {
        if !self.is_fitted {
            return Err(TransformError::InvalidInput(
                "PowerTransformer must be fitted before inverse_transform".to_string(),
            ));
        }

        let x_f64 = x.mapv(|x| NumCast::from(x).unwrap_or(0.0));

        if !x_f64.is_standard_layout() {
            return Err(TransformError::InvalidInput(
                "Input array must be in standard memory layout".to_string(),
            ));
        }

        let shape = x_f64.shape();
        let n_samples = shape[0];
        let n_features = shape[1];

        let lambdas = self.lambdas_.as_ref().unwrap();

        if n_features != lambdas.len() {
            return Err(TransformError::InvalidInput(
                "Number of features in inverse transform data does not match fitted data"
                    .to_string(),
            ));
        }

        let mut x_normalized = x_f64.clone();

        // Reverse standardization if it was applied
        if self.standardize {
            let means = self.means_.as_ref().unwrap();
            let stds = self.stds_.as_ref().unwrap();

            for j in 0..n_features {
                let mean = means[j];
                let std = stds[j];

                for i in 0..n_samples {
                    x_normalized[[i, j]] = x_normalized[[i, j]] * std + mean;
                }
            }
        }

        let mut original = Array2::zeros((n_samples, n_features));

        // Apply inverse transformation in parallel for each feature
        let original_data: Vec<Vec<f64>> = (0..n_features)
            .into_par_iter()
            .map(|j| {
                let lambda = lambdas[j];
                // Apply inverse transformation to all samples for this feature
                (0..n_samples)
                    .map(|i| {
                        let y = x_normalized[[i, j]];
                        if self.method == "yeo-johnson" {
                            yeo_johnson_inverse_transform(y, lambda)
                        } else {
                            box_cox_inverse_transform(y, lambda)
                        }
                    })
                    .collect()
            })
            .collect();

        // Copy results back to the array
        for j in 0..n_features {
            for i in 0..n_samples {
                original[[i, j]] = original_data[j][i];
            }
        }

        Ok(original)
    }

    /// Optimizes the lambda parameter for a single feature using maximum likelihood estimation
    ///
    /// # Arguments
    /// * `data` - The feature data to optimize lambda for
    ///
    /// # Returns
    /// * The optimal lambda value
    fn optimize_lambda(&self, data: &[f64]) -> f64 {
        // Use golden section search to find the optimal lambda in [-2, 2]
        let mut a = -2.0;
        let mut b = 2.0;
        let tolerance = 1e-6;
        let golden_ratio = (5.0_f64.sqrt() - 1.0) / 2.0;

        // Golden section search
        let mut c = b - golden_ratio * (b - a);
        let mut d = a + golden_ratio * (b - a);

        while (b - a).abs() > tolerance {
            let fc = -self.log_likelihood(data, c);
            let fd = -self.log_likelihood(data, d);

            if fc < fd {
                b = d;
                d = c;
                c = b - golden_ratio * (b - a);
            } else {
                a = c;
                c = d;
                d = a + golden_ratio * (b - a);
            }
        }

        (a + b) / 2.0
    }

    /// Computes the log-likelihood for a given lambda parameter
    ///
    /// # Arguments
    /// * `data` - The feature data
    /// * `lambda` - The lambda parameter to evaluate
    ///
    /// # Returns
    /// * The log-likelihood value
    fn log_likelihood(&self, data: &[f64], lambda: f64) -> f64 {
        let n = data.len() as f64;
        let mut log_likelihood = 0.0;

        // Transform the data
        let transformed: Vec<f64> = data
            .iter()
            .map(|&x| {
                if self.method == "yeo-johnson" {
                    yeo_johnson_transform(x, lambda)
                } else {
                    box_cox_transform(x, lambda)
                }
            })
            .collect();

        // Compute mean and variance
        let mean = transformed.iter().sum::<f64>() / n;
        let variance = transformed.iter().map(|&x| (x - mean).powi(2)).sum::<f64>() / n;

        if variance <= 0.0 {
            return f64::NEG_INFINITY;
        }

        // Log-likelihood of normal distribution
        log_likelihood -= 0.5 * n * (2.0 * std::f64::consts::PI * variance).ln();
        log_likelihood -=
            0.5 * transformed.iter().map(|&x| (x - mean).powi(2)).sum::<f64>() / variance;

        // Add Jacobian term
        for &x in data {
            if self.method == "yeo-johnson" {
                log_likelihood += yeo_johnson_log_jacobian(x, lambda);
            } else {
                log_likelihood += box_cox_log_jacobian(x, lambda);
            }
        }

        log_likelihood
    }

    /// Returns the fitted lambda parameters
    #[allow(dead_code)]
    pub fn lambdas(&self) -> Option<&Array1<f64>> {
        self.lambdas_.as_ref()
    }

    /// Returns whether the transformer has been fitted
    #[allow(dead_code)]
    pub fn is_fitted(&self) -> bool {
        self.is_fitted
    }
}

impl Default for PowerTransformer {
    fn default() -> Self {
        PowerTransformer {
            method: "yeo-johnson".to_string(),
            standardize: false,
            lambdas_: None,
            means_: None,
            stds_: None,
            is_fitted: false,
        }
    }
}

/// Apply Yeo-Johnson inverse transformation to a single value
#[allow(dead_code)]
fn yeo_johnson_inverse_transform(y: f64, lambda: f64) -> f64 {
    if y >= 0.0 {
        if (lambda - 0.0).abs() < EPSILON {
            y.exp() - 1.0
        } else {
            (lambda * y + 1.0).powf(1.0 / lambda) - 1.0
        }
    } else if (lambda - 2.0).abs() < EPSILON {
        1.0 - (-y).exp()
    } else {
        1.0 - (-(2.0 - lambda) * y + 1.0).powf(1.0 / (2.0 - lambda))
    }
}

/// Apply Box-Cox inverse transformation to a single value
#[allow(dead_code)]
fn box_cox_inverse_transform(y: f64, lambda: f64) -> f64 {
    if (lambda - 0.0).abs() < EPSILON {
        y.exp()
    } else {
        (lambda * y + 1.0).powf(1.0 / lambda)
    }
}

/// Compute the log of the Jacobian for the Yeo-Johnson transformation
#[allow(dead_code)]
fn yeo_johnson_log_jacobian(x: f64, lambda: f64) -> f64 {
    if x >= 0.0 {
        (lambda - 1.0) * (x + 1.0).ln()
    } else {
        (1.0 - lambda) * (-x + 1.0).ln()
    }
}

/// Compute the log of the Jacobian for the Box-Cox transformation
#[allow(dead_code)]
fn box_cox_log_jacobian(x: f64, lambda: f64) -> f64 {
    (lambda - 1.0) * x.ln()
}

/// Creates log-transformed features
///
/// # Arguments
/// * `array` - The input array to transform
/// * `epsilon` - A small positive value added to all elements before taking the log,
///   to avoid taking the log of zero or negative values
///
/// # Returns
/// * `Result<Array2<f64>>` - The log-transformed array
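///
/// # Examples
///
/// A minimal sketch computing ln(x + 1) element-wise:
///
/// ```
/// use scirs2_core::ndarray::array;
/// use scirs2_transform::features::log_transform;
///
/// let data = array![[0.0, 1.0], [2.0, 3.0]];
/// let transformed = log_transform(&data, 1.0).unwrap();
/// assert!((transformed[[0, 0]] - 0.0_f64).abs() < 1e-12);
/// ```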
#[allow(dead_code)]
pub fn log_transform<S>(array: &ArrayBase<S, Ix2>, epsilon: f64) -> Result<Array2<f64>>
where
    S: Data,
    S::Elem: Float + NumCast,
{
    let array_f64 = array.mapv(|x| NumCast::from(x).unwrap_or(0.0));

    if !array_f64.is_standard_layout() {
        return Err(TransformError::InvalidInput(
            "Input array must be in standard memory layout".to_string(),
        ));
    }

    if epsilon <= 0.0 {
        return Err(TransformError::InvalidInput(
            "epsilon must be a positive value".to_string(),
        ));
    }

    let mut transformed = Array2::zeros(array_f64.raw_dim());

    for i in 0..array_f64.shape()[0] {
        for j in 0..array_f64.shape()[1] {
            transformed[[i, j]] = (array_f64[[i, j]] + epsilon).ln();
        }
    }

    Ok(transformed)
}
1531
1532#[cfg(test)]
1533mod tests {
1534    use super::*;
1535    use approx::assert_abs_diff_eq;
1536    use scirs2_core::ndarray::Array;
1537
1538    #[test]
1539    fn test_polynomial_features() {
1540        let data = Array::from_shape_vec((2, 2), vec![1.0, 2.0, 3.0, 4.0]).unwrap();
1541
1542        // Test with degree=2, interaction_only=false, includebias=true
1543        let poly = PolynomialFeatures::new(2, false, true);
1544        let transformed = poly.transform(&data).unwrap();
1545
1546        // Expected features: [1, x1, x2, x1^2, x1*x2, x2^2]
1547        assert_eq!(transformed.shape(), &[2, 6]);
1548
1549        // Since the exact order can differ based on implementation,
1550        // let's just verify that the expected results are all present
1551        let expected_values = [1.0, 1.0, 2.0, 1.0, 2.0, 4.0];
1552        let mut found_values = [false; 6];
1553
1554        for i in 0..6 {
1555            for j in 0..6 {
1556                if (transformed[[0, i]] - expected_values[j]).abs() < 1e-10 {
1557                    found_values[j] = true;
1558                }
1559            }
1560        }
1561
1562        assert!(
1563            found_values.iter().all(|&x| x),
1564            "Not all expected values found in the first row"
1565        );
1566
1567        // Second row: Similar approach for second row
1568        let expected_values = [1.0, 3.0, 4.0, 9.0, 12.0, 16.0];
1569        let mut found_values = [false; 6];
1570
1571        for i in 0..6 {
1572            for j in 0..6 {
1573                if (transformed[[1, i]] - expected_values[j]).abs() < 1e-10 {
1574                    found_values[j] = true;
1575                }
1576            }
1577        }
1578
1579        assert!(
1580            found_values.iter().all(|&x| x),
1581            "Not all expected values found in the second row"
1582        );
1583    }

    #[test]
    fn test_binarize() {
        let data = Array::from_shape_vec((2, 3), vec![1.0, -1.0, 2.0, 2.0, 0.0, -3.0]).unwrap();

        let binarized = binarize(&data, 0.0).unwrap();

        // Expected result: [[1, 0, 1], [1, 0, 0]]
        assert_eq!(binarized.shape(), &[2, 3]);

        assert_abs_diff_eq!(binarized[[0, 0]], 1.0, epsilon = 1e-10);
        assert_abs_diff_eq!(binarized[[0, 1]], 0.0, epsilon = 1e-10);
        assert_abs_diff_eq!(binarized[[0, 2]], 1.0, epsilon = 1e-10);
        assert_abs_diff_eq!(binarized[[1, 0]], 1.0, epsilon = 1e-10);
        assert_abs_diff_eq!(binarized[[1, 1]], 0.0, epsilon = 1e-10);
        assert_abs_diff_eq!(binarized[[1, 2]], 0.0, epsilon = 1e-10);
    }

    #[test]
    fn test_discretize_equal_width() {
        let data = Array::from_shape_vec((3, 2), vec![0.0, 0.0, 3.0, 5.0, 6.0, 10.0]).unwrap();

        // Test with ordinal encoding
        let discretized = discretize_equal_width(&data, 2, "ordinal", 0).unwrap();

        // Expected result: [[0, 0], [0, 0], [1, 1]]
        assert_eq!(discretized.shape(), &[3, 2]);

        assert_abs_diff_eq!(discretized[[0, 0]], 0.0, epsilon = 1e-10);
        assert_abs_diff_eq!(discretized[[0, 1]], 0.0, epsilon = 1e-10);
        assert_abs_diff_eq!(discretized[[1, 0]], 0.0, epsilon = 1e-10);
        assert_abs_diff_eq!(discretized[[1, 1]], 0.0, epsilon = 1e-10);
        assert_abs_diff_eq!(discretized[[2, 0]], 1.0, epsilon = 1e-10);
        assert_abs_diff_eq!(discretized[[2, 1]], 1.0, epsilon = 1e-10);

        // Test with one-hot encoding
        let discretized = discretize_equal_width(&data, 2, "onehot", 0).unwrap();

        // Expected result: [
        //   [1, 0, 1, 0],
        //   [1, 0, 1, 0],
        //   [0, 1, 0, 1]
        // ]
        assert_eq!(discretized.shape(), &[3, 4]);

        assert_abs_diff_eq!(discretized[[0, 0]], 1.0, epsilon = 1e-10);
        assert_abs_diff_eq!(discretized[[0, 1]], 0.0, epsilon = 1e-10);
        assert_abs_diff_eq!(discretized[[0, 2]], 1.0, epsilon = 1e-10);
        assert_abs_diff_eq!(discretized[[0, 3]], 0.0, epsilon = 1e-10);

        assert_abs_diff_eq!(discretized[[1, 0]], 1.0, epsilon = 1e-10);
        assert_abs_diff_eq!(discretized[[1, 1]], 0.0, epsilon = 1e-10);
        assert_abs_diff_eq!(discretized[[1, 2]], 1.0, epsilon = 1e-10);
        assert_abs_diff_eq!(discretized[[1, 3]], 0.0, epsilon = 1e-10);

        assert_abs_diff_eq!(discretized[[2, 0]], 0.0, epsilon = 1e-10);
        assert_abs_diff_eq!(discretized[[2, 1]], 1.0, epsilon = 1e-10);
        assert_abs_diff_eq!(discretized[[2, 2]], 0.0, epsilon = 1e-10);
        assert_abs_diff_eq!(discretized[[2, 3]], 1.0, epsilon = 1e-10);
    }

    #[test]
    fn test_log_transform() {
        let data = Array::from_shape_vec((2, 2), vec![0.0, 1.0, 2.0, 3.0]).unwrap();

        let transformed = log_transform(&data, 1.0).unwrap();

        // Expected: ln(x + 1)
        assert_abs_diff_eq!(transformed[[0, 0]], (0.0 + 1.0).ln(), epsilon = 1e-10);
        assert_abs_diff_eq!(transformed[[0, 1]], (1.0 + 1.0).ln(), epsilon = 1e-10);
        assert_abs_diff_eq!(transformed[[1, 0]], (2.0 + 1.0).ln(), epsilon = 1e-10);
        assert_abs_diff_eq!(transformed[[1, 1]], (3.0 + 1.0).ln(), epsilon = 1e-10);
    }
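
    // ln(0) is undefined, which is what the epsilon offset in `log_transform`
    // guards against: a zero entry maps to ln(epsilon) rather than -inf. A
    // minimal sketch of that edge case, assuming any strictly positive epsilon
    // is accepted:
    #[test]
    fn test_log_transform_zero_guard() {
        let data = Array::from_shape_vec((1, 2), vec![0.0, 9.0]).unwrap();
        let transformed = log_transform(&data, 1e-3).unwrap();

        assert_abs_diff_eq!(transformed[[0, 0]], (1e-3_f64).ln(), epsilon = 1e-10);
        assert_abs_diff_eq!(transformed[[0, 1]], (9.0 + 1e-3_f64).ln(), epsilon = 1e-10);
    }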

    #[test]
    fn test_power_transformer_yeo_johnson() {
        // Test data that includes negative values (Yeo-Johnson can handle these)
        let data =
            Array::from_shape_vec((4, 2), vec![1.0, -1.0, 2.0, 0.5, 3.0, -2.0, 0.1, 1.5]).unwrap();

        let mut transformer = PowerTransformer::yeo_johnson(false);

        // Test that transformer is not fitted initially
        assert!(!transformer.is_fitted());

        // Fit the transformer
        transformer.fit(&data).unwrap();
        assert!(transformer.is_fitted());

        // Check that lambdas were computed
        let lambdas = transformer.lambdas().unwrap();
        assert_eq!(lambdas.len(), 2);

        // Transform the data
        let transformed = transformer.transform(&data).unwrap();
        assert_eq!(transformed.shape(), data.shape());

        // Test inverse transformation
        let inverse = transformer.inverse_transform(&transformed).unwrap();
        assert_eq!(inverse.shape(), data.shape());

        // Check that inverse transformation approximately recovers original data
        for i in 0..data.shape()[0] {
            for j in 0..data.shape()[1] {
                assert_abs_diff_eq!(inverse[[i, j]], data[[i, j]], epsilon = 1e-6);
            }
        }
    }

    #[test]
    fn test_power_transformer_box_cox() {
        // Test data with strictly positive values (Box-Cox requirement)
        let data =
            Array::from_shape_vec((4, 2), vec![1.0, 2.0, 3.0, 4.0, 0.5, 1.5, 2.5, 3.5]).unwrap();

        let mut transformer = PowerTransformer::box_cox(false);

        // Fit the transformer
        transformer.fit(&data).unwrap();

        // Transform the data
        let transformed = transformer.transform(&data).unwrap();
        assert_eq!(transformed.shape(), data.shape());

        // Test inverse transformation
        let inverse = transformer.inverse_transform(&transformed).unwrap();

        // Check that inverse transformation approximately recovers original data
        for i in 0..data.shape()[0] {
            for j in 0..data.shape()[1] {
                assert_abs_diff_eq!(inverse[[i, j]], data[[i, j]], epsilon = 1e-6);
            }
        }
    }

    #[test]
    fn test_power_transformer_standardized() {
        let data = Array::from_shape_vec(
            (5, 2),
            vec![1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0, 5.0, 5.0, 6.0],
        )
        .unwrap();

        let mut transformer = PowerTransformer::yeo_johnson(true);

        // Fit and transform with standardization
        let transformed = transformer.fit_transform(&data).unwrap();

        // Check that each feature has approximately zero mean and unit variance
        for j in 0..transformed.shape()[1] {
            let column = transformed.column(j);
            let mean = column.sum() / column.len() as f64;
            let variance =
                column.iter().map(|&x| (x - mean).powi(2)).sum::<f64>() / column.len() as f64;

            assert_abs_diff_eq!(mean, 0.0, epsilon = 1e-10);
            assert_abs_diff_eq!(variance, 1.0, epsilon = 1e-10);
        }
    }

    #[test]
    fn test_power_transformer_box_cox_negative_data() {
        // Test that Box-Cox fails with negative data
        let data = Array::from_shape_vec((2, 2), vec![1.0, -1.0, 2.0, 3.0]).unwrap();

        let mut transformer = PowerTransformer::box_cox(false);

        // Should fail because data contains negative values
        assert!(transformer.fit(&data).is_err());
    }

    #[test]
    fn test_power_transformer_fit_transform() {
        let data = Array::from_shape_vec((3, 2), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();

        let mut transformer = PowerTransformer::yeo_johnson(false);

        // Test fit_transform convenience method
        let transformed1 = transformer.fit_transform(&data).unwrap();

        // Compare with separate fit and transform
        let mut transformer2 = PowerTransformer::yeo_johnson(false);
        transformer2.fit(&data).unwrap();
        let transformed2 = transformer2.transform(&data).unwrap();

        // Results should be identical
        for i in 0..transformed1.shape()[0] {
            for j in 0..transformed1.shape()[1] {
                assert_abs_diff_eq!(transformed1[[i, j]], transformed2[[i, j]], epsilon = 1e-12);
            }
        }
    }

    #[test]
    fn test_power_transformer_different_data_sizes() {
        let train_data = Array::from_shape_vec((3, 2), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();
        let test_data = Array::from_shape_vec((2, 2), vec![2.5, 3.5, 4.5, 5.5]).unwrap();

        let mut transformer = PowerTransformer::yeo_johnson(false);

        // Fit on training data
        transformer.fit(&train_data).unwrap();

        // Transform test data (different number of samples)
        let transformed = transformer.transform(&test_data).unwrap();
        assert_eq!(transformed.shape(), test_data.shape());

        // Test inverse transformation on test data
        let inverse = transformer.inverse_transform(&transformed).unwrap();

        for i in 0..test_data.shape()[0] {
            for j in 0..test_data.shape()[1] {
                assert_abs_diff_eq!(inverse[[i, j]], test_data[[i, j]], epsilon = 1e-6);
            }
        }
    }

    #[test]
    fn test_power_transformer_mismatched_features() {
        let train_data = Array::from_shape_vec((2, 2), vec![1.0, 2.0, 3.0, 4.0]).unwrap();
        let test_data = Array::from_shape_vec((2, 3), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();

        let mut transformer = PowerTransformer::yeo_johnson(false);
        transformer.fit(&train_data).unwrap();

        // Should fail because number of features doesn't match
        assert!(transformer.transform(&test_data).is_err());
    }

    #[test]
    fn test_power_transformer_not_fitted() {
        let data = Array::from_shape_vec((2, 2), vec![1.0, 2.0, 3.0, 4.0]).unwrap();
        let transformer = PowerTransformer::yeo_johnson(false);

        // Should fail because transformer hasn't been fitted
        assert!(transformer.transform(&data).is_err());
        assert!(transformer.inverse_transform(&data).is_err());
    }

    #[test]
    fn test_power_transformer_creation_methods() {
        // Test different creation methods
        let transformer1 = PowerTransformer::new("yeo-johnson", true).unwrap();
        let transformer2 = PowerTransformer::yeo_johnson(true);

        // Should be equivalent
        assert_eq!(transformer1.method, transformer2.method);
        assert_eq!(transformer1.standardize, transformer2.standardize);

        let transformer3 = PowerTransformer::new("box-cox", false).unwrap();
        let transformer4 = PowerTransformer::box_cox(false);

        assert_eq!(transformer3.method, transformer4.method);
        assert_eq!(transformer3.standardize, transformer4.standardize);

        // Test invalid method
        assert!(PowerTransformer::new("invalid", false).is_err());
    }

    #[test]
    fn test_power_transformer_empty_data() {
        let empty_data = Array2::<f64>::zeros((0, 2));
        let mut transformer = PowerTransformer::yeo_johnson(false);

        // Should fail with empty data
        assert!(transformer.fit(&empty_data).is_err());
    }

    #[test]
    fn test_yeo_johnson_inverse_functions() {
        let test_values = vec![-2.0, -0.5, 0.0, 0.5, 1.0, 2.0];
        let lambdas = vec![0.0, 0.5, 1.0, 1.5, 2.0];

        for &lambda in &lambdas {
            for &x in &test_values {
                let y = yeo_johnson_transform(x, lambda);
                let x_recovered = yeo_johnson_inverse_transform(y, lambda);
                assert_abs_diff_eq!(x_recovered, x, epsilon = 1e-10);
            }
        }
    }
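
    // For reference, Yeo-Johnson is piecewise in x and lambda:
    //   x >= 0, lambda != 0:  ((x + 1)^lambda - 1) / lambda
    //   x >= 0, lambda == 0:  ln(x + 1)
    //   x <  0, lambda != 2:  -((1 - x)^(2 - lambda) - 1) / (2 - lambda)
    //   x <  0, lambda == 2:  -ln(1 - x)
    // A hedged spot-check of the module's forward transform against this
    // textbook definition, assuming the standard formulation is used:
    #[test]
    fn test_yeo_johnson_reference_formula() {
        let reference = |x: f64, lambda: f64| -> f64 {
            if x >= 0.0 {
                if lambda.abs() < EPSILON {
                    (x + 1.0).ln()
                } else {
                    ((x + 1.0).powf(lambda) - 1.0) / lambda
                }
            } else if (lambda - 2.0).abs() < EPSILON {
                -(1.0 - x).ln()
            } else {
                -((1.0 - x).powf(2.0 - lambda) - 1.0) / (2.0 - lambda)
            }
        };

        for &lambda in &[0.0, 0.5, 1.0, 2.0] {
            for &x in &[-1.5, -0.5, 0.0, 0.5, 2.0] {
                assert_abs_diff_eq!(
                    yeo_johnson_transform(x, lambda),
                    reference(x, lambda),
                    epsilon = 1e-10
                );
            }
        }
    }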

    #[test]
    fn test_box_cox_inverse_functions() {
        let test_values = vec![0.1, 0.5, 1.0, 2.0, 5.0]; // Positive values only
        let lambdas = vec![0.0, 0.5, 1.0, 1.5, 2.0];

        for &lambda in &lambdas {
            for &x in &test_values {
                let y = box_cox_transform(x, lambda);
                let x_recovered = box_cox_inverse_transform(y, lambda);
                assert_abs_diff_eq!(x_recovered, x, epsilon = 1e-10);
            }
        }
    }
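
    // Box-Cox has a single branch point at lambda = 0:
    //   lambda != 0: (x^lambda - 1) / lambda
    //   lambda == 0: ln(x)
    // A hedged spot-check against that textbook definition, assuming the
    // module follows the standard formulation:
    #[test]
    fn test_box_cox_reference_formula() {
        for &x in &[0.1_f64, 1.0, 5.0] {
            // lambda = 0 reduces to the plain logarithm
            assert_abs_diff_eq!(box_cox_transform(x, 0.0), x.ln(), epsilon = 1e-10);
            // lambda = 0.5 is the scaled square-root form
            assert_abs_diff_eq!(
                box_cox_transform(x, 0.5),
                (x.sqrt() - 1.0) / 0.5,
                epsilon = 1e-10
            );
        }
    }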
}