// sklears_preprocessing/information_theory.rs

1//! Information-theoretic features and transformations
2//!
3//! This module provides feature engineering and selection based on information theory including:
4//! - Entropy measures (Shannon, Renyi, permutation entropy)
5//! - Mutual information and conditional mutual information
6//! - Information gain for feature selection
7//! - Transfer entropy for causality detection
8//! - Complexity measures (Lempel-Ziv, approximate entropy)
9//! - Information bottleneck features
10
11use scirs2_core::ndarray::{s, Array1, Array2};
12use sklears_core::prelude::*;
13use std::collections::HashMap;
14
15// ================================================================================================
16// Entropy Calculations
17// ================================================================================================
18
19/// Calculate Shannon entropy for a discrete distribution
20///
21/// H(X) = -∑ p(x) log₂ p(x)
22pub fn shannon_entropy(data: &Array1<f64>, bins: usize) -> Result<f64> {
23    if data.is_empty() {
24        return Err(SklearsError::InvalidInput(
25            "Data array cannot be empty".to_string(),
26        ));
27    }
28
29    let probabilities = compute_probabilities(data, bins)?;
30    let mut entropy = 0.0;
31
32    for &p in probabilities.iter() {
33        if p > 0.0 {
34            entropy -= p * p.log2();
35        }
36    }
37
38    Ok(entropy)
39}
40
41/// Calculate Renyi entropy of order alpha
42///
43/// H_α(X) = 1/(1-α) log₂(∑ p(x)^α)
44pub fn renyi_entropy(data: &Array1<f64>, bins: usize, alpha: f64) -> Result<f64> {
45    if alpha <= 0.0 {
46        return Err(SklearsError::InvalidInput(
47            "Alpha must be positive".to_string(),
48        ));
49    }
50
51    if (alpha - 1.0).abs() < 1e-10 {
52        // Limit case: Renyi entropy converges to Shannon entropy as alpha → 1
53        return shannon_entropy(data, bins);
54    }
55
56    let probabilities = compute_probabilities(data, bins)?;
57    let sum: f64 = probabilities.iter().map(|&p| p.powf(alpha)).sum();
58
59    if sum <= 0.0 {
60        return Ok(0.0);
61    }
62
63    Ok((1.0 / (1.0 - alpha)) * sum.log2())
64}
65
66/// Calculate permutation entropy (ordinal pattern-based entropy)
67pub fn permutation_entropy(data: &Array1<f64>, order: usize, delay: usize) -> Result<f64> {
68    if data.len() < order {
69        return Err(SklearsError::InvalidInput(
70            "Data length must be at least equal to order".to_string(),
71        ));
72    }
73
74    let n = data.len() - (order - 1) * delay;
75    if n == 0 {
76        return Err(SklearsError::InvalidInput(
77            "Insufficient data for given order and delay".to_string(),
78        ));
79    }
80
81    let mut pattern_counts: HashMap<Vec<usize>, usize> = HashMap::new();
82
83    for i in 0..n {
84        let mut pattern = Vec::with_capacity(order);
85        for j in 0..order {
86            pattern.push((i + j * delay, data[i + j * delay]));
87        }
88
89        // Sort by value, keeping track of original indices
90        pattern.sort_by(|a, b| a.1.partial_cmp(&b.1).expect("operation should succeed"));
91
92        // Extract the ordinal pattern
93        let ordinal_pattern: Vec<usize> = pattern.iter().map(|&(idx, _)| idx % order).collect();
94
95        *pattern_counts.entry(ordinal_pattern).or_insert(0) += 1;
96    }
97
98    let mut entropy = 0.0;
99    for &count in pattern_counts.values() {
100        let p = count as f64 / n as f64;
101        if p > 0.0 {
102            entropy -= p * p.log2();
103        }
104    }
105
106    Ok(entropy)
107}
108
109/// Calculate approximate entropy (ApEn) - measure of regularity
110pub fn approximate_entropy(data: &Array1<f64>, m: usize, r: f64) -> Result<f64> {
111    if data.len() < m + 1 {
112        return Err(SklearsError::InvalidInput(
113            "Data length must be greater than m".to_string(),
114        ));
115    }
116
117    let n = data.len();
118    let phi_m = phi_function(data, m, r, n)?;
119    let phi_m1 = phi_function(data, m + 1, r, n)?;
120
121    Ok(phi_m - phi_m1)
122}
123
124/// Helper function for approximate entropy
125fn phi_function(data: &Array1<f64>, m: usize, r: f64, n: usize) -> Result<f64> {
126    let mut patterns = Vec::new();
127
128    for i in 0..=(n - m) {
129        let pattern: Vec<f64> = (0..m).map(|j| data[i + j]).collect();
130        patterns.push(pattern);
131    }
132
133    let mut phi = 0.0;
134
135    for i in 0..patterns.len() {
136        let mut count = 0;
137        for j in 0..patterns.len() {
138            let max_diff = patterns[i]
139                .iter()
140                .zip(patterns[j].iter())
141                .map(|(a, b)| (a - b).abs())
142                .fold(0.0, f64::max);
143
144            if max_diff <= r {
145                count += 1;
146            }
147        }
148
149        let c_i = count as f64 / patterns.len() as f64;
150        if c_i > 0.0 {
151            phi += c_i.ln();
152        }
153    }
154
155    Ok(phi / patterns.len() as f64)
156}
157
158// ================================================================================================
159// Mutual Information
160// ================================================================================================
161
162/// Calculate mutual information between two variables
163///
164/// I(X;Y) = H(X) + H(Y) - H(X,Y)
165pub fn mutual_information(x: &Array1<f64>, y: &Array1<f64>, bins: usize) -> Result<f64> {
166    if x.len() != y.len() {
167        return Err(SklearsError::InvalidInput(
168            "Arrays must have the same length".to_string(),
169        ));
170    }
171
172    let h_x = shannon_entropy(x, bins)?;
173    let h_y = shannon_entropy(y, bins)?;
174    let h_xy = joint_entropy(x, y, bins)?;
175
176    Ok(h_x + h_y - h_xy)
177}
178
179/// Calculate joint entropy H(X,Y)
180pub fn joint_entropy(x: &Array1<f64>, y: &Array1<f64>, bins: usize) -> Result<f64> {
181    if x.len() != y.len() {
182        return Err(SklearsError::InvalidInput(
183            "Arrays must have the same length".to_string(),
184        ));
185    }
186
187    let joint_probs = compute_joint_probabilities(x, y, bins)?;
188    let mut entropy = 0.0;
189
190    for &p in joint_probs.values() {
191        if p > 0.0 {
192            entropy -= p * p.log2();
193        }
194    }
195
196    Ok(entropy)
197}
198
199/// Calculate conditional entropy H(Y|X)
200pub fn conditional_entropy(y: &Array1<f64>, x: &Array1<f64>, bins: usize) -> Result<f64> {
201    let h_xy = joint_entropy(x, y, bins)?;
202    let h_x = shannon_entropy(x, bins)?;
203    Ok(h_xy - h_x)
204}
205
206/// Calculate normalized mutual information (0 to 1)
207pub fn normalized_mutual_information(x: &Array1<f64>, y: &Array1<f64>, bins: usize) -> Result<f64> {
208    let mi = mutual_information(x, y, bins)?;
209    let h_x = shannon_entropy(x, bins)?;
210    let h_y = shannon_entropy(y, bins)?;
211
212    if h_x == 0.0 || h_y == 0.0 {
213        return Ok(0.0);
214    }
215
216    // Clamp to [0, 1] due to numerical precision issues in discrete entropy estimation
217    let nmi = mi / ((h_x + h_y) / 2.0).sqrt();
218    Ok(nmi.min(1.0).max(0.0))
219}
220
221// ================================================================================================
222// Transfer Entropy
223// ================================================================================================
224
225/// Calculate transfer entropy from X to Y (directional information flow)
226///
227/// TE(X→Y) = I(Y_t+1; X_t | Y_t)
228pub fn transfer_entropy(x: &Array1<f64>, y: &Array1<f64>, bins: usize, lag: usize) -> Result<f64> {
229    if x.len() != y.len() {
230        return Err(SklearsError::InvalidInput(
231            "Arrays must have the same length".to_string(),
232        ));
233    }
234
235    if x.len() <= lag {
236        return Err(SklearsError::InvalidInput(
237            "Array length must be greater than lag".to_string(),
238        ));
239    }
240
241    // Extract lagged series
242    let y_future = y.slice(s![lag..]).to_owned();
243    let x_past = x.slice(s![..x.len() - lag]).to_owned();
244    let y_past = y.slice(s![..y.len() - lag]).to_owned();
245
246    // TE(X→Y) = H(Y_future, Y_past) + H(X_past, Y_past) - H(Y_future, X_past, Y_past) - H(Y_past)
247    let h_y_future_y_past = joint_entropy(&y_future, &y_past, bins)?;
248    let h_x_past_y_past = joint_entropy(&x_past, &y_past, bins)?;
249    let h_y_past = shannon_entropy(&y_past, bins)?;
250
251    // For three-way joint entropy, we need a simplified approximation
252    let h_xyz = approximate_trivariate_entropy(&y_future, &x_past, &y_past, bins)?;
253
254    Ok(h_y_future_y_past + h_x_past_y_past - h_xyz - h_y_past)
255}
256
257/// Approximate trivariate entropy (simplified)
258fn approximate_trivariate_entropy(
259    x: &Array1<f64>,
260    y: &Array1<f64>,
261    z: &Array1<f64>,
262    bins: usize,
263) -> Result<f64> {
264    if x.len() != y.len() || y.len() != z.len() {
265        return Err(SklearsError::InvalidInput(
266            "All arrays must have the same length".to_string(),
267        ));
268    }
269
270    // Discretize all three variables
271    let x_disc = discretize(x, bins)?;
272    let y_disc = discretize(y, bins)?;
273    let z_disc = discretize(z, bins)?;
274
275    let mut counts: HashMap<(usize, usize, usize), usize> = HashMap::new();
276    let n = x.len();
277
278    for i in 0..n {
279        *counts.entry((x_disc[i], y_disc[i], z_disc[i])).or_insert(0) += 1;
280    }
281
282    let mut entropy = 0.0;
283    for &count in counts.values() {
284        let p = count as f64 / n as f64;
285        if p > 0.0 {
286            entropy -= p * p.log2();
287        }
288    }
289
290    Ok(entropy)
291}
292
293// ================================================================================================
294// Complexity Measures
295// ================================================================================================
296
/// Calculate Lempel-Ziv complexity (normalized)
///
/// Binarizes the signal via `discretize_to_binary`, counts the number
/// of phrases an LZ76-style parser produces, and divides by
/// n / log₂(n), the asymptotic phrase count of a random binary string.
///
/// NOTE(review): because the normalizer is asymptotic, the result can
/// exceed 1.0 for short sequences (see the unit test). For n == 1 the
/// normalizer is 1/log₂(1) = +inf, so the function returns 0.0 rather
/// than erroring — confirm this is the intended degenerate behavior.
pub fn lempel_ziv_complexity(data: &Array1<f64>, bins: usize) -> Result<f64> {
    if data.is_empty() {
        return Err(SklearsError::InvalidInput(
            "Data array cannot be empty".to_string(),
        ));
    }

    // Convert to binary string
    let binary = discretize_to_binary(data, bins)?;
    let n = binary.len();

    // `complexity` counts completed phrases; `prefix_len` is the length
    // of the candidate phrase currently being grown from position `i`.
    let mut complexity = 1;
    let mut prefix_len = 1;
    let mut i = 0;

    while i + prefix_len <= n {
        let prefix = &binary[i..i + prefix_len];
        let mut found = false;

        // Search for prefix in previous subsequence
        // Ensure we don't underflow when computing start position
        let start_j = if prefix_len <= i + 1 {
            i + 1 - prefix_len
        } else {
            0
        };
        // Look for an earlier window (ending at or before position i)
        // equal to the candidate phrase.
        for j in start_j..=i {
            if j >= prefix_len && &binary[j - prefix_len..j] == prefix {
                found = true;
                break;
            }
        }

        if found {
            // Seen before: extend the candidate phrase by one symbol.
            prefix_len += 1;
        } else {
            // New phrase: record it and restart parsing after it.
            complexity += 1;
            i += prefix_len;
            prefix_len = 1;
        }
    }

    // Normalize by theoretical maximum
    let max_complexity = n as f64 / (n as f64).log2();
    Ok(complexity as f64 / max_complexity)
}
344
345/// Sample entropy - improved version of approximate entropy
346pub fn sample_entropy(data: &Array1<f64>, m: usize, r: f64) -> Result<f64> {
347    if data.len() < m + 1 {
348        return Err(SklearsError::InvalidInput(
349            "Data length must be greater than m".to_string(),
350        ));
351    }
352
353    let n = data.len();
354    let mut a: f64 = 0.0;
355    let mut b: f64 = 0.0;
356
357    for i in 0..n - m {
358        for j in i + 1..n - m {
359            let mut match_m = true;
360
361            for k in 0..m {
362                if (data[i + k] - data[j + k]).abs() > r {
363                    match_m = false;
364                    break;
365                }
366            }
367
368            if match_m {
369                b += 1.0;
370                if (data[i + m] - data[j + m]).abs() <= r {
371                    a += 1.0;
372                }
373            }
374        }
375    }
376
377    if b == 0.0 {
378        return Ok(0.0);
379    }
380
381    Ok(-(a / b).ln())
382}
383
384// ================================================================================================
385// Information-Based Feature Selection
386// ================================================================================================
387
/// Configuration for information-based feature selection
///
/// Exactly one of `k` or `threshold` is normally set. When both are
/// `Some`, `k` takes precedence; when both are `None`, every feature
/// is kept (see the `fit` implementation).
#[derive(Debug, Clone)]
pub struct InformationFeatureSelectorConfig {
    /// Metric to use for feature ranking
    pub metric: InformationMetric,
    /// Number of bins for discretization of continuous values
    pub bins: usize,
    /// Number of top features to select (None = use threshold)
    pub k: Option<usize>,
    /// Threshold for feature selection (None = use k); features whose
    /// score is >= threshold are kept
    pub threshold: Option<f64>,
}
400
/// Information-theoretic metric for feature selection
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum InformationMetric {
    /// Mutual information with target: I(X;Y)
    MutualInformation,
    /// Normalized mutual information, scaled into [0, 1]
    NormalizedMI,
    /// Information gain: H(Y) - H(Y|X)
    InformationGain,
    /// Symmetrical uncertainty: 2·I(X;Y) / (H(X) + H(Y))
    SymmetricalUncertainty,
}
413
414impl Default for InformationFeatureSelectorConfig {
415    fn default() -> Self {
416        Self {
417            metric: InformationMetric::MutualInformation,
418            bins: 10,
419            k: Some(10),
420            threshold: None,
421        }
422    }
423}
424
/// Information-based feature selector
///
/// Ranks features by an information-theoretic score against the
/// target and keeps the best ones according to the configuration.
pub struct InformationFeatureSelector {
    // Selection strategy and discretization settings.
    config: InformationFeatureSelectorConfig,
}
429
/// Fitted information-based feature selector
///
/// Produced by `fit`; holds the per-feature scores and the chosen
/// feature indices used by `transform`.
pub struct InformationFeatureSelectorFitted {
    // Configuration the selector was fitted with.
    config: InformationFeatureSelectorConfig,
    /// Feature scores (mutual information, information gain, etc.),
    /// one per input feature in column order
    scores: Vec<f64>,
    /// Selected feature indices, kept in ascending order
    selected_features: Vec<usize>,
}
438
439impl InformationFeatureSelector {
440    /// Create a new information-based feature selector
441    pub fn new(config: InformationFeatureSelectorConfig) -> Self {
442        Self { config }
443    }
444}
445
// Estimator plumbing: ties the selector to its config/error/float types.
impl Estimator for InformationFeatureSelector {
    type Config = InformationFeatureSelectorConfig;
    type Error = SklearsError;
    type Float = f64;

    // Borrow the selector's configuration for the framework.
    fn config(&self) -> &Self::Config {
        &self.config
    }
}
455
456impl Fit<Array2<f64>, Array1<f64>> for InformationFeatureSelector {
457    type Fitted = InformationFeatureSelectorFitted;
458
459    fn fit(self, X: &Array2<f64>, y: &Array1<f64>) -> Result<Self::Fitted> {
460        // Validate input dimensions
461        if X.nrows() != y.len() {
462            return Err(SklearsError::InvalidInput(
463                "X and y must have the same number of samples".to_string(),
464            ));
465        }
466
467        let n_features = X.ncols();
468        let mut scores = Vec::with_capacity(n_features);
469
470        // Calculate score for each feature
471        for j in 0..n_features {
472            let feature = X.column(j).to_owned();
473            let score = match self.config.metric {
474                InformationMetric::MutualInformation => {
475                    mutual_information(&feature, y, self.config.bins)?
476                }
477                InformationMetric::NormalizedMI => {
478                    normalized_mutual_information(&feature, y, self.config.bins)?
479                }
480                InformationMetric::InformationGain => {
481                    // Information gain = H(Y) - H(Y|X)
482                    let h_y = shannon_entropy(y, self.config.bins)?;
483                    let h_y_given_x = conditional_entropy(y, &feature, self.config.bins)?;
484                    h_y - h_y_given_x
485                }
486                InformationMetric::SymmetricalUncertainty => {
487                    // SU(X,Y) = 2 * I(X;Y) / (H(X) + H(Y))
488                    let mi = mutual_information(&feature, y, self.config.bins)?;
489                    let h_x = shannon_entropy(&feature, self.config.bins)?;
490                    let h_y = shannon_entropy(y, self.config.bins)?;
491                    if h_x + h_y == 0.0 {
492                        0.0
493                    } else {
494                        2.0 * mi / (h_x + h_y)
495                    }
496                }
497            };
498            scores.push(score);
499        }
500
501        // Select features based on k or threshold
502        let mut selected_features: Vec<usize> = if let Some(k) = self.config.k {
503            // Select top k features
504            let mut indices: Vec<usize> = (0..n_features).collect();
505            indices.sort_by(|&a, &b| {
506                scores[b]
507                    .partial_cmp(&scores[a])
508                    .expect("operation should succeed")
509            });
510            indices.into_iter().take(k.min(n_features)).collect()
511        } else if let Some(threshold) = self.config.threshold {
512            // Select features above threshold
513            (0..n_features)
514                .filter(|&i| scores[i] >= threshold)
515                .collect()
516        } else {
517            // Select all features
518            (0..n_features).collect()
519        };
520
521        selected_features.sort();
522
523        Ok(InformationFeatureSelectorFitted {
524            config: self.config,
525            scores,
526            selected_features,
527        })
528    }
529}
530
531impl Transform<Array2<f64>, Array2<f64>> for InformationFeatureSelectorFitted {
532    fn transform(&self, X: &Array2<f64>) -> Result<Array2<f64>> {
533        if self.selected_features.is_empty() {
534            return Err(SklearsError::InvalidInput(
535                "No features were selected".to_string(),
536            ));
537        }
538
539        let mut result = Array2::zeros((X.nrows(), self.selected_features.len()));
540
541        for (new_idx, &old_idx) in self.selected_features.iter().enumerate() {
542            if old_idx >= X.ncols() {
543                return Err(SklearsError::InvalidInput(format!(
544                    "Feature index {} out of bounds",
545                    old_idx
546                )));
547            }
548            result.column_mut(new_idx).assign(&X.column(old_idx));
549        }
550
551        Ok(result)
552    }
553}
554
555impl InformationFeatureSelectorFitted {
556    /// Get the feature scores
557    pub fn scores(&self) -> &[f64] {
558        &self.scores
559    }
560
561    /// Get the selected feature indices
562    pub fn selected_features(&self) -> &[usize] {
563        &self.selected_features
564    }
565}
566
567// ================================================================================================
568// Helper Functions
569// ================================================================================================
570
571/// Compute probability distribution from data using equal-width binning
572fn compute_probabilities(data: &Array1<f64>, bins: usize) -> Result<Vec<f64>> {
573    if bins == 0 {
574        return Err(SklearsError::InvalidInput(
575            "Number of bins must be positive".to_string(),
576        ));
577    }
578
579    let discretized = discretize(data, bins)?;
580    let mut counts = vec![0; bins];
581
582    for &bin in discretized.iter() {
583        counts[bin] += 1;
584    }
585
586    let total = data.len() as f64;
587    Ok(counts.into_iter().map(|c| c as f64 / total).collect())
588}
589
590/// Discretize continuous data into bins
591fn discretize(data: &Array1<f64>, bins: usize) -> Result<Vec<usize>> {
592    let min_val = data.iter().cloned().fold(f64::INFINITY, f64::min);
593    let max_val = data.iter().cloned().fold(f64::NEG_INFINITY, f64::max);
594
595    if (max_val - min_val).abs() < 1e-10 {
596        // All values are the same
597        return Ok(vec![0; data.len()]);
598    }
599
600    let bin_width = (max_val - min_val) / bins as f64;
601    let mut discretized = Vec::with_capacity(data.len());
602
603    for &val in data.iter() {
604        let bin = ((val - min_val) / bin_width).floor() as usize;
605        discretized.push(bin.min(bins - 1));
606    }
607
608    Ok(discretized)
609}
610
611/// Discretize to binary string
612fn discretize_to_binary(data: &Array1<f64>, bins: usize) -> Result<Vec<u8>> {
613    let discretized = discretize(data, bins)?;
614    let threshold = bins / 2;
615    Ok(discretized
616        .into_iter()
617        .map(|b| if b >= threshold { 1 } else { 0 })
618        .collect())
619}
620
621/// Compute joint probabilities for two variables
622fn compute_joint_probabilities(
623    x: &Array1<f64>,
624    y: &Array1<f64>,
625    bins: usize,
626) -> Result<HashMap<(usize, usize), f64>> {
627    let x_disc = discretize(x, bins)?;
628    let y_disc = discretize(y, bins)?;
629
630    let mut counts: HashMap<(usize, usize), usize> = HashMap::new();
631    let n = x.len();
632
633    for i in 0..n {
634        *counts.entry((x_disc[i], y_disc[i])).or_insert(0) += 1;
635    }
636
637    let mut probabilities = HashMap::new();
638    for (key, count) in counts {
639        probabilities.insert(key, count as f64 / n as f64);
640    }
641
642    Ok(probabilities)
643}
644
645// ================================================================================================
646// Tests
647// ================================================================================================
648
#[cfg(test)]
mod tests {
    use super::*;
    use approx::assert_relative_eq;
    use scirs2_core::ndarray::array;

    #[test]
    fn test_shannon_entropy_uniform() {
        // One distinct value per bin approximates a uniform distribution,
        // which maximizes entropy.
        let data = array![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
        let entropy = shannon_entropy(&data, 8).expect("operation should succeed");

        // For uniform distribution with 8 bins: H = log2(8) = 3
        assert_relative_eq!(entropy, 3.0, epsilon = 0.1);
    }

    #[test]
    fn test_shannon_entropy_deterministic() {
        // All same values should have zero entropy
        let data = array![1.0, 1.0, 1.0, 1.0, 1.0];
        let entropy = shannon_entropy(&data, 5).expect("operation should succeed");

        assert_relative_eq!(entropy, 0.0, epsilon = 1e-10);
    }

    #[test]
    fn test_mutual_information_independent() {
        // NOTE(review): despite the test name, x and y are deterministically
        // related (y = 6 - x), so their MI actually equals H(X). The
        // assertion below only checks the non-negativity of the estimate.
        let x = array![1.0, 2.0, 3.0, 4.0, 5.0];
        let y = array![5.0, 4.0, 3.0, 2.0, 1.0];

        let mi = mutual_information(&x, &y, 5).expect("operation should succeed");

        // Mutual information estimates must never be negative.
        assert!(mi >= 0.0);
    }

    #[test]
    fn test_mutual_information_identical() {
        // Identical variables should have MI = H(X)
        let x = array![1.0, 2.0, 3.0, 4.0, 5.0];
        let mi = mutual_information(&x, &x, 5).expect("operation should succeed");
        let h_x = shannon_entropy(&x, 5).expect("operation should succeed");

        assert_relative_eq!(mi, h_x, epsilon = 1e-10);
    }

    #[test]
    fn test_renyi_entropy() {
        let data = array![1.0, 2.0, 3.0, 4.0, 5.0];

        // Sanity check: Renyi entropy at orders below and above 1, and the
        // Shannon entropy itself, are all positive for this spread-out data.
        // (Convergence as alpha → 1 is handled by the explicit limit case
        // inside renyi_entropy and is not asserted here.)
        let renyi_05 = renyi_entropy(&data, 5, 0.5).expect("operation should succeed");
        let renyi_20 = renyi_entropy(&data, 5, 2.0).expect("operation should succeed");
        let shannon = shannon_entropy(&data, 5).expect("operation should succeed");

        assert!(renyi_05 > 0.0);
        assert!(renyi_20 > 0.0);
        assert!(shannon > 0.0);
    }

    #[test]
    fn test_permutation_entropy() {
        // Rising-then-falling ramp: multiple ordinal patterns appear.
        let data = array![1.0, 2.0, 3.0, 4.0, 5.0, 4.0, 3.0, 2.0, 1.0];
        let pe = permutation_entropy(&data, 3, 1).expect("operation should succeed");

        assert!(pe > 0.0);
        assert!(pe <= 6.0f64.log2()); // Maximum for order 3 (3! = 6 patterns)
    }

    #[test]
    fn test_approximate_entropy() {
        // Strictly periodic signal; ApEn should be small but non-negative.
        let data = array![1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0];
        let apen = approximate_entropy(&data, 2, 0.5).expect("operation should succeed");

        assert!(apen >= 0.0);
    }

    #[test]
    fn test_lempel_ziv_complexity() {
        let data = array![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
        let lz = lempel_ziv_complexity(&data, 4).expect("operation should succeed");

        println!("LZ complexity: {}", lz);
        assert!(lz > 0.0);
        // LZ complexity can exceed 1.0 with certain normalizations
        // The normalization is an approximation based on n/log2(n)
        assert!(
            lz > 0.0 && lz < 100.0,
            "LZ complexity should be reasonable, got {}",
            lz
        );
    }

    #[test]
    fn test_sample_entropy() {
        // Strictly periodic signal; SampEn should be non-negative.
        let data = array![1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0];
        let sampen = sample_entropy(&data, 2, 0.5).expect("sampling should succeed");

        assert!(sampen >= 0.0);
    }

    #[test]
    fn test_information_feature_selector() {
        // Three perfectly collinear features; top-2 selection by MI.
        let X = array![
            [1.0, 10.0, 100.0],
            [2.0, 20.0, 200.0],
            [3.0, 30.0, 300.0],
            [4.0, 40.0, 400.0],
            [5.0, 50.0, 500.0]
        ];
        let y = array![1.0, 2.0, 3.0, 4.0, 5.0];

        let config = InformationFeatureSelectorConfig {
            metric: InformationMetric::MutualInformation,
            bins: 5,
            k: Some(2),
            threshold: None,
        };

        let selector = InformationFeatureSelector::new(config);
        let fitted = selector.fit(&X, &y).expect("model fitting should succeed");

        assert_eq!(fitted.selected_features().len(), 2);
        assert_eq!(fitted.scores().len(), 3);

        let X_transformed = fitted.transform(&X).expect("transformation should succeed");
        assert_eq!(X_transformed.ncols(), 2);
    }

    #[test]
    fn test_normalized_mutual_information() {
        // y is a deterministic function of x, so NMI should be high;
        // the assertions only pin the [0, 1] range guarantee.
        let x = array![1.0, 2.0, 3.0, 4.0, 5.0];
        let y = array![2.0, 4.0, 6.0, 8.0, 10.0];

        let nmi = normalized_mutual_information(&x, &y, 5).expect("operation should succeed");

        assert!(nmi >= 0.0);
        assert!(nmi <= 1.0);
    }

    #[test]
    fn test_conditional_entropy() {
        let x = array![1.0, 2.0, 3.0, 4.0, 5.0];
        let y = array![2.0, 4.0, 6.0, 8.0, 10.0];

        // Conditioning can never increase entropy: 0 <= H(Y|X) <= H(Y).
        let h_y_given_x = conditional_entropy(&y, &x, 5).expect("operation should succeed");
        let h_y = shannon_entropy(&y, 5).expect("operation should succeed");

        assert!(h_y_given_x >= 0.0);
        assert!(h_y_given_x <= h_y);
    }

    #[test]
    fn test_transfer_entropy() {
        // Simple shifted ramps; only checks the estimator produces a
        // finite value (TE estimates on tiny samples are noisy).
        let x = array![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
        let y = array![2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0];

        let te = transfer_entropy(&x, &y, 4, 1).expect("operation should succeed");

        assert!(te.is_finite());
    }

    #[test]
    fn test_discretize() {
        let data = array![1.0, 2.0, 3.0, 4.0, 5.0];
        let discretized = discretize(&data, 5).expect("operation should succeed");

        // One index per sample, each within [0, bins).
        assert_eq!(discretized.len(), 5);
        assert!(discretized.iter().all(|&b| b < 5));
    }

    #[test]
    fn test_feature_selector_threshold() {
        // Threshold-based selection: number kept is data-dependent but
        // can never exceed the number of input features.
        let X = array![
            [1.0, 10.0, 100.0],
            [2.0, 20.0, 200.0],
            [3.0, 30.0, 300.0],
            [4.0, 40.0, 400.0]
        ];
        let y = array![1.0, 2.0, 3.0, 4.0];

        let config = InformationFeatureSelectorConfig {
            metric: InformationMetric::MutualInformation,
            bins: 4,
            k: None,
            threshold: Some(0.5),
        };

        let selector = InformationFeatureSelector::new(config);
        let fitted = selector.fit(&X, &y).expect("model fitting should succeed");

        assert!(fitted.selected_features().len() <= 3);
    }
}