use scirs2_core::ndarray::{s, Array1, Array2};
use sklears_core::prelude::*;
use std::collections::HashMap;

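/// Computes the Shannon entropy H(X) = -sum_i p_i * log2(p_i) of `data`,
/// estimated from an equal-width histogram with `bins` bins. Result is in bits.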
pub fn shannon_entropy(data: &Array1<f64>, bins: usize) -> Result<f64> {
    if data.is_empty() {
        return Err(SklearsError::InvalidInput(
            "Data array cannot be empty".to_string(),
        ));
    }

    let probabilities = compute_probabilities(data, bins)?;
    let mut entropy = 0.0;

    for &p in probabilities.iter() {
        if p > 0.0 {
            entropy -= p * p.log2();
        }
    }

    Ok(entropy)
}

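/// Computes the Rényi entropy of order `alpha`,
/// H_alpha(X) = log2(sum_i p_i^alpha) / (1 - alpha), estimated from a
/// `bins`-bin histogram. As alpha approaches 1 this converges to the Shannon
/// entropy, which is returned directly when `alpha` is within 1e-10 of 1.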
pub fn renyi_entropy(data: &Array1<f64>, bins: usize, alpha: f64) -> Result<f64> {
    if alpha <= 0.0 {
        return Err(SklearsError::InvalidInput(
            "Alpha must be positive".to_string(),
        ));
    }

    if (alpha - 1.0).abs() < 1e-10 {
        return shannon_entropy(data, bins);
    }

    let probabilities = compute_probabilities(data, bins)?;
    let sum: f64 = probabilities.iter().map(|&p| p.powf(alpha)).sum();

    if sum <= 0.0 {
        return Ok(0.0);
    }

    Ok((1.0 / (1.0 - alpha)) * sum.log2())
}

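/// Computes the permutation entropy (Bandt-Pompe) of a time series: the
/// Shannon entropy of the distribution of ordinal patterns over embedding
/// windows of length `order`, with `delay` samples between window elements.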
pub fn permutation_entropy(data: &Array1<f64>, order: usize, delay: usize) -> Result<f64> {
    if order == 0 || data.len() < order {
        return Err(SklearsError::InvalidInput(
            "Order must be positive and data length must be at least equal to order".to_string(),
        ));
    }

    // Each embedding window spans (order - 1) * delay + 1 samples; check this
    // explicitly so the subtraction below cannot underflow.
    let span = (order - 1) * delay;
    if data.len() <= span {
        return Err(SklearsError::InvalidInput(
            "Insufficient data for given order and delay".to_string(),
        ));
    }
    let n = data.len() - span;

    let mut pattern_counts: HashMap<Vec<usize>, usize> = HashMap::new();

    for i in 0..n {
        // Pair each value with its position inside the window.
        let mut pattern = Vec::with_capacity(order);
        for j in 0..order {
            pattern.push((j, data[i + j * delay]));
        }

        // Sorting by value turns the sequence of window positions into the
        // ordinal (rank) pattern of this window.
        pattern.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());

        let ordinal_pattern: Vec<usize> = pattern.iter().map(|&(pos, _)| pos).collect();

        *pattern_counts.entry(ordinal_pattern).or_insert(0) += 1;
    }

    let mut entropy = 0.0;
    for &count in pattern_counts.values() {
        let p = count as f64 / n as f64;
        if p > 0.0 {
            entropy -= p * p.log2();
        }
    }

    Ok(entropy)
}

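/// Computes the approximate entropy ApEn(m, r) of a time series as
/// Phi_m(r) - Phi_{m+1}(r), where Phi is the average log-likelihood that
/// templates of the given length stay within Chebyshev distance `r` of each
/// other (self-matches included).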
pub fn approximate_entropy(data: &Array1<f64>, m: usize, r: f64) -> Result<f64> {
    if data.len() < m + 1 {
        return Err(SklearsError::InvalidInput(
            "Data length must be greater than m".to_string(),
        ));
    }

    let n = data.len();
    let phi_m = phi_function(data, m, r, n)?;
    let phi_m1 = phi_function(data, m + 1, r, n)?;

    Ok(phi_m - phi_m1)
}

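/// Helper for `approximate_entropy`: the mean natural log of the fraction of
/// length-`m` templates lying within Chebyshev distance `r` of each template.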
fn phi_function(data: &Array1<f64>, m: usize, r: f64, n: usize) -> Result<f64> {
    let mut patterns = Vec::new();

    for i in 0..=(n - m) {
        let pattern: Vec<f64> = (0..m).map(|j| data[i + j]).collect();
        patterns.push(pattern);
    }

    let mut phi = 0.0;

    for i in 0..patterns.len() {
        let mut count = 0;
        for j in 0..patterns.len() {
            let max_diff = patterns[i]
                .iter()
                .zip(patterns[j].iter())
                .map(|(a, b)| (a - b).abs())
                .fold(0.0, f64::max);

            if max_diff <= r {
                count += 1;
            }
        }

        let c_i = count as f64 / patterns.len() as f64;
        if c_i > 0.0 {
            phi += c_i.ln();
        }
    }

    Ok(phi / patterns.len() as f64)
}

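/// Estimates the mutual information I(X; Y) = H(X) + H(Y) - H(X, Y) in bits,
/// using `bins`-bin histograms for all entropy terms.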
pub fn mutual_information(x: &Array1<f64>, y: &Array1<f64>, bins: usize) -> Result<f64> {
    if x.len() != y.len() {
        return Err(SklearsError::InvalidInput(
            "Arrays must have the same length".to_string(),
        ));
    }

    let h_x = shannon_entropy(x, bins)?;
    let h_y = shannon_entropy(y, bins)?;
    let h_xy = joint_entropy(x, y, bins)?;

    Ok(h_x + h_y - h_xy)
}

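/// Estimates the joint entropy H(X, Y) in bits from a two-dimensional
/// histogram with `bins` equal-width bins per variable.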
pub fn joint_entropy(x: &Array1<f64>, y: &Array1<f64>, bins: usize) -> Result<f64> {
    if x.len() != y.len() {
        return Err(SklearsError::InvalidInput(
            "Arrays must have the same length".to_string(),
        ));
    }

    let joint_probs = compute_joint_probabilities(x, y, bins)?;
    let mut entropy = 0.0;

    for &p in joint_probs.values() {
        if p > 0.0 {
            entropy -= p * p.log2();
        }
    }

    Ok(entropy)
}

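/// Estimates the conditional entropy H(Y | X) = H(X, Y) - H(X) in bits.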
pub fn conditional_entropy(y: &Array1<f64>, x: &Array1<f64>, bins: usize) -> Result<f64> {
    let h_xy = joint_entropy(x, y, bins)?;
    let h_x = shannon_entropy(x, bins)?;
    Ok(h_xy - h_x)
}

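/// Mutual information normalized by the arithmetic mean of the marginal
/// entropies and clamped to [0, 1]. Returns 0 when either marginal entropy is
/// zero.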
pub fn normalized_mutual_information(x: &Array1<f64>, y: &Array1<f64>, bins: usize) -> Result<f64> {
    let mi = mutual_information(x, y, bins)?;
    let h_x = shannon_entropy(x, bins)?;
    let h_y = shannon_entropy(y, bins)?;

    if h_x == 0.0 || h_y == 0.0 {
        return Ok(0.0);
    }

    // Normalize by the arithmetic mean of the marginal entropies so that
    // identical variables yield an NMI of 1.
    let nmi = mi / ((h_x + h_y) / 2.0);
    Ok(nmi.min(1.0).max(0.0))
}

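/// Binned approximation of the transfer entropy from `x` to `y` at the given
/// `lag`, using the decomposition
/// TE(X -> Y) = H(Y_t, Y_{t-lag}) + H(X_{t-lag}, Y_{t-lag})
///              - H(Y_t, X_{t-lag}, Y_{t-lag}) - H(Y_{t-lag}).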
pub fn transfer_entropy(x: &Array1<f64>, y: &Array1<f64>, bins: usize, lag: usize) -> Result<f64> {
    if x.len() != y.len() {
        return Err(SklearsError::InvalidInput(
            "Arrays must have the same length".to_string(),
        ));
    }

    if x.len() <= lag {
        return Err(SklearsError::InvalidInput(
            "Array length must be greater than lag".to_string(),
        ));
    }

    let y_future = y.slice(s![lag..]).to_owned();
    let x_past = x.slice(s![..x.len() - lag]).to_owned();
    let y_past = y.slice(s![..y.len() - lag]).to_owned();

    let h_y_future_y_past = joint_entropy(&y_future, &y_past, bins)?;
    let h_x_past_y_past = joint_entropy(&x_past, &y_past, bins)?;
    let h_y_past = shannon_entropy(&y_past, bins)?;

    let h_xyz = approximate_trivariate_entropy(&y_future, &x_past, &y_past, bins)?;

    Ok(h_y_future_y_past + h_x_past_y_past - h_xyz - h_y_past)
}

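/// Joint entropy of three variables after equal-width discretization; used by
/// `transfer_entropy`.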
fn approximate_trivariate_entropy(
    x: &Array1<f64>,
    y: &Array1<f64>,
    z: &Array1<f64>,
    bins: usize,
) -> Result<f64> {
    if x.len() != y.len() || y.len() != z.len() {
        return Err(SklearsError::InvalidInput(
            "All arrays must have the same length".to_string(),
        ));
    }

    let x_disc = discretize(x, bins)?;
    let y_disc = discretize(y, bins)?;
    let z_disc = discretize(z, bins)?;

    let mut counts: HashMap<(usize, usize, usize), usize> = HashMap::new();
    let n = x.len();

    for i in 0..n {
        *counts.entry((x_disc[i], y_disc[i], z_disc[i])).or_insert(0) += 1;
    }

    let mut entropy = 0.0;
    for &count in counts.values() {
        let p = count as f64 / n as f64;
        if p > 0.0 {
            entropy -= p * p.log2();
        }
    }

    Ok(entropy)
}

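/// Lempel-Ziv complexity of `data` after binarizing it through a `bins`-bin
/// discretization, normalized by n / log2(n) so that sequences of different
/// lengths remain comparable.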
pub fn lempel_ziv_complexity(data: &Array1<f64>, bins: usize) -> Result<f64> {
    if data.is_empty() {
        return Err(SklearsError::InvalidInput(
            "Data array cannot be empty".to_string(),
        ));
    }

    let binary = discretize_to_binary(data, bins)?;
    let n = binary.len();

    let mut complexity = 1;
    let mut prefix_len = 1;
    let mut i = 0;

    while i + prefix_len <= n {
        let prefix = &binary[i..i + prefix_len];
        let mut found = false;

        // Search earlier windows for a copy of the current phrase.
        let start_j = if prefix_len <= i + 1 {
            i + 1 - prefix_len
        } else {
            0
        };
        for j in start_j..=i {
            if j >= prefix_len && &binary[j - prefix_len..j] == prefix {
                found = true;
                break;
            }
        }

        if found {
            // The phrase has been seen before: extend it by one symbol.
            prefix_len += 1;
        } else {
            // New phrase: count it and start the next phrase after it.
            complexity += 1;
            i += prefix_len;
            prefix_len = 1;
        }
    }

    // Normalize by the asymptotic bound n / log2(n) for binary sequences.
    let max_complexity = n as f64 / (n as f64).log2();
    Ok(complexity as f64 / max_complexity)
}

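/// Computes the sample entropy SampEn(m, r) = -ln(A / B), where B counts pairs
/// of length-`m` templates within Chebyshev distance `r` and A counts those
/// pairs that still match when extended to length `m + 1` (self-matches
/// excluded). Returns 0 when no template pairs match.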
pub fn sample_entropy(data: &Array1<f64>, m: usize, r: f64) -> Result<f64> {
    if data.len() < m + 1 {
        return Err(SklearsError::InvalidInput(
            "Data length must be greater than m".to_string(),
        ));
    }

    let n = data.len();
    let mut a: f64 = 0.0;
    let mut b: f64 = 0.0;

    for i in 0..n - m {
        for j in i + 1..n - m {
            let mut match_m = true;

            for k in 0..m {
                if (data[i + k] - data[j + k]).abs() > r {
                    match_m = false;
                    break;
                }
            }

            if match_m {
                b += 1.0;
                if (data[i + m] - data[j + m]).abs() <= r {
                    a += 1.0;
                }
            }
        }
    }

    if b == 0.0 {
        return Ok(0.0);
    }

    Ok(-(a / b).ln())
}

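/// Configuration for [`InformationFeatureSelector`].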
#[derive(Debug, Clone)]
pub struct InformationFeatureSelectorConfig {
    /// Information-theoretic metric used to score each feature.
    pub metric: InformationMetric,
    /// Number of histogram bins used for all entropy estimates.
    pub bins: usize,
    /// If set, keep the `k` highest-scoring features.
    pub k: Option<usize>,
    /// If set (and `k` is not), keep features whose score is at least this value.
    pub threshold: Option<f64>,
}

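/// Scoring metric used to rank features against the target.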
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum InformationMetric {
    /// Mutual information I(X; y).
    MutualInformation,
    /// Mutual information normalized by the mean of the marginal entropies.
    NormalizedMI,
    /// Information gain H(y) - H(y | X).
    InformationGain,
    /// Symmetrical uncertainty 2 * I(X; y) / (H(X) + H(y)).
    SymmetricalUncertainty,
}

impl Default for InformationFeatureSelectorConfig {
    fn default() -> Self {
        Self {
            metric: InformationMetric::MutualInformation,
            bins: 10,
            k: Some(10),
            threshold: None,
        }
    }
}

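/// Selects features by scoring each column of `X` against `y` with the
/// configured [`InformationMetric`], then keeping either the top-`k` columns
/// or those at or above a score threshold.
///
/// A minimal usage sketch follows; it is marked `ignore` because the exact
/// import paths depend on how this module is re-exported by the crate:
///
/// ```ignore
/// let config = InformationFeatureSelectorConfig {
///     metric: InformationMetric::MutualInformation,
///     bins: 5,
///     k: Some(2),
///     threshold: None,
/// };
/// let fitted = InformationFeatureSelector::new(config).fit(&x, &y)?;
/// let x_selected = fitted.transform(&x)?;
/// ```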
pub struct InformationFeatureSelector {
    config: InformationFeatureSelectorConfig,
}

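/// Fitted state of [`InformationFeatureSelector`]: the per-feature scores and
/// the indices of the selected columns.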
pub struct InformationFeatureSelectorFitted {
    config: InformationFeatureSelectorConfig,
    scores: Vec<f64>,
    selected_features: Vec<usize>,
}

impl InformationFeatureSelector {
    pub fn new(config: InformationFeatureSelectorConfig) -> Self {
        Self { config }
    }
}

impl Estimator for InformationFeatureSelector {
    type Config = InformationFeatureSelectorConfig;
    type Error = SklearsError;
    type Float = f64;

    fn config(&self) -> &Self::Config {
        &self.config
    }
}

impl Fit<Array2<f64>, Array1<f64>> for InformationFeatureSelector {
    type Fitted = InformationFeatureSelectorFitted;

    fn fit(self, X: &Array2<f64>, y: &Array1<f64>) -> Result<Self::Fitted> {
        if X.nrows() != y.len() {
            return Err(SklearsError::InvalidInput(
                "X and y must have the same number of samples".to_string(),
            ));
        }

        let n_features = X.ncols();
        let mut scores = Vec::with_capacity(n_features);

        // Score every feature column against the target with the configured metric.
        for j in 0..n_features {
            let feature = X.column(j).to_owned();
            let score = match self.config.metric {
                InformationMetric::MutualInformation => {
                    mutual_information(&feature, y, self.config.bins)?
                }
                InformationMetric::NormalizedMI => {
                    normalized_mutual_information(&feature, y, self.config.bins)?
                }
                InformationMetric::InformationGain => {
                    let h_y = shannon_entropy(y, self.config.bins)?;
                    let h_y_given_x = conditional_entropy(y, &feature, self.config.bins)?;
                    h_y - h_y_given_x
                }
                InformationMetric::SymmetricalUncertainty => {
                    let mi = mutual_information(&feature, y, self.config.bins)?;
                    let h_x = shannon_entropy(&feature, self.config.bins)?;
                    let h_y = shannon_entropy(y, self.config.bins)?;
                    if h_x + h_y == 0.0 {
                        0.0
                    } else {
                        2.0 * mi / (h_x + h_y)
                    }
                }
            };
            scores.push(score);
        }

        // Keep the top-k features if `k` is set, otherwise apply the score
        // threshold if one is set, otherwise keep every feature.
        let mut selected_features: Vec<usize> = if let Some(k) = self.config.k {
            let mut indices: Vec<usize> = (0..n_features).collect();
            indices.sort_by(|&a, &b| scores[b].partial_cmp(&scores[a]).unwrap());
            indices.into_iter().take(k.min(n_features)).collect()
        } else if let Some(threshold) = self.config.threshold {
            (0..n_features)
                .filter(|&i| scores[i] >= threshold)
                .collect()
        } else {
            (0..n_features).collect()
        };

        selected_features.sort();

        Ok(InformationFeatureSelectorFitted {
            config: self.config,
            scores,
            selected_features,
        })
    }
}

impl Transform<Array2<f64>, Array2<f64>> for InformationFeatureSelectorFitted {
    fn transform(&self, X: &Array2<f64>) -> Result<Array2<f64>> {
        if self.selected_features.is_empty() {
            return Err(SklearsError::InvalidInput(
                "No features were selected".to_string(),
            ));
        }

        let mut result = Array2::zeros((X.nrows(), self.selected_features.len()));

        for (new_idx, &old_idx) in self.selected_features.iter().enumerate() {
            if old_idx >= X.ncols() {
                return Err(SklearsError::InvalidInput(format!(
                    "Feature index {} out of bounds",
                    old_idx
                )));
            }
            result.column_mut(new_idx).assign(&X.column(old_idx));
        }

        Ok(result)
    }
}

impl InformationFeatureSelectorFitted {
    /// Per-feature scores in the original column order.
    pub fn scores(&self) -> &[f64] {
        &self.scores
    }

    /// Indices of the selected feature columns, in ascending order.
    pub fn selected_features(&self) -> &[usize] {
        &self.selected_features
    }
}

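/// Histogram-based probability estimates: discretizes `data` into `bins`
/// equal-width bins and returns the relative frequency of each bin.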
fn compute_probabilities(data: &Array1<f64>, bins: usize) -> Result<Vec<f64>> {
    if bins == 0 {
        return Err(SklearsError::InvalidInput(
            "Number of bins must be positive".to_string(),
        ));
    }

    let discretized = discretize(data, bins)?;
    let mut counts = vec![0; bins];

    for &bin in discretized.iter() {
        counts[bin] += 1;
    }

    let total = data.len() as f64;
    Ok(counts.into_iter().map(|c| c as f64 / total).collect())
}

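/// Maps each value to an equal-width bin index in `0..bins`; (near-)constant
/// data collapses to bin 0.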
fn discretize(data: &Array1<f64>, bins: usize) -> Result<Vec<usize>> {
    let min_val = data.iter().cloned().fold(f64::INFINITY, f64::min);
    let max_val = data.iter().cloned().fold(f64::NEG_INFINITY, f64::max);

    if (max_val - min_val).abs() < 1e-10 {
        return Ok(vec![0; data.len()]);
    }

    let bin_width = (max_val - min_val) / bins as f64;
    let mut discretized = Vec::with_capacity(data.len());

    for &val in data.iter() {
        let bin = ((val - min_val) / bin_width).floor() as usize;
        discretized.push(bin.min(bins - 1));
    }

    Ok(discretized)
}

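/// Binarizes `data` by discretizing it into `bins` bins and thresholding at
/// the middle bin.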
fn discretize_to_binary(data: &Array1<f64>, bins: usize) -> Result<Vec<u8>> {
    let discretized = discretize(data, bins)?;
    let threshold = bins / 2;
    Ok(discretized
        .into_iter()
        .map(|b| if b >= threshold { 1 } else { 0 })
        .collect())
}

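/// Joint histogram probabilities of `(x, y)` pairs after equal-width
/// discretization of each variable.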
fn compute_joint_probabilities(
    x: &Array1<f64>,
    y: &Array1<f64>,
    bins: usize,
) -> Result<HashMap<(usize, usize), f64>> {
    let x_disc = discretize(x, bins)?;
    let y_disc = discretize(y, bins)?;

    let mut counts: HashMap<(usize, usize), usize> = HashMap::new();
    let n = x.len();

    for i in 0..n {
        *counts.entry((x_disc[i], y_disc[i])).or_insert(0) += 1;
    }

    let mut probabilities = HashMap::new();
    for (key, count) in counts {
        probabilities.insert(key, count as f64 / n as f64);
    }

    Ok(probabilities)
}

#[cfg(test)]
mod tests {
    use super::*;
    use approx::assert_relative_eq;
    use scirs2_core::ndarray::array;

    #[test]
    fn test_shannon_entropy_uniform() {
        let data = array![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
        let entropy = shannon_entropy(&data, 8).unwrap();

        assert_relative_eq!(entropy, 3.0, epsilon = 0.1);
    }

    #[test]
    fn test_shannon_entropy_deterministic() {
        let data = array![1.0, 1.0, 1.0, 1.0, 1.0];
        let entropy = shannon_entropy(&data, 5).unwrap();

        assert_relative_eq!(entropy, 0.0, epsilon = 1e-10);
    }

    #[test]
    fn test_mutual_information_nonnegative() {
        let x = array![1.0, 2.0, 3.0, 4.0, 5.0];
        let y = array![5.0, 4.0, 3.0, 2.0, 1.0];

        let mi = mutual_information(&x, &y, 5).unwrap();

        assert!(mi >= 0.0);
    }

    #[test]
    fn test_mutual_information_identical() {
        let x = array![1.0, 2.0, 3.0, 4.0, 5.0];
        let mi = mutual_information(&x, &x, 5).unwrap();
        let h_x = shannon_entropy(&x, 5).unwrap();

        assert_relative_eq!(mi, h_x, epsilon = 1e-10);
    }

    #[test]
    fn test_renyi_entropy() {
        let data = array![1.0, 2.0, 3.0, 4.0, 5.0];

        let renyi_05 = renyi_entropy(&data, 5, 0.5).unwrap();
        let renyi_20 = renyi_entropy(&data, 5, 2.0).unwrap();
        let shannon = shannon_entropy(&data, 5).unwrap();

        assert!(renyi_05 > 0.0);
        assert!(renyi_20 > 0.0);
        assert!(shannon > 0.0);
    }

    #[test]
    fn test_permutation_entropy() {
        let data = array![1.0, 2.0, 3.0, 4.0, 5.0, 4.0, 3.0, 2.0, 1.0];
        let pe = permutation_entropy(&data, 3, 1).unwrap();

        assert!(pe > 0.0);
        // Upper bound is log2(3!) = log2(6) for order-3 ordinal patterns.
        assert!(pe <= 6.0f64.log2());
    }

    #[test]
    fn test_approximate_entropy() {
        let data = array![1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0];
        let apen = approximate_entropy(&data, 2, 0.5).unwrap();

        assert!(apen >= 0.0);
    }

    #[test]
    fn test_lempel_ziv_complexity() {
        let data = array![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
        let lz = lempel_ziv_complexity(&data, 4).unwrap();

        assert!(
            lz > 0.0 && lz < 100.0,
            "LZ complexity should be reasonable, got {}",
            lz
        );
    }

    #[test]
    fn test_sample_entropy() {
        let data = array![1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0];
        let sampen = sample_entropy(&data, 2, 0.5).unwrap();

        assert!(sampen >= 0.0);
    }

    #[test]
    fn test_information_feature_selector() {
        let X = array![
            [1.0, 10.0, 100.0],
            [2.0, 20.0, 200.0],
            [3.0, 30.0, 300.0],
            [4.0, 40.0, 400.0],
            [5.0, 50.0, 500.0]
        ];
        let y = array![1.0, 2.0, 3.0, 4.0, 5.0];

        let config = InformationFeatureSelectorConfig {
            metric: InformationMetric::MutualInformation,
            bins: 5,
            k: Some(2),
            threshold: None,
        };

        let selector = InformationFeatureSelector::new(config);
        let fitted = selector.fit(&X, &y).unwrap();

        assert_eq!(fitted.selected_features().len(), 2);
        assert_eq!(fitted.scores().len(), 3);

        let X_transformed = fitted.transform(&X).unwrap();
        assert_eq!(X_transformed.ncols(), 2);
    }

    #[test]
    fn test_normalized_mutual_information() {
        let x = array![1.0, 2.0, 3.0, 4.0, 5.0];
        let y = array![2.0, 4.0, 6.0, 8.0, 10.0];

        let nmi = normalized_mutual_information(&x, &y, 5).unwrap();

        assert!(nmi >= 0.0);
        assert!(nmi <= 1.0);
    }

    #[test]
    fn test_conditional_entropy() {
        let x = array![1.0, 2.0, 3.0, 4.0, 5.0];
        let y = array![2.0, 4.0, 6.0, 8.0, 10.0];

        let h_y_given_x = conditional_entropy(&y, &x, 5).unwrap();
        let h_y = shannon_entropy(&y, 5).unwrap();

        assert!(h_y_given_x >= 0.0);
        assert!(h_y_given_x <= h_y);
    }

    #[test]
    fn test_transfer_entropy() {
        let x = array![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
        let y = array![2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0];

        let te = transfer_entropy(&x, &y, 4, 1).unwrap();

        assert!(te.is_finite());
    }

    #[test]
    fn test_discretize() {
        let data = array![1.0, 2.0, 3.0, 4.0, 5.0];
        let discretized = discretize(&data, 5).unwrap();

        assert_eq!(discretized.len(), 5);
        assert!(discretized.iter().all(|&b| b < 5));
    }

    #[test]
    fn test_feature_selector_threshold() {
        let X = array![
            [1.0, 10.0, 100.0],
            [2.0, 20.0, 200.0],
            [3.0, 30.0, 300.0],
            [4.0, 40.0, 400.0]
        ];
        let y = array![1.0, 2.0, 3.0, 4.0];

        let config = InformationFeatureSelectorConfig {
            metric: InformationMetric::MutualInformation,
            bins: 4,
            k: None,
            threshold: Some(0.5),
        };

        let selector = InformationFeatureSelector::new(config);
        let fitted = selector.fit(&X, &y).unwrap();

        assert!(fitted.selected_features().len() <= 3);
    }
}