use scirs2_core::ndarray::{s, Array1, Array2};
use sklears_core::prelude::*;
use std::collections::HashMap;

/// Computes the Shannon entropy (in bits) of `data` after discretizing it
/// into `bins` equal-width bins.
pub fn shannon_entropy(data: &Array1<f64>, bins: usize) -> Result<f64> {
    if data.is_empty() {
        return Err(SklearsError::InvalidInput(
            "Data array cannot be empty".to_string(),
        ));
    }

    let probabilities = compute_probabilities(data, bins)?;
    let mut entropy = 0.0;

    for &p in probabilities.iter() {
        if p > 0.0 {
            entropy -= p * p.log2();
        }
    }

    Ok(entropy)
}
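
// A minimal usage sketch (illustrative, not part of the original API): eight
// distinct values spread over eight bins give a near-uniform histogram, so
// the estimate approaches log2(8) = 3 bits.
//
//     let data = Array1::from(vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);
//     let h = shannon_entropy(&data, 8)?;
//     assert!((h - 3.0).abs() < 0.1);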

/// Computes the Rényi entropy of order `alpha` (in bits). Values of `alpha`
/// close to 1 fall back to the Shannon entropy, which is the limiting case
/// of the Rényi family.
pub fn renyi_entropy(data: &Array1<f64>, bins: usize, alpha: f64) -> Result<f64> {
    if alpha <= 0.0 {
        return Err(SklearsError::InvalidInput(
            "Alpha must be positive".to_string(),
        ));
    }

    if (alpha - 1.0).abs() < 1e-10 {
        return shannon_entropy(data, bins);
    }

    let probabilities = compute_probabilities(data, bins)?;
    let sum: f64 = probabilities.iter().map(|&p| p.powf(alpha)).sum();

    if sum <= 0.0 {
        return Ok(0.0);
    }

    Ok((1.0 / (1.0 - alpha)) * sum.log2())
}
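
// A minimal usage sketch (illustrative): Rényi entropy is non-increasing in
// `alpha`, so the collision entropy (alpha = 2) should not exceed the
// Shannon estimate on the same binning.
//
//     let data = Array1::from(vec![1.0, 2.0, 3.0, 4.0, 5.0]);
//     let h2 = renyi_entropy(&data, 5, 2.0)?;
//     let h1 = shannon_entropy(&data, 5)?;
//     assert!(h2 <= h1 + 1e-10);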

/// Computes the permutation entropy (in bits) of the series using ordinal
/// patterns of length `order` sampled with the given embedding `delay`.
pub fn permutation_entropy(data: &Array1<f64>, order: usize, delay: usize) -> Result<f64> {
    if order < 2 || delay == 0 {
        return Err(SklearsError::InvalidInput(
            "Order must be at least 2 and delay must be positive".to_string(),
        ));
    }

    if data.len() < (order - 1) * delay + 1 {
        return Err(SklearsError::InvalidInput(
            "Insufficient data for given order and delay".to_string(),
        ));
    }

    let n = data.len() - (order - 1) * delay;
    let mut pattern_counts: HashMap<Vec<usize>, usize> = HashMap::new();

    for i in 0..n {
        // Collect the window as (position within the window, value) pairs.
        let mut pattern = Vec::with_capacity(order);
        for j in 0..order {
            pattern.push((j, data[i + j * delay]));
        }

        // Sorting by value turns the window into its ordinal pattern: the
        // permutation of window positions in ascending value order.
        pattern.sort_by(|a, b| a.1.partial_cmp(&b.1).expect("values should be comparable"));

        let ordinal_pattern: Vec<usize> = pattern.iter().map(|&(j, _)| j).collect();

        *pattern_counts.entry(ordinal_pattern).or_insert(0) += 1;
    }

    let mut entropy = 0.0;
    for &count in pattern_counts.values() {
        let p = count as f64 / n as f64;
        if p > 0.0 {
            entropy -= p * p.log2();
        }
    }

    Ok(entropy)
}
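
// A minimal usage sketch (illustrative): a strictly monotone series yields a
// single ordinal pattern, so its permutation entropy is 0; the theoretical
// maximum for order m is log2(m!).
//
//     let ramp = Array1::from(vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]);
//     let pe = permutation_entropy(&ramp, 3, 1)?;
//     assert!(pe.abs() < 1e-10);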

/// Computes the approximate entropy (ApEn) of the series with embedding
/// dimension `m` and tolerance `r`.
pub fn approximate_entropy(data: &Array1<f64>, m: usize, r: f64) -> Result<f64> {
    if data.len() < m + 1 {
        return Err(SklearsError::InvalidInput(
            "Data length must be greater than m".to_string(),
        ));
    }

    let n = data.len();
    let phi_m = phi_function(data, m, r, n)?;
    let phi_m1 = phi_function(data, m + 1, r, n)?;

    Ok(phi_m - phi_m1)
}
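
// A minimal usage sketch (illustrative): a strictly alternating series is
// highly predictable, so its approximate entropy stays close to zero.
//
//     let regular = Array1::from(vec![1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0]);
//     let apen = approximate_entropy(&regular, 2, 0.5)?;
//     assert!(apen < 0.5);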

/// Helper for [`approximate_entropy`]: the average log-frequency of template
/// matches of length `m` within tolerance `r` (Chebyshev distance).
fn phi_function(data: &Array1<f64>, m: usize, r: f64, n: usize) -> Result<f64> {
    let mut patterns = Vec::new();

    for i in 0..=(n - m) {
        let pattern: Vec<f64> = (0..m).map(|j| data[i + j]).collect();
        patterns.push(pattern);
    }

    let mut phi = 0.0;

    for i in 0..patterns.len() {
        let mut count = 0;
        for j in 0..patterns.len() {
            // Chebyshev (maximum-coordinate) distance between templates.
            let max_diff = patterns[i]
                .iter()
                .zip(patterns[j].iter())
                .map(|(a, b)| (a - b).abs())
                .fold(0.0, f64::max);

            if max_diff <= r {
                count += 1;
            }
        }

        let c_i = count as f64 / patterns.len() as f64;
        if c_i > 0.0 {
            phi += c_i.ln();
        }
    }

    Ok(phi / patterns.len() as f64)
}

/// Computes the mutual information I(X; Y) = H(X) + H(Y) - H(X, Y) in bits,
/// using a shared equal-width binning.
pub fn mutual_information(x: &Array1<f64>, y: &Array1<f64>, bins: usize) -> Result<f64> {
    if x.len() != y.len() {
        return Err(SklearsError::InvalidInput(
            "Arrays must have the same length".to_string(),
        ));
    }

    let h_x = shannon_entropy(x, bins)?;
    let h_y = shannon_entropy(y, bins)?;
    let h_xy = joint_entropy(x, y, bins)?;

    Ok(h_x + h_y - h_xy)
}
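
// A minimal usage sketch (illustrative): mutual information of a variable
// with itself equals its entropy, since I(X; X) = H(X).
//
//     let x = Array1::from(vec![1.0, 2.0, 3.0, 4.0, 5.0]);
//     let mi = mutual_information(&x, &x, 5)?;
//     let h = shannon_entropy(&x, 5)?;
//     assert!((mi - h).abs() < 1e-10);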

/// Computes the joint entropy H(X, Y) in bits from the discretized joint
/// distribution of the two series.
pub fn joint_entropy(x: &Array1<f64>, y: &Array1<f64>, bins: usize) -> Result<f64> {
    if x.len() != y.len() {
        return Err(SklearsError::InvalidInput(
            "Arrays must have the same length".to_string(),
        ));
    }

    let joint_probs = compute_joint_probabilities(x, y, bins)?;
    let mut entropy = 0.0;

    for &p in joint_probs.values() {
        if p > 0.0 {
            entropy -= p * p.log2();
        }
    }

    Ok(entropy)
}

/// Computes the conditional entropy H(Y | X) = H(X, Y) - H(X) in bits.
pub fn conditional_entropy(y: &Array1<f64>, x: &Array1<f64>, bins: usize) -> Result<f64> {
    let h_xy = joint_entropy(x, y, bins)?;
    let h_x = shannon_entropy(x, bins)?;
    Ok(h_xy - h_x)
}

/// Computes the normalized mutual information, scaling I(X; Y) by the
/// geometric mean of the marginal entropies so the score lies in [0, 1].
pub fn normalized_mutual_information(x: &Array1<f64>, y: &Array1<f64>, bins: usize) -> Result<f64> {
    let mi = mutual_information(x, y, bins)?;
    let h_x = shannon_entropy(x, bins)?;
    let h_y = shannon_entropy(y, bins)?;

    if h_x == 0.0 || h_y == 0.0 {
        return Ok(0.0);
    }

    let nmi = mi / (h_x * h_y).sqrt();
    Ok(nmi.clamp(0.0, 1.0))
}
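
// A minimal usage sketch (illustrative): two perfectly co-monotone variables
// discretize to aligned bins, so the normalized score reaches 1.
//
//     let x = Array1::from(vec![1.0, 2.0, 3.0, 4.0, 5.0]);
//     let y = Array1::from(vec![2.0, 4.0, 6.0, 8.0, 10.0]);
//     let nmi = normalized_mutual_information(&x, &y, 5)?;
//     assert!((nmi - 1.0).abs() < 1e-10);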

/// Estimates the transfer entropy TE(X → Y) at the given `lag`: the
/// information that the past of X adds about the future of Y beyond what
/// Y's own past provides, via
/// TE = H(Y_f, Y_p) + H(X_p, Y_p) - H(Y_f, X_p, Y_p) - H(Y_p).
pub fn transfer_entropy(x: &Array1<f64>, y: &Array1<f64>, bins: usize, lag: usize) -> Result<f64> {
    if x.len() != y.len() {
        return Err(SklearsError::InvalidInput(
            "Arrays must have the same length".to_string(),
        ));
    }

    if x.len() <= lag {
        return Err(SklearsError::InvalidInput(
            "Array length must be greater than lag".to_string(),
        ));
    }

    let y_future = y.slice(s![lag..]).to_owned();
    let x_past = x.slice(s![..x.len() - lag]).to_owned();
    let y_past = y.slice(s![..y.len() - lag]).to_owned();

    let h_y_future_y_past = joint_entropy(&y_future, &y_past, bins)?;
    let h_x_past_y_past = joint_entropy(&x_past, &y_past, bins)?;
    let h_y_past = shannon_entropy(&y_past, bins)?;

    let h_xyz = approximate_trivariate_entropy(&y_future, &x_past, &y_past, bins)?;

    Ok(h_y_future_y_past + h_x_past_y_past - h_xyz - h_y_past)
}
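
// A minimal usage sketch (illustrative; arrays and parameters are made up):
// transfer entropy is directional, so swapping the arguments measures the
// opposite direction of influence.
//
//     let x = Array1::from(vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);
//     let y = Array1::from(vec![2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]);
//     let te_xy = transfer_entropy(&x, &y, 4, 1)?; // X -> Y
//     let te_yx = transfer_entropy(&y, &x, 4, 1)?; // Y -> X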

/// Computes the entropy of the discretized joint distribution of three
/// series; used by [`transfer_entropy`].
fn approximate_trivariate_entropy(
    x: &Array1<f64>,
    y: &Array1<f64>,
    z: &Array1<f64>,
    bins: usize,
) -> Result<f64> {
    if x.len() != y.len() || y.len() != z.len() {
        return Err(SklearsError::InvalidInput(
            "All arrays must have the same length".to_string(),
        ));
    }

    let x_disc = discretize(x, bins)?;
    let y_disc = discretize(y, bins)?;
    let z_disc = discretize(z, bins)?;

    let mut counts: HashMap<(usize, usize, usize), usize> = HashMap::new();
    let n = x.len();

    for i in 0..n {
        *counts.entry((x_disc[i], y_disc[i], z_disc[i])).or_insert(0) += 1;
    }

    let mut entropy = 0.0;
    for &count in counts.values() {
        let p = count as f64 / n as f64;
        if p > 0.0 {
            entropy -= p * p.log2();
        }
    }

    Ok(entropy)
}

/// Computes a normalized Lempel-Ziv complexity estimate: the series is
/// binarized, parsed into distinct phrases, and the phrase count is scaled
/// by the asymptotic maximum n / log2(n).
pub fn lempel_ziv_complexity(data: &Array1<f64>, bins: usize) -> Result<f64> {
    if data.is_empty() {
        return Err(SklearsError::InvalidInput(
            "Data array cannot be empty".to_string(),
        ));
    }

    let binary = discretize_to_binary(data, bins)?;
    let n = binary.len();
    if n < 2 {
        // log2(1) == 0, so the normalization below is undefined for n < 2.
        return Ok(0.0);
    }

    let mut complexity = 1;
    let mut prefix_len = 1;
    let mut i = 0;

    while i + prefix_len <= n {
        let prefix = &binary[i..i + prefix_len];
        let mut found = false;

        // Search the already-seen part of the sequence for the current
        // candidate phrase.
        let start_j = if prefix_len <= i + 1 {
            i + 1 - prefix_len
        } else {
            0
        };
        for j in start_j..=i {
            if j >= prefix_len && &binary[j - prefix_len..j] == prefix {
                found = true;
                break;
            }
        }

        if found {
            prefix_len += 1;
        } else {
            complexity += 1;
            i += prefix_len;
            prefix_len = 1;
        }
    }

    let max_complexity = n as f64 / (n as f64).log2();
    Ok(complexity as f64 / max_complexity)
}
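
// A minimal usage sketch (illustrative): a constant series binarizes to a
// single repeated symbol and should score lower than a series that keeps
// introducing new subsequences.
//
//     let flat = Array1::from(vec![1.0; 16]);
//     let varied = Array1::from(vec![3.0, 1.0, 4.0, 1.0, 5.0, 9.0, 2.0, 6.0]);
//     let lz_flat = lempel_ziv_complexity(&flat, 4)?;
//     let lz_varied = lempel_ziv_complexity(&varied, 4)?;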

/// Computes the sample entropy (SampEn) with embedding dimension `m` and
/// tolerance `r`. Unlike approximate entropy, self-matches are excluded.
pub fn sample_entropy(data: &Array1<f64>, m: usize, r: f64) -> Result<f64> {
    if data.len() < m + 1 {
        return Err(SklearsError::InvalidInput(
            "Data length must be greater than m".to_string(),
        ));
    }

    let n = data.len();
    // `b` counts template matches of length m; `a` counts those that remain
    // matches when extended to length m + 1.
    let mut a: f64 = 0.0;
    let mut b: f64 = 0.0;

    for i in 0..n - m {
        for j in i + 1..n - m {
            let mut match_m = true;

            for k in 0..m {
                if (data[i + k] - data[j + k]).abs() > r {
                    match_m = false;
                    break;
                }
            }

            if match_m {
                b += 1.0;
                if (data[i + m] - data[j + m]).abs() <= r {
                    a += 1.0;
                }
            }
        }
    }

    if b == 0.0 {
        return Ok(0.0);
    }

    Ok(-(a / b).ln())
}
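
// A minimal usage sketch (illustrative): a perfectly periodic series keeps
// matching when templates are extended, so a == b above and the estimate
// is -ln(1) = 0.
//
//     let regular = Array1::from(vec![1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0]);
//     let sampen = sample_entropy(&regular, 2, 0.5)?;
//     assert!(sampen.abs() < 1e-10);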

/// Configuration for [`InformationFeatureSelector`].
#[derive(Debug, Clone)]
pub struct InformationFeatureSelectorConfig {
    /// Information-theoretic score used to rank features.
    pub metric: InformationMetric,
    /// Number of equal-width bins used for discretization.
    pub bins: usize,
    /// If set, keep the top-k features by score.
    pub k: Option<usize>,
    /// If set (and `k` is `None`), keep features whose score meets the threshold.
    pub threshold: Option<f64>,
}

/// Scoring metrics supported by the selector.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum InformationMetric {
    /// Raw mutual information I(X; y).
    MutualInformation,
    /// Mutual information normalized to [0, 1].
    NormalizedMI,
    /// Information gain H(y) - H(y | X).
    InformationGain,
    /// Symmetrical uncertainty 2·I(X; y) / (H(X) + H(y)).
    SymmetricalUncertainty,
}

impl Default for InformationFeatureSelectorConfig {
    fn default() -> Self {
        Self {
            metric: InformationMetric::MutualInformation,
            bins: 10,
            k: Some(10),
            threshold: None,
        }
    }
}

/// Feature selector that ranks features by an information-theoretic score
/// against the target.
pub struct InformationFeatureSelector {
    config: InformationFeatureSelectorConfig,
}

/// Fitted state of [`InformationFeatureSelector`], holding the per-feature
/// scores and the indices of the selected features.
pub struct InformationFeatureSelectorFitted {
    config: InformationFeatureSelectorConfig,
    scores: Vec<f64>,
    selected_features: Vec<usize>,
}

impl InformationFeatureSelector {
    /// Creates a selector with the given configuration.
    pub fn new(config: InformationFeatureSelectorConfig) -> Self {
        Self { config }
    }
}

impl Estimator for InformationFeatureSelector {
    type Config = InformationFeatureSelectorConfig;
    type Error = SklearsError;
    type Float = f64;

    fn config(&self) -> &Self::Config {
        &self.config
    }
}

impl Fit<Array2<f64>, Array1<f64>> for InformationFeatureSelector {
    type Fitted = InformationFeatureSelectorFitted;

    fn fit(self, X: &Array2<f64>, y: &Array1<f64>) -> Result<Self::Fitted> {
        if X.nrows() != y.len() {
            return Err(SklearsError::InvalidInput(
                "X and y must have the same number of samples".to_string(),
            ));
        }

        let n_features = X.ncols();
        let mut scores = Vec::with_capacity(n_features);

        // Score each feature column against the target.
        for j in 0..n_features {
            let feature = X.column(j).to_owned();
            let score = match self.config.metric {
                InformationMetric::MutualInformation => {
                    mutual_information(&feature, y, self.config.bins)?
                }
                InformationMetric::NormalizedMI => {
                    normalized_mutual_information(&feature, y, self.config.bins)?
                }
                InformationMetric::InformationGain => {
                    let h_y = shannon_entropy(y, self.config.bins)?;
                    let h_y_given_x = conditional_entropy(y, &feature, self.config.bins)?;
                    h_y - h_y_given_x
                }
                InformationMetric::SymmetricalUncertainty => {
                    let mi = mutual_information(&feature, y, self.config.bins)?;
                    let h_x = shannon_entropy(&feature, self.config.bins)?;
                    let h_y = shannon_entropy(y, self.config.bins)?;
                    if h_x + h_y == 0.0 {
                        0.0
                    } else {
                        2.0 * mi / (h_x + h_y)
                    }
                }
            };
            scores.push(score);
        }

        // Select by top-k if `k` is set, otherwise by threshold, otherwise
        // keep every feature.
        let mut selected_features: Vec<usize> = if let Some(k) = self.config.k {
            let mut indices: Vec<usize> = (0..n_features).collect();
            indices.sort_by(|&a, &b| {
                scores[b]
                    .partial_cmp(&scores[a])
                    .expect("scores should be comparable")
            });
            indices.into_iter().take(k.min(n_features)).collect()
        } else if let Some(threshold) = self.config.threshold {
            (0..n_features)
                .filter(|&i| scores[i] >= threshold)
                .collect()
        } else {
            (0..n_features).collect()
        };

        selected_features.sort_unstable();

        Ok(InformationFeatureSelectorFitted {
            config: self.config,
            scores,
            selected_features,
        })
    }
}

impl Transform<Array2<f64>, Array2<f64>> for InformationFeatureSelectorFitted {
    fn transform(&self, X: &Array2<f64>) -> Result<Array2<f64>> {
        if self.selected_features.is_empty() {
            return Err(SklearsError::InvalidInput(
                "No features were selected".to_string(),
            ));
        }

        let mut result = Array2::zeros((X.nrows(), self.selected_features.len()));

        for (new_idx, &old_idx) in self.selected_features.iter().enumerate() {
            if old_idx >= X.ncols() {
                return Err(SklearsError::InvalidInput(format!(
                    "Feature index {} out of bounds",
                    old_idx
                )));
            }
            result.column_mut(new_idx).assign(&X.column(old_idx));
        }

        Ok(result)
    }
}

impl InformationFeatureSelectorFitted {
    /// Returns the per-feature scores in original column order.
    pub fn scores(&self) -> &[f64] {
        &self.scores
    }

    /// Returns the indices of the selected features, in ascending order.
    pub fn selected_features(&self) -> &[usize] {
        &self.selected_features
    }
}
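
// End-to-end usage sketch for the selector (hypothetical `X` and `y`; the
// flow mirrors the tests at the bottom of this file):
//
//     let config = InformationFeatureSelectorConfig {
//         metric: InformationMetric::MutualInformation,
//         bins: 5,
//         k: Some(2),
//         threshold: None,
//     };
//     let fitted = InformationFeatureSelector::new(config).fit(&X, &y)?;
//     let x_reduced = fitted.transform(&X)?; // keeps only the selected columns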

/// Discretizes `data` into `bins` equal-width bins and returns the empirical
/// probability of each bin.
fn compute_probabilities(data: &Array1<f64>, bins: usize) -> Result<Vec<f64>> {
    if bins == 0 {
        return Err(SklearsError::InvalidInput(
            "Number of bins must be positive".to_string(),
        ));
    }

    let discretized = discretize(data, bins)?;
    let mut counts = vec![0; bins];

    for &bin in discretized.iter() {
        counts[bin] += 1;
    }

    let total = data.len() as f64;
    Ok(counts.into_iter().map(|c| c as f64 / total).collect())
}

/// Maps each value to an equal-width bin index in `0..bins`. A (near-)
/// constant series maps entirely to bin 0.
fn discretize(data: &Array1<f64>, bins: usize) -> Result<Vec<usize>> {
    let min_val = data.iter().cloned().fold(f64::INFINITY, f64::min);
    let max_val = data.iter().cloned().fold(f64::NEG_INFINITY, f64::max);

    if (max_val - min_val).abs() < 1e-10 {
        return Ok(vec![0; data.len()]);
    }

    let bin_width = (max_val - min_val) / bins as f64;
    let mut discretized = Vec::with_capacity(data.len());

    for &val in data.iter() {
        let bin = ((val - min_val) / bin_width).floor() as usize;
        // The maximum value would land in bin `bins`, so clamp it.
        discretized.push(bin.min(bins - 1));
    }

    Ok(discretized)
}

/// Binarizes the series: bins in the upper half map to 1, the rest to 0.
fn discretize_to_binary(data: &Array1<f64>, bins: usize) -> Result<Vec<u8>> {
    let discretized = discretize(data, bins)?;
    let threshold = bins / 2;
    Ok(discretized
        .into_iter()
        .map(|b| if b >= threshold { 1 } else { 0 })
        .collect())
}

/// Computes the empirical joint distribution over discretized (x, y) pairs.
fn compute_joint_probabilities(
    x: &Array1<f64>,
    y: &Array1<f64>,
    bins: usize,
) -> Result<HashMap<(usize, usize), f64>> {
    let x_disc = discretize(x, bins)?;
    let y_disc = discretize(y, bins)?;

    let mut counts: HashMap<(usize, usize), usize> = HashMap::new();
    let n = x.len();

    for i in 0..n {
        *counts.entry((x_disc[i], y_disc[i])).or_insert(0) += 1;
    }

    let mut probabilities = HashMap::new();
    for (key, count) in counts {
        probabilities.insert(key, count as f64 / n as f64);
    }

    Ok(probabilities)
}

#[cfg(test)]
mod tests {
    use super::*;
    use approx::assert_relative_eq;
    use scirs2_core::ndarray::array;

    #[test]
    fn test_shannon_entropy_uniform() {
        let data = array![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
        let entropy = shannon_entropy(&data, 8).expect("operation should succeed");

        // Eight equally likely bins -> log2(8) = 3 bits.
        assert_relative_eq!(entropy, 3.0, epsilon = 0.1);
    }

    #[test]
    fn test_shannon_entropy_deterministic() {
        let data = array![1.0, 1.0, 1.0, 1.0, 1.0];
        let entropy = shannon_entropy(&data, 5).expect("operation should succeed");

        assert_relative_eq!(entropy, 0.0, epsilon = 1e-10);
    }

    #[test]
    fn test_mutual_information_nonnegative() {
        // x and y are deterministically related, not independent; this test
        // only checks that the estimate is non-negative.
        let x = array![1.0, 2.0, 3.0, 4.0, 5.0];
        let y = array![5.0, 4.0, 3.0, 2.0, 1.0];

        let mi = mutual_information(&x, &y, 5).expect("operation should succeed");

        assert!(mi >= 0.0);
    }

    #[test]
    fn test_mutual_information_identical() {
        let x = array![1.0, 2.0, 3.0, 4.0, 5.0];
        let mi = mutual_information(&x, &x, 5).expect("operation should succeed");
        let h_x = shannon_entropy(&x, 5).expect("operation should succeed");

        assert_relative_eq!(mi, h_x, epsilon = 1e-10);
    }

    #[test]
    fn test_renyi_entropy() {
        let data = array![1.0, 2.0, 3.0, 4.0, 5.0];

        let renyi_05 = renyi_entropy(&data, 5, 0.5).expect("operation should succeed");
        let renyi_20 = renyi_entropy(&data, 5, 2.0).expect("operation should succeed");
        let shannon = shannon_entropy(&data, 5).expect("operation should succeed");

        assert!(renyi_05 > 0.0);
        assert!(renyi_20 > 0.0);
        assert!(shannon > 0.0);
    }

    #[test]
    fn test_permutation_entropy() {
        let data = array![1.0, 2.0, 3.0, 4.0, 5.0, 4.0, 3.0, 2.0, 1.0];
        let pe = permutation_entropy(&data, 3, 1).expect("operation should succeed");

        assert!(pe > 0.0);
        // Maximum for order 3 is log2(3!) = log2(6).
        assert!(pe <= 6.0f64.log2());
    }

    #[test]
    fn test_approximate_entropy() {
        let data = array![1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0];
        let apen = approximate_entropy(&data, 2, 0.5).expect("operation should succeed");

        assert!(apen >= 0.0);
    }

    #[test]
    fn test_lempel_ziv_complexity() {
        let data = array![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
        let lz = lempel_ziv_complexity(&data, 4).expect("operation should succeed");

        assert!(
            lz > 0.0 && lz < 100.0,
            "LZ complexity should be reasonable, got {}",
            lz
        );
    }

    #[test]
    fn test_sample_entropy() {
        let data = array![1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0];
        let sampen = sample_entropy(&data, 2, 0.5).expect("operation should succeed");

        assert!(sampen >= 0.0);
    }

    #[test]
    fn test_information_feature_selector() {
        let X = array![
            [1.0, 10.0, 100.0],
            [2.0, 20.0, 200.0],
            [3.0, 30.0, 300.0],
            [4.0, 40.0, 400.0],
            [5.0, 50.0, 500.0]
        ];
        let y = array![1.0, 2.0, 3.0, 4.0, 5.0];

        let config = InformationFeatureSelectorConfig {
            metric: InformationMetric::MutualInformation,
            bins: 5,
            k: Some(2),
            threshold: None,
        };

        let selector = InformationFeatureSelector::new(config);
        let fitted = selector.fit(&X, &y).expect("model fitting should succeed");

        assert_eq!(fitted.selected_features().len(), 2);
        assert_eq!(fitted.scores().len(), 3);

        let X_transformed = fitted.transform(&X).expect("transformation should succeed");
        assert_eq!(X_transformed.ncols(), 2);
    }

    #[test]
    fn test_normalized_mutual_information() {
        let x = array![1.0, 2.0, 3.0, 4.0, 5.0];
        let y = array![2.0, 4.0, 6.0, 8.0, 10.0];

        let nmi = normalized_mutual_information(&x, &y, 5).expect("operation should succeed");

        assert!(nmi >= 0.0);
        assert!(nmi <= 1.0);
    }

    #[test]
    fn test_conditional_entropy() {
        let x = array![1.0, 2.0, 3.0, 4.0, 5.0];
        let y = array![2.0, 4.0, 6.0, 8.0, 10.0];

        let h_y_given_x = conditional_entropy(&y, &x, 5).expect("operation should succeed");
        let h_y = shannon_entropy(&y, 5).expect("operation should succeed");

        assert!(h_y_given_x >= 0.0);
        assert!(h_y_given_x <= h_y);
    }

    #[test]
    fn test_transfer_entropy() {
        let x = array![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
        let y = array![2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0];

        let te = transfer_entropy(&x, &y, 4, 1).expect("operation should succeed");

        assert!(te.is_finite());
    }

    #[test]
    fn test_discretize() {
        let data = array![1.0, 2.0, 3.0, 4.0, 5.0];
        let discretized = discretize(&data, 5).expect("operation should succeed");

        assert_eq!(discretized.len(), 5);
        assert!(discretized.iter().all(|&b| b < 5));
    }

    #[test]
    fn test_feature_selector_threshold() {
        let X = array![
            [1.0, 10.0, 100.0],
            [2.0, 20.0, 200.0],
            [3.0, 30.0, 300.0],
            [4.0, 40.0, 400.0]
        ];
        let y = array![1.0, 2.0, 3.0, 4.0];

        let config = InformationFeatureSelectorConfig {
            metric: InformationMetric::MutualInformation,
            bins: 4,
            k: None,
            threshold: Some(0.5),
        };

        let selector = InformationFeatureSelector::new(config);
        let fitted = selector.fit(&X, &y).expect("model fitting should succeed");

        assert!(fitted.selected_features().len() <= 3);
    }
}