Skip to main content

sdivi_patterns/
entropy.rs

1//! Per-category normalized Shannon entropy for pattern catalogs.
2
3use std::collections::BTreeMap;
4
5use crate::catalog::PatternStats;
6use crate::fingerprint::PatternFingerprint;
7
8/// Computes normalized Shannon entropy for the pattern statistics within one category.
9///
10/// # Formula
11///
12/// ```text
13/// H = -sum(p_i * log2(p_i)) / log2(N)
14/// ```
15///
16/// where:
17/// - `p_i = count_i / total_instances` for each distinct fingerprint
18/// - `N` = number of distinct fingerprints
19/// - Result is bounded to `[0, 1]`
20/// - Returns `0.0` when `N <= 1` (only one distinct shape, or no instances)
21///
22/// # Examples
23///
24/// ```rust
25/// use std::collections::BTreeMap;
26/// use sdivi_patterns::fingerprint::fingerprint_node_kind;
27/// use sdivi_patterns::catalog::PatternStats;
28/// use sdivi_patterns::entropy::compute_entropy;
29///
30/// let mut stats = BTreeMap::new();
31/// let fp1 = fingerprint_node_kind("try_expression");
32/// let fp2 = fingerprint_node_kind("match_expression");
33/// stats.insert(fp1, PatternStats { count: 5, locations: vec![] });
34/// stats.insert(fp2, PatternStats { count: 5, locations: vec![] });
35///
36/// // Equal distribution across 2 shapes → maximum entropy = 1.0
37/// let h = compute_entropy(&stats);
38/// assert!((h - 1.0).abs() < 1e-10);
39/// ```
40pub fn compute_entropy(stats: &BTreeMap<PatternFingerprint, PatternStats>) -> f64 {
41    let distinct = stats.len();
42    if distinct <= 1 {
43        return 0.0;
44    }
45
46    let total: u32 = stats.values().map(|s| s.count).sum();
47    if total == 0 {
48        return 0.0;
49    }
50
51    let total_f = f64::from(total);
52    let raw: f64 = stats
53        .values()
54        .map(|s| {
55            let p = f64::from(s.count) / total_f;
56            if p > 0.0 {
57                -p * p.log2()
58            } else {
59                0.0
60            }
61        })
62        .sum();
63
64    let max = (distinct as f64).log2();
65    if max <= 0.0 {
66        return 0.0;
67    }
68
69    (raw / max).clamp(0.0, 1.0)
70}
71
72#[cfg(test)]
73mod tests {
74    use super::*;
75    use crate::fingerprint::fingerprint_node_kind;
76
77    fn make_stats(count: u32) -> PatternStats {
78        PatternStats {
79            count,
80            locations: vec![],
81        }
82    }
83
84    #[test]
85    fn empty_stats_returns_zero() {
86        let stats: BTreeMap<PatternFingerprint, PatternStats> = BTreeMap::new();
87        assert_eq!(compute_entropy(&stats), 0.0);
88    }
89
90    #[test]
91    fn single_shape_returns_zero() {
92        let mut stats = BTreeMap::new();
93        stats.insert(fingerprint_node_kind("try_expression"), make_stats(10));
94        assert_eq!(compute_entropy(&stats), 0.0);
95    }
96
97    #[test]
98    fn equal_distribution_two_shapes_returns_one() {
99        let mut stats = BTreeMap::new();
100        stats.insert(fingerprint_node_kind("try_expression"), make_stats(5));
101        stats.insert(fingerprint_node_kind("match_expression"), make_stats(5));
102        let h = compute_entropy(&stats);
103        assert!((h - 1.0).abs() < 1e-10);
104    }
105
106    #[test]
107    fn unequal_distribution_is_between_zero_and_one() {
108        let mut stats = BTreeMap::new();
109        stats.insert(fingerprint_node_kind("try_expression"), make_stats(9));
110        stats.insert(fingerprint_node_kind("match_expression"), make_stats(1));
111        let h = compute_entropy(&stats);
112        assert!(h > 0.0 && h < 1.0);
113    }
114}