use std::collections::BTreeMap;
use crate::catalog::PatternStats;
use crate::fingerprint::PatternFingerprint;
pub fn compute_entropy(stats: &BTreeMap<PatternFingerprint, PatternStats>) -> f64 {
let distinct = stats.len();
if distinct <= 1 {
return 0.0;
}
let total: u32 = stats.values().map(|s| s.count).sum();
if total == 0 {
return 0.0;
}
let total_f = f64::from(total);
let raw: f64 = stats
.values()
.map(|s| {
let p = f64::from(s.count) / total_f;
if p > 0.0 {
-p * p.log2()
} else {
0.0
}
})
.sum();
let max = (distinct as f64).log2();
if max <= 0.0 {
return 0.0;
}
(raw / max).clamp(0.0, 1.0)
}
#[cfg(test)]
mod tests {
use super::*;
use crate::fingerprint::fingerprint_node_kind;
fn make_stats(count: u32) -> PatternStats {
PatternStats {
count,
locations: vec![],
}
}
#[test]
fn empty_stats_returns_zero() {
let stats: BTreeMap<PatternFingerprint, PatternStats> = BTreeMap::new();
assert_eq!(compute_entropy(&stats), 0.0);
}
#[test]
fn single_shape_returns_zero() {
let mut stats = BTreeMap::new();
stats.insert(fingerprint_node_kind("try_expression"), make_stats(10));
assert_eq!(compute_entropy(&stats), 0.0);
}
#[test]
fn equal_distribution_two_shapes_returns_one() {
let mut stats = BTreeMap::new();
stats.insert(fingerprint_node_kind("try_expression"), make_stats(5));
stats.insert(fingerprint_node_kind("match_expression"), make_stats(5));
let h = compute_entropy(&stats);
assert!((h - 1.0).abs() < 1e-10);
}
#[test]
fn unequal_distribution_is_between_zero_and_one() {
let mut stats = BTreeMap::new();
stats.insert(fingerprint_node_kind("try_expression"), make_stats(9));
stats.insert(fingerprint_node_kind("match_expression"), make_stats(1));
let h = compute_entropy(&stats);
assert!(h > 0.0 && h < 1.0);
}
}