use mnem_core::id::NodeId;
use serde::{Deserialize, Serialize};
pub const WILSON_Z: f32 = 1.96;
pub const WILSON_WIDTH_TARGET: f32 = 0.18;
pub const K_MIN: usize = 8;
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "kebab-case")]
pub enum ShapeLabel {
LongTail,
Uniform,
Bimodal,
InsufficientSamples,
}
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ScoreDistribution {
pub min: f32,
pub max: f32,
pub median: f32,
pub iqr: f32,
pub shape: ShapeLabel,
}
impl Default for ScoreDistribution {
fn default() -> Self {
Self {
min: 0.0,
max: 0.0,
median: 0.0,
iqr: 0.0,
shape: ShapeLabel::InsufficientSamples,
}
}
}
#[must_use]
pub fn score_quantiles<N>(ranked: &[(N, f32)]) -> Vec<f32> {
let k = ranked.len();
if k == 0 {
return Vec::new();
}
if k == 1 {
return vec![1.0];
}
let denom = (k - 1) as f32;
(0..k).map(|i| (k - 1 - i) as f32 / denom).collect()
}
#[must_use]
pub fn node_score_quantiles(ranked: &[(NodeId, f32)]) -> Vec<f32> {
score_quantiles(ranked)
}
#[must_use]
pub fn distribution_shape(scores: &[f32], gate: usize) -> ScoreDistribution {
if scores.len() < gate {
return ScoreDistribution::default();
}
let mut sorted: Vec<f32> = scores.to_vec();
sorted.sort_by(f32::total_cmp);
let n = sorted.len();
let min = sorted[0];
let max = sorted[n - 1];
let median = sorted[n / 2];
let q1 = sorted[n / 4];
let q3 = sorted[(3 * n) / 4];
let iqr = (q3 - q1).max(f32::EPSILON);
let range = max - min;
let shape = if bimodal_gap(&sorted, range) {
ShapeLabel::Bimodal
} else if (max - median) > 2.0 * iqr {
ShapeLabel::LongTail
} else {
ShapeLabel::Uniform
};
ScoreDistribution {
min,
max,
median,
iqr,
shape,
}
}
fn bimodal_gap(sorted: &[f32], range: f32) -> bool {
if sorted.len() < 4 || range <= f32::EPSILON {
return false;
}
let threshold = range * 0.5;
for (i, w) in sorted.windows(2).enumerate() {
let below = i + 1;
let above = sorted.len() - i - 1;
if below < 2 || above < 2 {
continue;
}
let gap = w[1] - w[0];
if gap > threshold {
return true;
}
}
false
}
#[must_use]
pub fn derive_k_min(width_target: f32) -> usize {
let eps = if width_target.is_finite() && width_target > 0.0 {
width_target
} else {
f32::EPSILON
};
let raw = (WILSON_Z / eps).powi(2) * 0.25;
#[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
let k = raw.ceil() as usize;
k.max(1)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn k_min_default_is_8() {
assert_eq!(K_MIN, 8);
assert_eq!(derive_k_min(0.35), 8);
}
#[test]
fn width_target_tunable_in_principled_range() {
assert!((0.10..=0.35).contains(&WILSON_WIDTH_TARGET));
assert_eq!(derive_k_min(0.10), 97);
assert_eq!(derive_k_min(WILSON_WIDTH_TARGET), 30);
assert_eq!(derive_k_min(0.30), 11);
assert!(derive_k_min(0.10) > derive_k_min(0.18));
assert!(derive_k_min(0.18) > derive_k_min(0.30));
}
#[test]
fn derive_k_min_never_zero() {
assert!(derive_k_min(100.0) >= 1);
assert!(derive_k_min(0.0) >= 1);
assert!(derive_k_min(-1.0) >= 1);
assert!(derive_k_min(f32::NAN) >= 1);
}
#[test]
fn quantile_monotone() {
let ranked: Vec<(u32, f32)> = vec![(0, 0.9), (1, 0.7), (2, 0.5), (3, 0.3), (4, 0.1)];
let q = score_quantiles(&ranked);
assert_eq!(q, vec![1.0, 0.75, 0.5, 0.25, 0.0]);
assert!(q.windows(2).all(|w| w[0] >= w[1]));
}
#[test]
fn quantile_top_is_one_bottom_is_zero() {
let ranked: Vec<(u32, f32)> = (0..10).map(|i| (i, 1.0 - (i as f32) * 0.1)).collect();
let q = score_quantiles(&ranked);
assert!((q[0] - 1.0).abs() < f32::EPSILON);
assert!((q[9] - 0.0).abs() < f32::EPSILON);
}
#[test]
fn quantile_edge_cases() {
let empty: Vec<(u32, f32)> = vec![];
assert!(score_quantiles(&empty).is_empty());
let one: Vec<(u32, f32)> = vec![(0, 0.42)];
assert_eq!(score_quantiles(&one), vec![1.0]);
}
#[test]
fn quantile_scale_invariance() {
let ranked_a: Vec<(u32, f32)> = vec![(0, 0.9), (1, 0.5), (2, 0.1)];
let ranked_b: Vec<(u32, f32)> = vec![(0, 90.0), (1, 50.0), (2, 10.0)];
assert_eq!(score_quantiles(&ranked_a), score_quantiles(&ranked_b));
}
#[test]
fn shape_long_tail_when_top_score_dominates() {
let scores = vec![0.10, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.95];
let dist = distribution_shape(&scores, 8);
assert_eq!(dist.shape, ShapeLabel::LongTail);
assert!((dist.max - 0.95).abs() < 1e-5);
assert!(dist.min >= 0.0);
}
#[test]
fn shape_uniform() {
let scores: Vec<f32> = (0..10).map(|i| i as f32 * 0.1).collect();
let dist = distribution_shape(&scores, K_MIN);
assert_eq!(dist.shape, ShapeLabel::Uniform);
}
#[test]
fn shape_bimodal() {
let scores = vec![0.10, 0.11, 0.12, 0.13, 0.14, 0.80, 0.81, 0.82, 0.83, 0.84];
let dist = distribution_shape(&scores, K_MIN);
assert_eq!(dist.shape, ShapeLabel::Bimodal);
}
#[test]
fn shape_insufficient_samples_below_gate() {
let scores = vec![0.1, 0.5, 0.9];
let dist = distribution_shape(&scores, K_MIN);
assert_eq!(dist.shape, ShapeLabel::InsufficientSamples);
assert_eq!(dist.min, 0.0);
assert_eq!(dist.max, 0.0);
}
#[test]
fn shape_all_equal_is_uniform_not_nan() {
let scores = vec![0.5_f32; 8];
let dist = distribution_shape(&scores, K_MIN);
assert_eq!(dist.shape, ShapeLabel::Uniform);
assert!(dist.iqr > 0.0); }
#[test]
fn scale_free_across_response_sizes() {
for &k in &[8_usize, 1000] {
let ranked: Vec<(u32, f32)> = (0..k)
.map(|i| (i as u32, 1.0 - (i as f32) / (k as f32)))
.collect();
let q = score_quantiles(&ranked);
assert_eq!(q.len(), k);
assert!((q[0] - 1.0).abs() < 1e-5);
assert!((q[k - 1] - 0.0).abs() < 1e-5);
let scores: Vec<f32> = ranked.iter().map(|(_, s)| *s).collect();
let dist = distribution_shape(&scores, K_MIN);
assert_eq!(dist.shape, ShapeLabel::Uniform);
}
}
use proptest::prelude::*;
proptest! {
#[test]
fn determinism(
xs in proptest::collection::vec(-1000.0_f32..1000.0, 0..200)
) {
let xs: Vec<f32> = xs.into_iter().filter(|v| v.is_finite()).collect();
let ranked: Vec<(u32, f32)> =
xs.iter().enumerate().map(|(i, v)| (i as u32, *v)).collect();
let q1 = score_quantiles(&ranked);
let q2 = score_quantiles(&ranked);
prop_assert_eq!(&q1, &q2);
let d1 = distribution_shape(&xs, K_MIN);
let d2 = distribution_shape(&xs, K_MIN);
prop_assert_eq!(d1, d2);
}
#[test]
fn shape_scale_invariant(
xs in proptest::collection::vec(0.0_f32..100.0, 8..64),
scale in 0.01_f32..1000.0,
) {
let xs: Vec<f32> = xs.into_iter().filter(|v| v.is_finite()).collect();
if xs.len() < K_MIN {
return Ok(());
}
let scaled: Vec<f32> = xs.iter().map(|v| v * scale).collect();
let d1 = distribution_shape(&xs, K_MIN);
let d2 = distribution_shape(&scaled, K_MIN);
prop_assert_eq!(d1.shape, d2.shape);
}
}
}