mnem_graphrag/confidence.rs
1//! Within-query confidence signals over retrieval score distributions
2//! (Gap 05, LD-primary).
3//!
4//! All signals are pure functions of the returned top-K scores. No
5//! globals, no rolling state, no I/O. Given the same input, two
6//! independent calls produce byte-identical output.
7//!
8//! # Design
9//!
10//! Given a descending-sorted score vector `[s1, s2, ..., sK]`:
11//!
12//! - [`normalized_entropy`]: Shannon entropy of a softmax over the
13//! min-max-rescaled scores, divided by `ln(K)`. Dimensionless,
14//! living in `[0, 1]`. `0` = one score dominates; `1` = all equal.
15//! - [`median_topk_margin_pct`]: median of the per-adjacent-pair
16//! relative gaps `(s_i - s_{i+1}) / max(s_i, eps)` over the top-K.
17//! The *within-query baseline*: is the top-1 lead unusual relative
18//! to the rest of this ranking?
19//! - [`rank_agreement`]: categorical label (`High` / `Medium` / `Low`)
20//! derived from the two metrics above. Auto-adapts per-query; no
21//! hardcoded global threshold.
22//!
23//! # Why within-query
24//!
25//! Cross-query calibration (Gap 01) needs rolling-median state. This
26//! gap is *strictly within-query* and needs none of that plumbing.
27//! Consumers that want the hop-suggestion signal (Gap 01) combine
28//! [`RankAgreement`] with their rolling median externally.
29
30use serde::{Deserialize, Serialize};
31
32/// Minimum K for a statistically meaningful `rank_agreement` bucket.
33///
34/// At `K < 5` the within-query median of `K-1` adjacent margins is
35/// under-powered (Wilson 95% CI width > 0.45 at p=0.5). Callers with
36/// fewer items should treat the signal as insufficient.
37pub const K_MIN_SHAPE_GATE: usize = 5;
38
39/// Tiny epsilon used to avoid division by zero when the top score or a
40/// per-pair denominator is itself near zero.
41const EPS: f32 = 1e-9;
42
43/// Categorical confidence label for a returned top-K ranking.
44///
45/// Derived from the softmax-entropy and the within-query median
46/// adjacent margin. The mapping is intentionally coarse: the three-
47/// bucket view is what downstream consumers (Gap 01 `suggest_hop`, UI
48/// badges) actually use. The richer four-label string form from the
49/// solution design (confident / likely / tie / flat) is preserved in
50/// [`RankAgreement::as_fine_label`] for telemetry.
51#[derive(Serialize, Deserialize, Clone, Copy, Debug, PartialEq, Eq, Hash)]
52#[serde(rename_all = "lowercase")]
53pub enum RankAgreement {
54 /// The top-1 lead is unusually large relative to the rest of the
55 /// ranking *and* entropy is low. Downstream: do not suggest a hop.
56 /// Fine label: `confident`.
57 High,
58 /// The top-1 lead beats the within-query median margin but does
59 /// not meet the stricter entropy floor. Fine label: `likely`.
60 Medium,
61 /// Top-K is near-tied or flat. Downstream: Gap 01 `suggest_hop`
62 /// should fire (subject to its graph-size gate). Fine labels:
63 /// `tie` (top is near-tied with runners-up) and `flat` (entire
64 /// distribution is near-uniform).
65 Low,
66 /// `K < K_MIN_SHAPE_GATE`: sample too small to label meaningfully.
67 /// Consumers should treat this as "no signal".
68 Insufficient,
69}
70
71impl RankAgreement {
72 /// Stable fine-grained label for telemetry (`confident` / `likely`
73 /// / `tie` / `flat` / `insufficient_k`). The coarse three-bucket
74 /// [`RankAgreement`] variant is what programmatic consumers should
75 /// match on; this string is for dashboards and logs.
76 #[must_use]
77 pub const fn as_fine_label(self) -> &'static str {
78 match self {
79 Self::High => "confident",
80 Self::Medium => "likely",
81 Self::Low => "flat",
82 Self::Insufficient => "insufficient_k",
83 }
84 }
85}
86
87/// Compute the normalized Shannon entropy of a top-K score vector.
88///
89/// Scores are first shifted to be non-negative (by subtracting the
90/// minimum, which is scale-invariant under affine score transforms
91/// that preserve ordering) and then L1-normalized into a probability
92/// distribution. Shannon entropy of that distribution divided by
93/// `ln(K)` yields a dimensionless value in `[0, 1]`:
94///
95/// - `0.0`: a single score dominates (one-hot distribution).
96/// - `1.0`: all scores are equal (uniform distribution).
97///
98/// We do **not** use a softmax: for score ranges that are small in
99/// absolute terms, `exp` damps differences and the distribution
100/// collapses towards uniform even when one score is clearly peaked.
101/// L1-normalization preserves the relative magnitude structure that
102/// consumers of retrieval scores actually care about.
103///
104/// Returns `0.0` for `scores.len() < 2` (no meaningful distribution).
105/// Non-finite inputs (`NaN`, `+inf`, `-inf`) are treated as the score
106/// minimum so they do not poison the distribution.
107#[must_use]
108pub fn normalized_entropy(scores: &[f32]) -> f32 {
109 let k = scores.len();
110 if k < 2 {
111 return 0.0;
112 }
113
114 // Sanitize: replace non-finite entries with the finite minimum.
115 let finite_min = scores
116 .iter()
117 .copied()
118 .filter(|s| s.is_finite())
119 .fold(f32::INFINITY, f32::min);
120 let finite_min = if finite_min.is_finite() {
121 finite_min
122 } else {
123 0.0
124 };
125
126 let sanitized: Vec<f32> = scores
127 .iter()
128 .map(|&s| if s.is_finite() { s } else { finite_min })
129 .collect();
130
131 // Shift so min -> 0. Sum is scale-free in the sense that scaling
132 // every score by a positive constant produces proportional shifts.
133 let min = sanitized.iter().copied().fold(f32::INFINITY, f32::min);
134 let shifted: Vec<f32> = sanitized.iter().map(|&s| (s - min).max(0.0)).collect();
135 let sum: f32 = shifted.iter().sum();
136
137 // All scores equal => uniform distribution => entropy 1.0.
138 if sum <= EPS {
139 return 1.0;
140 }
141
142 let mut entropy = 0.0_f32;
143 for &x in &shifted {
144 let p = x / sum;
145 if p > EPS {
146 entropy -= p * p.ln();
147 }
148 }
149 let denom = (k as f32).ln().max(EPS);
150 (entropy / denom).clamp(0.0, 1.0)
151}
152
153/// Median of the per-adjacent-pair relative gaps over a top-K vector.
154///
155/// For an input `[s1, s2, ..., sK]` (assumed descending), returns
156/// `median_i { (s_i - s_{i+1}) / max(s_i, eps) }` over the `K-1`
157/// adjacent pairs, optionally trimmed to the first `k` pairs if `k`
158/// is smaller than `K-1`. When `k == 0` or the vector has fewer than
159/// 2 elements, returns `0.0`.
160///
161/// This is the *within-query baseline*: "how wide is a typical gap in
162/// this specific ranking?" [`rank_agreement`] uses it as the anchor
163/// against which the top-1 margin is compared.
164#[must_use]
165pub fn median_topk_margin_pct(scores: &[f32], k: usize) -> f32 {
166 if scores.len() < 2 || k == 0 {
167 return 0.0;
168 }
169 let pair_limit = (scores.len() - 1).min(k);
170 let mut pair_pcts: Vec<f32> = (0..pair_limit)
171 .map(|i| {
172 let num = scores[i] - scores[i + 1];
173 let denom = scores[i].abs().max(EPS);
174 (num / denom).max(0.0)
175 })
176 .collect();
177 if pair_pcts.is_empty() {
178 return 0.0;
179 }
180 // Sort with a total order: NaN-safe via partial_cmp fallback.
181 pair_pcts.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
182 let mid = pair_pcts.len() / 2;
183 if pair_pcts.len() % 2 == 0 {
184 0.5 * (pair_pcts[mid - 1] + pair_pcts[mid])
185 } else {
186 pair_pcts[mid]
187 }
188}
189
190/// Categorical confidence label for a top-K score vector.
191///
192/// See [`RankAgreement`] for the bucket semantics. The derivation is
193/// a pure function of the score distribution; no global thresholds.
194///
195/// # Algorithm
196///
197/// Let `m1 = (s1 - s2) / max(s1, eps)` be the top-1 relative margin,
198/// `mu = median_topk_margin_pct(scores, scores.len() - 1)` be the
199/// within-query baseline, and `h = normalized_entropy(scores)`.
200///
201/// - `High` when `m1 >= 2 * mu` and `h < 1 - mu` (top-1 is decisively
202/// ahead of the typical gap *and* the distribution is peaked).
203/// - `Medium` when `m1 > mu` (top-1 beats baseline but entropy is
204/// not tight enough for `High`).
205/// - `Low` when the ranking is near-tied (`m1 < mu / 4` with a
206/// still-positive top score) or generally flat.
207/// - `Insufficient` when `scores.len() < K_MIN_SHAPE_GATE` (5).
208#[must_use]
209pub fn rank_agreement(scores: &[f32]) -> RankAgreement {
210 if scores.len() < K_MIN_SHAPE_GATE {
211 return RankAgreement::Insufficient;
212 }
213
214 let s1 = scores[0];
215 let s2 = scores[1];
216 let s_last = *scores.last().expect("len >= K_MIN_SHAPE_GATE >= 5");
217
218 let top1_margin_pct = if s1.abs() > EPS {
219 ((s1 - s2) / s1).max(0.0)
220 } else {
221 0.0
222 };
223
224 let median_margin = median_topk_margin_pct(scores, scores.len() - 1);
225 let norm_entropy = normalized_entropy(scores);
226
227 // Degenerate flat: every score equal (or near-equal). No useful
228 // signal; we are in the Low bucket regardless of top1_margin_pct.
229 // Note: s_last is read to keep the name live and document the
230 // shape of the flat branch (s1 == s_last iff uniform).
231 let _ = s_last;
232 if median_margin < EPS {
233 return RankAgreement::Low;
234 }
235
236 // Tight: top-1 decisively ahead *and* distribution is non-uniform.
237 // The entropy floor is expressed as `1 - 2*median_margin` so a
238 // query with larger-than-typical gaps can carry High even when the
239 // softmax tail damps entropy towards 1.
240 if top1_margin_pct >= 2.0 * median_margin
241 && norm_entropy < (1.0 - 2.0 * median_margin).clamp(0.0, 0.999)
242 {
243 return RankAgreement::High;
244 }
245
246 // Top-1 beats the within-query baseline: Medium.
247 if top1_margin_pct > median_margin {
248 return RankAgreement::Medium;
249 }
250
251 // Otherwise: flat / tie.
252 RankAgreement::Low
253}
254
255// ============================================================
256// Tests
257// ============================================================
258
259#[cfg(test)]
260mod tests {
261 use super::*;
262 use proptest::prelude::*;
263
264 #[test]
265 fn empty_scores_return_insufficient() {
266 assert_eq!(rank_agreement(&[]), RankAgreement::Insufficient);
267 assert!((normalized_entropy(&[]) - 0.0).abs() < 1e-6);
268 assert!((median_topk_margin_pct(&[], 5) - 0.0).abs() < 1e-6);
269 }
270
271 #[test]
272 fn single_score_returns_insufficient() {
273 assert_eq!(rank_agreement(&[0.9]), RankAgreement::Insufficient);
274 assert!((normalized_entropy(&[0.9]) - 0.0).abs() < 1e-6);
275 }
276
277 #[test]
278 fn four_scores_return_insufficient_k_label() {
279 // K < K_MIN_SHAPE_GATE (5)
280 let scores = [0.9, 0.3, 0.1, 0.05];
281 assert_eq!(rank_agreement(&scores), RankAgreement::Insufficient);
282 assert_eq!(
283 RankAgreement::Insufficient.as_fine_label(),
284 "insufficient_k"
285 );
286 }
287
288 #[test]
289 fn confident_peaked_distribution_yields_high() {
290 // Top-1 dominates: large margin, low entropy.
291 let scores = [0.95, 0.30, 0.10, 0.05, 0.02];
292 let label = rank_agreement(&scores);
293 assert!(
294 matches!(label, RankAgreement::High | RankAgreement::Medium),
295 "expected peaked top-1 to be High or Medium, got {label:?}"
296 );
297 }
298
299 #[test]
300 fn confident_triggers_high_on_sharp_peak() {
301 // Deliberately constructed so m1 >> 2*mu and entropy is low.
302 let scores = [0.99, 0.20, 0.18, 0.16, 0.15, 0.14];
303 assert_eq!(rank_agreement(&scores), RankAgreement::High);
304 }
305
306 #[test]
307 fn tie_near_uniform_yields_low() {
308 let scores = [0.80, 0.80, 0.80, 0.80, 0.79];
309 let label = rank_agreement(&scores);
310 assert_eq!(label, RankAgreement::Low);
311 }
312
313 #[test]
314 fn flat_distribution_yields_low() {
315 // All approximately equal: entropy near 1.0, no standout.
316 let scores = [0.50, 0.50, 0.50, 0.50, 0.50];
317 assert_eq!(rank_agreement(&scores), RankAgreement::Low);
318 }
319
320 #[test]
321 fn confidence_is_deterministic() {
322 let scores = [0.91, 0.60, 0.42, 0.30, 0.15, 0.05];
323 let a = (
324 rank_agreement(&scores),
325 normalized_entropy(&scores),
326 median_topk_margin_pct(&scores, 5),
327 );
328 let b = (
329 rank_agreement(&scores),
330 normalized_entropy(&scores),
331 median_topk_margin_pct(&scores, 5),
332 );
333 assert_eq!(a.0, b.0);
334 assert!((a.1 - b.1).abs() < 1e-7);
335 assert!((a.2 - b.2).abs() < 1e-7);
336 }
337
338 #[test]
339 fn margin_pct_scale_free_and_label_unchanged_under_rescale() {
340 // Scale-invariance property: multiplying every score by the
341 // same positive constant must not change the categorical label.
342 let scores = [0.91, 0.60, 0.42, 0.30, 0.15, 0.05];
343 let scaled: Vec<f32> = scores.iter().map(|s| s * 10.0).collect();
344 assert_eq!(rank_agreement(&scores), rank_agreement(&scaled));
345 let mu_a = median_topk_margin_pct(&scores, 5);
346 let mu_b = median_topk_margin_pct(&scaled, 5);
347 assert!(
348 (mu_a - mu_b).abs() < 1e-5,
349 "median margin pct should be scale-free: got {mu_a} vs {mu_b}"
350 );
351 }
352
353 #[test]
354 fn normalized_entropy_uniform_is_one() {
355 let scores = [0.5, 0.5, 0.5, 0.5, 0.5];
356 let h = normalized_entropy(&scores);
357 assert!((h - 1.0).abs() < 1e-5, "expected 1.0, got {h}");
358 }
359
360 #[test]
361 fn normalized_entropy_one_hot_is_low() {
362 let scores = [1.0, 0.0, 0.0, 0.0, 0.0];
363 let h = normalized_entropy(&scores);
364 assert!(
365 h < 1.0,
366 "one-hot distribution should have sub-uniform entropy, got {h}"
367 );
368 }
369
370 #[test]
371 fn nonfinite_inputs_do_not_panic() {
372 let scores = [f32::NAN, 0.8, 0.5, 0.2, 0.0];
373 let _ = rank_agreement(&scores);
374 let _ = normalized_entropy(&scores);
375 let _ = median_topk_margin_pct(&scores, 4);
376 }
377
378 // ============================================================
379 // Property tests (derived from code-sketch named tests)
380 // ============================================================
381
382 proptest! {
383 /// margin_pct_scale_free: scaling all scores by a positive
384 /// constant must leave the median_topk_margin_pct baseline
385 /// approximately unchanged (scale-free derivation). Labels
386 /// can flip at the Medium/Low boundary when the top-1 margin
387 /// equals the median baseline exactly; we assert the
388 /// numerical baseline instead, which is the load-bearing
389 /// invariant.
390 #[test]
391 fn proptest_margin_pct_scale_free(
392 seed in 1..1000u32,
393 factor in 1e-3f32..1e3f32,
394 ) {
395 let mut scores: Vec<f32> = Vec::with_capacity(8);
396 let mut x = f32::from(u16::try_from(seed % 1000).unwrap_or(1)) / 1000.0 + 0.1;
397 for _ in 0..8 {
398 scores.push(x);
399 x *= 0.7;
400 }
401 let scaled: Vec<f32> = scores.iter().map(|s| s * factor).collect();
402 let mu_a = median_topk_margin_pct(&scores, 7);
403 let mu_b = median_topk_margin_pct(&scaled, 7);
404 prop_assert!(
405 (mu_a - mu_b).abs() < 1e-3,
406 "median margin pct should be scale-free: {} vs {}",
407 mu_a, mu_b
408 );
409 let h_a = normalized_entropy(&scores);
410 let h_b = normalized_entropy(&scaled);
411 prop_assert!(
412 (h_a - h_b).abs() < 1e-3,
413 "normalized entropy should be scale-free: {} vs {}",
414 h_a, h_b
415 );
416 }
417
418 /// normalized_entropy_in_unit_interval: the metric is always
419 /// in `[0, 1]` for any finite input.
420 #[test]
421 fn proptest_normalized_entropy_bounded(len in 2..32usize, seed in 1..1000u32) {
422 let mut scores: Vec<f32> = Vec::with_capacity(len);
423 let mut x = f32::from(u16::try_from(seed % 1000).unwrap_or(1)) / 1000.0 + 0.1;
424 for i in 0..len {
425 #[allow(clippy::cast_precision_loss)]
426 scores.push(x + (i as f32) * 0.01);
427 x *= 0.9;
428 }
429 let h = normalized_entropy(&scores);
430 prop_assert!((0.0..=1.0).contains(&h), "entropy out of range: {}", h);
431 }
432
433 /// rank_agreement_labels_mutually_exclusive: the function
434 /// returns exactly one variant; the set of possible variants
435 /// is stable across runs.
436 #[test]
437 fn proptest_rank_agreement_total(len in 0..32usize, seed in 1..1000u32) {
438 let mut scores: Vec<f32> = Vec::with_capacity(len);
439 let mut x = f32::from(u16::try_from(seed % 1000).unwrap_or(1)) / 1000.0 + 0.1;
440 for _ in 0..len {
441 scores.push(x);
442 x *= 0.85;
443 }
444 // Must terminate and return a variant; checked by pattern.
445 let label = rank_agreement(&scores);
446 prop_assert!(matches!(
447 label,
448 RankAgreement::High
449 | RankAgreement::Medium
450 | RankAgreement::Low
451 | RankAgreement::Insufficient
452 ));
453 }
454
455 /// insufficient_k_band_labeled: for `K < K_MIN_SHAPE_GATE`
456 /// the label is unconditionally `Insufficient`.
457 #[test]
458 fn proptest_insufficient_k_band(len in 0..K_MIN_SHAPE_GATE, seed in 1..1000u32) {
459 let mut scores: Vec<f32> = Vec::with_capacity(len);
460 let mut x = f32::from(u16::try_from(seed % 1000).unwrap_or(1)) / 1000.0 + 0.1;
461 for _ in 0..len {
462 scores.push(x);
463 x *= 0.8;
464 }
465 prop_assert_eq!(rank_agreement(&scores), RankAgreement::Insufficient);
466 }
467
468 /// median_is_within_query_baseline: the median margin pct is
469 /// always in `[0, 1]` and is 0 iff all scores are equal.
470 #[test]
471 fn proptest_median_bounded(len in 2..32usize, seed in 1..1000u32) {
472 let mut scores: Vec<f32> = Vec::with_capacity(len);
473 let mut x = f32::from(u16::try_from(seed % 1000).unwrap_or(1)) / 1000.0 + 0.1;
474 for _ in 0..len {
475 scores.push(x);
476 x *= 0.9;
477 }
478 let mu = median_topk_margin_pct(&scores, len - 1);
479 prop_assert!((0.0..=1.0).contains(&mu), "median out of range: {}", mu);
480 }
481 }
482}