mnem_graphrag/
confidence.rs

1//! Within-query confidence signals over retrieval score distributions
2//! (Gap 05, LD-primary).
3//!
4//! All signals are pure functions of the returned top-K scores. No
5//! globals, no rolling state, no I/O. Given the same input, two
6//! independent calls produce byte-identical output.
7//!
8//! # Design
9//!
10//! Given a descending-sorted score vector `[s1, s2, ..., sK]`:
11//!
12//! - [`normalized_entropy`]: Shannon entropy of a softmax over the
13//!   min-max-rescaled scores, divided by `ln(K)`. Dimensionless,
14//!   living in `[0, 1]`. `0` = one score dominates; `1` = all equal.
15//! - [`median_topk_margin_pct`]: median of the per-adjacent-pair
16//!   relative gaps `(s_i - s_{i+1}) / max(s_i, eps)` over the top-K.
17//!   The *within-query baseline*: is the top-1 lead unusual relative
18//!   to the rest of this ranking?
19//! - [`rank_agreement`]: categorical label (`High` / `Medium` / `Low`)
20//!   derived from the two metrics above. Auto-adapts per-query; no
21//!   hardcoded global threshold.
22//!
23//! # Why within-query
24//!
25//! Cross-query calibration (Gap 01) needs rolling-median state. This
26//! gap is *strictly within-query* and needs none of that plumbing.
27//! Consumers that want the hop-suggestion signal (Gap 01) combine
28//! [`RankAgreement`] with their rolling median externally.
29
30use serde::{Deserialize, Serialize};
31
32/// Minimum K for a statistically meaningful `rank_agreement` bucket.
33///
34/// At `K < 5` the within-query median of `K-1` adjacent margins is
35/// under-powered (Wilson 95% CI width > 0.45 at p=0.5). Callers with
36/// fewer items should treat the signal as insufficient.
37pub const K_MIN_SHAPE_GATE: usize = 5;
38
39/// Tiny epsilon used to avoid division by zero when the top score or a
40/// per-pair denominator is itself near zero.
41const EPS: f32 = 1e-9;
42
43/// Categorical confidence label for a returned top-K ranking.
44///
45/// Derived from the softmax-entropy and the within-query median
46/// adjacent margin. The mapping is intentionally coarse: the three-
47/// bucket view is what downstream consumers (Gap 01 `suggest_hop`, UI
48/// badges) actually use. The richer four-label string form from the
49/// solution design (confident / likely / tie / flat) is preserved in
50/// [`RankAgreement::as_fine_label`] for telemetry.
51#[derive(Serialize, Deserialize, Clone, Copy, Debug, PartialEq, Eq, Hash)]
52#[serde(rename_all = "lowercase")]
53pub enum RankAgreement {
54    /// The top-1 lead is unusually large relative to the rest of the
55    /// ranking *and* entropy is low. Downstream: do not suggest a hop.
56    /// Fine label: `confident`.
57    High,
58    /// The top-1 lead beats the within-query median margin but does
59    /// not meet the stricter entropy floor. Fine label: `likely`.
60    Medium,
61    /// Top-K is near-tied or flat. Downstream: Gap 01 `suggest_hop`
62    /// should fire (subject to its graph-size gate). Fine labels:
63    /// `tie` (top is near-tied with runners-up) and `flat` (entire
64    /// distribution is near-uniform).
65    Low,
66    /// `K < K_MIN_SHAPE_GATE`: sample too small to label meaningfully.
67    /// Consumers should treat this as "no signal".
68    Insufficient,
69}
70
71impl RankAgreement {
72    /// Stable fine-grained label for telemetry (`confident` / `likely`
73    /// / `tie` / `flat` / `insufficient_k`). The coarse three-bucket
74    /// [`RankAgreement`] variant is what programmatic consumers should
75    /// match on; this string is for dashboards and logs.
76    #[must_use]
77    pub const fn as_fine_label(self) -> &'static str {
78        match self {
79            Self::High => "confident",
80            Self::Medium => "likely",
81            Self::Low => "flat",
82            Self::Insufficient => "insufficient_k",
83        }
84    }
85}
86
87/// Compute the normalized Shannon entropy of a top-K score vector.
88///
89/// Scores are first shifted to be non-negative (by subtracting the
90/// minimum, which is scale-invariant under affine score transforms
91/// that preserve ordering) and then L1-normalized into a probability
92/// distribution. Shannon entropy of that distribution divided by
93/// `ln(K)` yields a dimensionless value in `[0, 1]`:
94///
95/// - `0.0`: a single score dominates (one-hot distribution).
96/// - `1.0`: all scores are equal (uniform distribution).
97///
98/// We do **not** use a softmax: for score ranges that are small in
99/// absolute terms, `exp` damps differences and the distribution
100/// collapses towards uniform even when one score is clearly peaked.
101/// L1-normalization preserves the relative magnitude structure that
102/// consumers of retrieval scores actually care about.
103///
104/// Returns `0.0` for `scores.len() < 2` (no meaningful distribution).
105/// Non-finite inputs (`NaN`, `+inf`, `-inf`) are treated as the score
106/// minimum so they do not poison the distribution.
107#[must_use]
108pub fn normalized_entropy(scores: &[f32]) -> f32 {
109    let k = scores.len();
110    if k < 2 {
111        return 0.0;
112    }
113
114    // Sanitize: replace non-finite entries with the finite minimum.
115    let finite_min = scores
116        .iter()
117        .copied()
118        .filter(|s| s.is_finite())
119        .fold(f32::INFINITY, f32::min);
120    let finite_min = if finite_min.is_finite() {
121        finite_min
122    } else {
123        0.0
124    };
125
126    let sanitized: Vec<f32> = scores
127        .iter()
128        .map(|&s| if s.is_finite() { s } else { finite_min })
129        .collect();
130
131    // Shift so min -> 0. Sum is scale-free in the sense that scaling
132    // every score by a positive constant produces proportional shifts.
133    let min = sanitized.iter().copied().fold(f32::INFINITY, f32::min);
134    let shifted: Vec<f32> = sanitized.iter().map(|&s| (s - min).max(0.0)).collect();
135    let sum: f32 = shifted.iter().sum();
136
137    // All scores equal => uniform distribution => entropy 1.0.
138    if sum <= EPS {
139        return 1.0;
140    }
141
142    let mut entropy = 0.0_f32;
143    for &x in &shifted {
144        let p = x / sum;
145        if p > EPS {
146            entropy -= p * p.ln();
147        }
148    }
149    let denom = (k as f32).ln().max(EPS);
150    (entropy / denom).clamp(0.0, 1.0)
151}
152
153/// Median of the per-adjacent-pair relative gaps over a top-K vector.
154///
155/// For an input `[s1, s2, ..., sK]` (assumed descending), returns
156/// `median_i { (s_i - s_{i+1}) / max(s_i, eps) }` over the `K-1`
157/// adjacent pairs, optionally trimmed to the first `k` pairs if `k`
158/// is smaller than `K-1`. When `k == 0` or the vector has fewer than
159/// 2 elements, returns `0.0`.
160///
161/// This is the *within-query baseline*: "how wide is a typical gap in
162/// this specific ranking?" [`rank_agreement`] uses it as the anchor
163/// against which the top-1 margin is compared.
164#[must_use]
165pub fn median_topk_margin_pct(scores: &[f32], k: usize) -> f32 {
166    if scores.len() < 2 || k == 0 {
167        return 0.0;
168    }
169    let pair_limit = (scores.len() - 1).min(k);
170    let mut pair_pcts: Vec<f32> = (0..pair_limit)
171        .map(|i| {
172            let num = scores[i] - scores[i + 1];
173            let denom = scores[i].abs().max(EPS);
174            (num / denom).max(0.0)
175        })
176        .collect();
177    if pair_pcts.is_empty() {
178        return 0.0;
179    }
180    // Sort with a total order: NaN-safe via partial_cmp fallback.
181    pair_pcts.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
182    let mid = pair_pcts.len() / 2;
183    if pair_pcts.len() % 2 == 0 {
184        0.5 * (pair_pcts[mid - 1] + pair_pcts[mid])
185    } else {
186        pair_pcts[mid]
187    }
188}
189
190/// Categorical confidence label for a top-K score vector.
191///
192/// See [`RankAgreement`] for the bucket semantics. The derivation is
193/// a pure function of the score distribution; no global thresholds.
194///
195/// # Algorithm
196///
197/// Let `m1 = (s1 - s2) / max(s1, eps)` be the top-1 relative margin,
198/// `mu = median_topk_margin_pct(scores, scores.len() - 1)` be the
199/// within-query baseline, and `h = normalized_entropy(scores)`.
200///
201/// - `High` when `m1 >= 2 * mu` and `h < 1 - mu` (top-1 is decisively
202///   ahead of the typical gap *and* the distribution is peaked).
203/// - `Medium` when `m1 > mu` (top-1 beats baseline but entropy is
204///   not tight enough for `High`).
205/// - `Low` when the ranking is near-tied (`m1 < mu / 4` with a
206///   still-positive top score) or generally flat.
207/// - `Insufficient` when `scores.len() < K_MIN_SHAPE_GATE` (5).
208#[must_use]
209pub fn rank_agreement(scores: &[f32]) -> RankAgreement {
210    if scores.len() < K_MIN_SHAPE_GATE {
211        return RankAgreement::Insufficient;
212    }
213
214    let s1 = scores[0];
215    let s2 = scores[1];
216    let s_last = *scores.last().expect("len >= K_MIN_SHAPE_GATE >= 5");
217
218    let top1_margin_pct = if s1.abs() > EPS {
219        ((s1 - s2) / s1).max(0.0)
220    } else {
221        0.0
222    };
223
224    let median_margin = median_topk_margin_pct(scores, scores.len() - 1);
225    let norm_entropy = normalized_entropy(scores);
226
227    // Degenerate flat: every score equal (or near-equal). No useful
228    // signal; we are in the Low bucket regardless of top1_margin_pct.
229    // Note: s_last is read to keep the name live and document the
230    // shape of the flat branch (s1 == s_last iff uniform).
231    let _ = s_last;
232    if median_margin < EPS {
233        return RankAgreement::Low;
234    }
235
236    // Tight: top-1 decisively ahead *and* distribution is non-uniform.
237    // The entropy floor is expressed as `1 - 2*median_margin` so a
238    // query with larger-than-typical gaps can carry High even when the
239    // softmax tail damps entropy towards 1.
240    if top1_margin_pct >= 2.0 * median_margin
241        && norm_entropy < (1.0 - 2.0 * median_margin).clamp(0.0, 0.999)
242    {
243        return RankAgreement::High;
244    }
245
246    // Top-1 beats the within-query baseline: Medium.
247    if top1_margin_pct > median_margin {
248        return RankAgreement::Medium;
249    }
250
251    // Otherwise: flat / tie.
252    RankAgreement::Low
253}
254
255// ============================================================
256// Tests
257// ============================================================
258
259#[cfg(test)]
260mod tests {
261    use super::*;
262    use proptest::prelude::*;
263
264    #[test]
265    fn empty_scores_return_insufficient() {
266        assert_eq!(rank_agreement(&[]), RankAgreement::Insufficient);
267        assert!((normalized_entropy(&[]) - 0.0).abs() < 1e-6);
268        assert!((median_topk_margin_pct(&[], 5) - 0.0).abs() < 1e-6);
269    }
270
271    #[test]
272    fn single_score_returns_insufficient() {
273        assert_eq!(rank_agreement(&[0.9]), RankAgreement::Insufficient);
274        assert!((normalized_entropy(&[0.9]) - 0.0).abs() < 1e-6);
275    }
276
277    #[test]
278    fn four_scores_return_insufficient_k_label() {
279        // K < K_MIN_SHAPE_GATE (5)
280        let scores = [0.9, 0.3, 0.1, 0.05];
281        assert_eq!(rank_agreement(&scores), RankAgreement::Insufficient);
282        assert_eq!(
283            RankAgreement::Insufficient.as_fine_label(),
284            "insufficient_k"
285        );
286    }
287
288    #[test]
289    fn confident_peaked_distribution_yields_high() {
290        // Top-1 dominates: large margin, low entropy.
291        let scores = [0.95, 0.30, 0.10, 0.05, 0.02];
292        let label = rank_agreement(&scores);
293        assert!(
294            matches!(label, RankAgreement::High | RankAgreement::Medium),
295            "expected peaked top-1 to be High or Medium, got {label:?}"
296        );
297    }
298
299    #[test]
300    fn confident_triggers_high_on_sharp_peak() {
301        // Deliberately constructed so m1 >> 2*mu and entropy is low.
302        let scores = [0.99, 0.20, 0.18, 0.16, 0.15, 0.14];
303        assert_eq!(rank_agreement(&scores), RankAgreement::High);
304    }
305
306    #[test]
307    fn tie_near_uniform_yields_low() {
308        let scores = [0.80, 0.80, 0.80, 0.80, 0.79];
309        let label = rank_agreement(&scores);
310        assert_eq!(label, RankAgreement::Low);
311    }
312
313    #[test]
314    fn flat_distribution_yields_low() {
315        // All approximately equal: entropy near 1.0, no standout.
316        let scores = [0.50, 0.50, 0.50, 0.50, 0.50];
317        assert_eq!(rank_agreement(&scores), RankAgreement::Low);
318    }
319
320    #[test]
321    fn confidence_is_deterministic() {
322        let scores = [0.91, 0.60, 0.42, 0.30, 0.15, 0.05];
323        let a = (
324            rank_agreement(&scores),
325            normalized_entropy(&scores),
326            median_topk_margin_pct(&scores, 5),
327        );
328        let b = (
329            rank_agreement(&scores),
330            normalized_entropy(&scores),
331            median_topk_margin_pct(&scores, 5),
332        );
333        assert_eq!(a.0, b.0);
334        assert!((a.1 - b.1).abs() < 1e-7);
335        assert!((a.2 - b.2).abs() < 1e-7);
336    }
337
338    #[test]
339    fn margin_pct_scale_free_and_label_unchanged_under_rescale() {
340        // Scale-invariance property: multiplying every score by the
341        // same positive constant must not change the categorical label.
342        let scores = [0.91, 0.60, 0.42, 0.30, 0.15, 0.05];
343        let scaled: Vec<f32> = scores.iter().map(|s| s * 10.0).collect();
344        assert_eq!(rank_agreement(&scores), rank_agreement(&scaled));
345        let mu_a = median_topk_margin_pct(&scores, 5);
346        let mu_b = median_topk_margin_pct(&scaled, 5);
347        assert!(
348            (mu_a - mu_b).abs() < 1e-5,
349            "median margin pct should be scale-free: got {mu_a} vs {mu_b}"
350        );
351    }
352
353    #[test]
354    fn normalized_entropy_uniform_is_one() {
355        let scores = [0.5, 0.5, 0.5, 0.5, 0.5];
356        let h = normalized_entropy(&scores);
357        assert!((h - 1.0).abs() < 1e-5, "expected 1.0, got {h}");
358    }
359
360    #[test]
361    fn normalized_entropy_one_hot_is_low() {
362        let scores = [1.0, 0.0, 0.0, 0.0, 0.0];
363        let h = normalized_entropy(&scores);
364        assert!(
365            h < 1.0,
366            "one-hot distribution should have sub-uniform entropy, got {h}"
367        );
368    }
369
370    #[test]
371    fn nonfinite_inputs_do_not_panic() {
372        let scores = [f32::NAN, 0.8, 0.5, 0.2, 0.0];
373        let _ = rank_agreement(&scores);
374        let _ = normalized_entropy(&scores);
375        let _ = median_topk_margin_pct(&scores, 4);
376    }
377
378    // ============================================================
379    // Property tests (derived from code-sketch named tests)
380    // ============================================================
381
382    proptest! {
383        /// margin_pct_scale_free: scaling all scores by a positive
384        /// constant must leave the median_topk_margin_pct baseline
385        /// approximately unchanged (scale-free derivation). Labels
386        /// can flip at the Medium/Low boundary when the top-1 margin
387        /// equals the median baseline exactly; we assert the
388        /// numerical baseline instead, which is the load-bearing
389        /// invariant.
390        #[test]
391        fn proptest_margin_pct_scale_free(
392            seed in 1..1000u32,
393            factor in 1e-3f32..1e3f32,
394        ) {
395            let mut scores: Vec<f32> = Vec::with_capacity(8);
396            let mut x = f32::from(u16::try_from(seed % 1000).unwrap_or(1)) / 1000.0 + 0.1;
397            for _ in 0..8 {
398                scores.push(x);
399                x *= 0.7;
400            }
401            let scaled: Vec<f32> = scores.iter().map(|s| s * factor).collect();
402            let mu_a = median_topk_margin_pct(&scores, 7);
403            let mu_b = median_topk_margin_pct(&scaled, 7);
404            prop_assert!(
405                (mu_a - mu_b).abs() < 1e-3,
406                "median margin pct should be scale-free: {} vs {}",
407                mu_a, mu_b
408            );
409            let h_a = normalized_entropy(&scores);
410            let h_b = normalized_entropy(&scaled);
411            prop_assert!(
412                (h_a - h_b).abs() < 1e-3,
413                "normalized entropy should be scale-free: {} vs {}",
414                h_a, h_b
415            );
416        }
417
418        /// normalized_entropy_in_unit_interval: the metric is always
419        /// in `[0, 1]` for any finite input.
420        #[test]
421        fn proptest_normalized_entropy_bounded(len in 2..32usize, seed in 1..1000u32) {
422            let mut scores: Vec<f32> = Vec::with_capacity(len);
423            let mut x = f32::from(u16::try_from(seed % 1000).unwrap_or(1)) / 1000.0 + 0.1;
424            for i in 0..len {
425                #[allow(clippy::cast_precision_loss)]
426                scores.push(x + (i as f32) * 0.01);
427                x *= 0.9;
428            }
429            let h = normalized_entropy(&scores);
430            prop_assert!((0.0..=1.0).contains(&h), "entropy out of range: {}", h);
431        }
432
433        /// rank_agreement_labels_mutually_exclusive: the function
434        /// returns exactly one variant; the set of possible variants
435        /// is stable across runs.
436        #[test]
437        fn proptest_rank_agreement_total(len in 0..32usize, seed in 1..1000u32) {
438            let mut scores: Vec<f32> = Vec::with_capacity(len);
439            let mut x = f32::from(u16::try_from(seed % 1000).unwrap_or(1)) / 1000.0 + 0.1;
440            for _ in 0..len {
441                scores.push(x);
442                x *= 0.85;
443            }
444            // Must terminate and return a variant; checked by pattern.
445            let label = rank_agreement(&scores);
446            prop_assert!(matches!(
447                label,
448                RankAgreement::High
449                    | RankAgreement::Medium
450                    | RankAgreement::Low
451                    | RankAgreement::Insufficient
452            ));
453        }
454
455        /// insufficient_k_band_labeled: for `K < K_MIN_SHAPE_GATE`
456        /// the label is unconditionally `Insufficient`.
457        #[test]
458        fn proptest_insufficient_k_band(len in 0..K_MIN_SHAPE_GATE, seed in 1..1000u32) {
459            let mut scores: Vec<f32> = Vec::with_capacity(len);
460            let mut x = f32::from(u16::try_from(seed % 1000).unwrap_or(1)) / 1000.0 + 0.1;
461            for _ in 0..len {
462                scores.push(x);
463                x *= 0.8;
464            }
465            prop_assert_eq!(rank_agreement(&scores), RankAgreement::Insufficient);
466        }
467
468        /// median_is_within_query_baseline: the median margin pct is
469        /// always in `[0, 1]` and is 0 iff all scores are equal.
470        #[test]
471        fn proptest_median_bounded(len in 2..32usize, seed in 1..1000u32) {
472            let mut scores: Vec<f32> = Vec::with_capacity(len);
473            let mut x = f32::from(u16::try_from(seed % 1000).unwrap_or(1)) / 1000.0 + 0.1;
474            for _ in 0..len {
475                scores.push(x);
476                x *= 0.9;
477            }
478            let mu = median_topk_margin_pct(&scores, len - 1);
479            prop_assert!((0.0..=1.0).contains(&mu), "median out of range: {}", mu);
480        }
481    }
482}
mnem_graphrag/confidence.rs

mnem_graphrag/
confidence.rs