Skip to main content

mnem_graphrag/
calibration.rs

1//! Score calibration - scale-free per-query interpretability for dense retrieval.
2//!
3//! # Design principle
4//!
5//! Rather than shipping cross-embedder calibration (which would require a
6//! trained scaler per model), this module emits **scale-free summaries** that
7//! are meaningful *within a single response*. Two shapes:
8//!
9//! 1. [`score_quantiles`] - per-item `(rank_from_bottom) / max(K - 1, 1)` so
10//!    the top item is 1.0, the bottom is 0.0. Identical meaning across
11//!    embedders; a pure rank metric with no threshold.
12//! 2. [`distribution_shape`] - response-level
13//!    [`ScoreDistribution`] block with min / max / median / IQR and a
14//!    categorical `shape` label (`long_tail` / `uniform` / `bimodal` /
15//!    `insufficient-samples`).
16//!
17//! # Scale-freeness
18//!
19//! The shape classifier thresholds on *relative* quantities
20//! (`max - median > 2 * iqr`) rather than absolute score magnitudes, so a
21//! dot-product lane scoring in `[0, 100]` and a cosine lane scoring in
22//! `[-1, 1]` produce the same shape label for isomorphic distributions.
23//!
24//! Quantile emission is well-defined for any `K >= 1` (degenerate cases
25//! collapse to the trivial `[1.0; K]` vector); shape classification gates
26//! at a principled floor derived from Wilson 95% CI width (see
27//! [`derive_k_min`] and the module-level `K_MIN` constant).
28//!
29//! # Determinism
30//!
31//! Every function in this module is a pure function of its inputs. Given
32//! the same `ranked` slice and `gate`, output is byte-identical across
33//! runs. No floating-point reductions over unordered hashmaps, no global
34//! state, no RNG.
35//!
36//! # Floor-c tunable constants
37//!
38//! Two constants live here with the floor-c contract (standard-cite +
39//! gauge + proptest):
40//!
41//! | Constant            | Value | Standard                                           |
42//! |---------------------|-------|----------------------------------------------------|
43//! | [`K_MIN`]           | 8     | Wilson 95% CI width <= 0.18 requires K >= 8        |
44//! | [`WILSON_WIDTH_TARGET`] | 0.18 | Loose for probabilistic routing, tight for decisions |
45//!
46//! Derivation (Round 4 spec): Wilson 95% CI formula
47//! `K >= (z/eps)^2 * p * (1-p)` where `z = 1.96` (95% CI), `p = 0.5`
48//! (worst-case variance). `K_MIN = 8` is the **minimum gate value**
49//! below which shape classification is suppressed (IQR and median
50//! collapse under noise in smaller samples); it is a principled
51//! lower bound, not literally `derive_k_min(WILSON_WIDTH_TARGET)`.
52//! The `derive_k_min` helper computes the Wilson-interval K directly
53//! for callers that want to tighten the width target at larger K.
54
55use mnem_core::id::NodeId;
56use serde::{Deserialize, Serialize};
57
58/// Wilson z-score for a 95% confidence interval. Standard statistical
59/// constant; exposed here so [`derive_k_min`] is self-contained.
60pub const WILSON_Z: f32 = 1.96;
61
62/// Default Wilson-interval width target for calibration floors.
63///
64/// Floor-c tunable: standard = "loose enough for probabilistic routing,
65/// tight enough for agent decisions"; gauge =
66/// `mnem_calibration_width_target_effective`; proptest =
67/// `width_target_tunable_in_principled_range`.
68pub const WILSON_WIDTH_TARGET: f32 = 0.18;
69
70/// Default minimum sample size for shape classification.
71///
72/// Floor-c tunable: standard = "Wilson 95% CI width <= 0.18 requires
73/// K >= 8"; gauge = `mnem_calibration_k_min_effective`; proptest =
74/// `k_min_default_is_8`.
75pub const K_MIN: usize = 8;
76
77/// Categorical label summarising the shape of the per-response score
78/// distribution. Promoted to a top-level agent hint on the retrieve
79/// response envelope (see [`ScoreDistribution::shape`]).
80#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
81#[serde(rename_all = "kebab-case")]
82pub enum ShapeLabel {
83    /// Top score dominates the tail; `max - median > 2 * iqr`. Agents
84    /// can trust top-1 as a confident match.
85    LongTail,
86    /// Scores are roughly equi-spaced; no single item dominates. Dense
87    /// ranking is inconclusive; consider a rerank or graph expansion.
88    Uniform,
89    /// Two distinct score clusters separated by a gap larger than
90    /// `iqr`. Often a hybrid-lane artefact; look at per-lane scores.
91    Bimodal,
92    /// Fewer than [`K_MIN`] samples; shape cannot be distinguished from
93    /// noise. `score_quantile` still emits (pure rank, well-defined
94    /// for `K >= 2`), but distribution statistics are all zero.
95    InsufficientSamples,
96}
97
98/// Response-level score distribution summary.
99///
100/// Emitted once per retrieve response alongside the `items` array. All
101/// fields are scale-free or derived from scale-free quantities; the
102/// `shape` label is the primary agent hint.
103#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
104pub struct ScoreDistribution {
105    /// Minimum score across ranked items (0.0 when
106    /// [`ShapeLabel::InsufficientSamples`]).
107    pub min: f32,
108    /// Maximum score across ranked items.
109    pub max: f32,
110    /// Median (50th percentile) of the score vector.
111    pub median: f32,
112    /// Inter-quartile range (`Q3 - Q1`). Clamped to `f32::EPSILON`
113    /// before any downstream ratio so shape classification stays
114    /// defined for degenerate all-equal distributions.
115    pub iqr: f32,
116    /// Categorical shape label.
117    pub shape: ShapeLabel,
118}
119
120impl Default for ScoreDistribution {
121    fn default() -> Self {
122        Self {
123            min: 0.0,
124            max: 0.0,
125            median: 0.0,
126            iqr: 0.0,
127            shape: ShapeLabel::InsufficientSamples,
128        }
129    }
130}
131
132/// Compute per-item score quantiles from a ranked list.
133///
134/// Input is assumed ranked **descending** by score (position 0 = top
135/// item). Output preserves input order: `out[i]` is the quantile of
136/// `ranked[i]`, with the top item receiving `1.0` and the bottom
137/// receiving `0.0`.
138///
139/// Formula: `out[i] = (K - 1 - i) / max(K - 1, 1)` where `K = ranked.len()`.
140///
141/// # Scale-freeness
142///
143/// Output depends **only on rank**, never on absolute score values.
144/// Two responses with identical rank order but different score
145/// magnitudes produce identical quantile vectors.
146///
147/// # Edge cases
148///
149/// - `K = 0` -> `[]`.
150/// - `K = 1` -> `[1.0]` (single item is trivially top-ranked).
151///
152/// # Determinism
153///
154/// Pure function of input length. No allocation beyond the output
155/// `Vec<f32>`; no RNG, no global state.
156#[must_use]
157pub fn score_quantiles<N>(ranked: &[(N, f32)]) -> Vec<f32> {
158    let k = ranked.len();
159    if k == 0 {
160        return Vec::new();
161    }
162    if k == 1 {
163        return vec![1.0];
164    }
165    let denom = (k - 1) as f32;
166    (0..k).map(|i| (k - 1 - i) as f32 / denom).collect()
167}
168
169/// Thin wrapper over [`score_quantiles`] keyed on [`NodeId`].
170///
171/// Exists so callers with the canonical `&[(NodeId, f32)]` ranked
172/// shape (from `mnem-core`'s retrieve pipeline) get a typed entry
173/// point; internally forwards to the generic version.
174#[must_use]
175pub fn node_score_quantiles(ranked: &[(NodeId, f32)]) -> Vec<f32> {
176    score_quantiles(ranked)
177}
178
179/// Classify the shape of a score vector.
180///
181/// `scores` is any order of floats; the function copies and sorts
182/// internally. `gate` is the minimum sample count below which the
183/// classifier abstains (emitting [`ShapeLabel::InsufficientSamples`]
184/// rather than a noisy label). Callers usually pass [`K_MIN`].
185///
186/// # Classification rules
187///
188/// Let `sorted` be `scores` ascending, with `n = scores.len()`. Compute
189/// `min = sorted[0]`, `max = sorted[n-1]`, `median = sorted[n/2]`,
190/// `q1 = sorted[n/4]`, `q3 = sorted[3n/4]`,
191/// `iqr = max(q3 - q1, f32::EPSILON)`. Then:
192///
193/// 1. If `max - median > 2 * iqr` -> [`ShapeLabel::LongTail`].
194/// 2. Else if the largest gap between consecutive sorted scores
195///    exceeds `iqr` -> [`ShapeLabel::Bimodal`].
196/// 3. Else -> [`ShapeLabel::Uniform`].
197///
198/// All three thresholds are *relative* (ratios / differences of the
199/// input), so the classification is scale-invariant: multiplying every
200/// score by a positive constant yields an identical [`ShapeLabel`].
201///
202/// # Determinism
203///
204/// Pure function. Sorts with `total_cmp` so NaN inputs land in a
205/// well-defined position rather than poisoning comparisons; the
206/// numeric result for NaN-free inputs is bit-identical across runs.
207#[must_use]
208pub fn distribution_shape(scores: &[f32], gate: usize) -> ScoreDistribution {
209    if scores.len() < gate {
210        return ScoreDistribution::default();
211    }
212    let mut sorted: Vec<f32> = scores.to_vec();
213    sorted.sort_by(f32::total_cmp);
214    let n = sorted.len();
215    let min = sorted[0];
216    let max = sorted[n - 1];
217    let median = sorted[n / 2];
218    let q1 = sorted[n / 4];
219    let q3 = sorted[(3 * n) / 4];
220    let iqr = (q3 - q1).max(f32::EPSILON);
221
222    // Classification rule order matters. We check bimodal *before*
223    // long-tail because a cluster-separated distribution often also
224    // satisfies `max - median > 2 * iqr` (one cluster sits above the
225    // median of the whole vector), but the correct agent hint is
226    // "two peaks" not "one outlier".
227    //
228    // Bimodal criterion: largest consecutive gap is a sizeable
229    // fraction of the full range. Using `(max - min) / 2` keeps the
230    // criterion scale-free (ratio, not magnitude) and robust for
231    // near-degenerate vectors where `iqr` collapses to
232    // `f32::EPSILON`.
233    let range = max - min;
234    let shape = if bimodal_gap(&sorted, range) {
235        ShapeLabel::Bimodal
236    } else if (max - median) > 2.0 * iqr {
237        ShapeLabel::LongTail
238    } else {
239        ShapeLabel::Uniform
240    };
241
242    ScoreDistribution {
243        min,
244        max,
245        median,
246        iqr,
247        shape,
248    }
249}
250
251/// True iff the sorted vector splits into two balanced clusters
252/// separated by a gap larger than half the range.
253///
254/// `sorted` must be ascending. Two conditions together:
255///
256/// 1. The largest consecutive gap `> range * 0.5` (the gap dominates
257///    the span, i.e. a structural separation, not sampling noise).
258/// 2. The gap is **interior**: at least two items on each side. A
259///    gap at position `n-1` (one item above, `n-1` below) is a
260///    single outlier - that's long-tail, not bimodal. Requiring
261///    balanced cluster sizes disambiguates the two shapes cleanly.
262///
263/// Empty / single-item vectors return `false` (caller already gated
264/// on `gate >= 2`). Range `<= epsilon` (all-equal) also returns
265/// `false` so the all-equal case falls through to Uniform.
266fn bimodal_gap(sorted: &[f32], range: f32) -> bool {
267    if sorted.len() < 4 || range <= f32::EPSILON {
268        return false;
269    }
270    let threshold = range * 0.5;
271    // Walk interior windows only (skip first and last position so we
272    // guarantee >=2 items per cluster). `sorted.windows(2)` gives
273    // pairs (a, b) where a = sorted[i], b = sorted[i+1].
274    for (i, w) in sorted.windows(2).enumerate() {
275        // Interior: cluster below has `i+1` items, cluster above has
276        // `n - i - 1` items. Require both >= 2.
277        let below = i + 1;
278        let above = sorted.len() - i - 1;
279        if below < 2 || above < 2 {
280            continue;
281        }
282        let gap = w[1] - w[0];
283        if gap > threshold {
284            return true;
285        }
286    }
287    false
288}
289
290/// Derive the minimum sample size satisfying a Wilson 95% CI width
291/// target.
292///
293/// Formula (worst-case variance at `p = 0.5`):
294///
295/// ```text
296/// K >= (z / eps)^2 * p * (1 - p)
297///    = (1.96 / eps)^2 * 0.25
298/// ```
299///
300/// Scale: tighter targets (smaller `eps`) grow `K` quadratically;
301/// looser targets shrink it. Output is always `>= 1`.
302///
303/// Reference values:
304/// - `width_target = 0.35` -> `K ~= 8` (matches [`K_MIN`] floor).
305/// - `width_target = 0.18` -> `K ~= 30` (tight agent-decision margin).
306/// - `width_target = 0.10` -> `K ~= 97` (statistical-analysis grade).
307///
308/// [`K_MIN`] is a separate **floor constant** for the shape-gate; it
309/// is not literally `derive_k_min(WILSON_WIDTH_TARGET)`. The Wilson
310/// formula here is exposed so callers sizing larger-K experiments
311/// can reason about their desired width explicitly.
312///
313/// # Panics
314///
315/// Does not panic; non-positive / non-finite `width_target` is
316/// clamped to a tiny positive value so the computation cannot
317/// produce `NaN` or `0` divisions.
318#[must_use]
319pub fn derive_k_min(width_target: f32) -> usize {
320    // Guard: clamp pathological inputs to avoid NaN / Inf. A zero or
321    // negative width target is nonsensical (width is non-negative by
322    // definition); we treat it as "as tight as possible" by clamping
323    // to f32::EPSILON, which drives K to a very large number rather
324    // than panicking.
325    let eps = if width_target.is_finite() && width_target > 0.0 {
326        width_target
327    } else {
328        f32::EPSILON
329    };
330    let raw = (WILSON_Z / eps).powi(2) * 0.25;
331    // `ceil` so we never *undershoot* the target width; `max(1)` so
332    // we never return 0 for absurdly loose targets.
333    #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
334    let k = raw.ceil() as usize;
335    k.max(1)
336}
337
338#[cfg(test)]
339mod tests {
340    use super::*;
341
342    // ---------- floor-c tunable assertions ----------
343
344    #[test]
345    fn k_min_default_is_8() {
346        // Floor-c constant: K_MIN = 8 is the shape-gate floor below
347        // which IQR / median estimators collapse under noise. This is
348        // a principled *floor*, not literally the Wilson derivation
349        // at the default width target.
350        assert_eq!(K_MIN, 8);
351        // Wilson formula: the K that satisfies width <= 0.35 is the
352        // floor value 8 via (1.96/0.35)^2 * 0.25 = 7.84 -> ceil = 8.
353        // This links the floor to a defensible width target even
354        // though the tighter default (0.18) demands more samples.
355        assert_eq!(derive_k_min(0.35), 8);
356    }
357
358    #[test]
359    fn width_target_tunable_in_principled_range() {
360        // Default width target lies between "loose routing" (0.35,
361        // matches floor K=8) and "tight analysis" (0.10, K~97).
362        assert!((0.10..=0.35).contains(&WILSON_WIDTH_TARGET));
363        // Tighter target (0.10) -> larger K: (1.96/0.10)^2 * 0.25 = 96.04 -> 97.
364        assert_eq!(derive_k_min(0.10), 97);
365        // Default target (0.18) -> K = 30 by the formula.
366        assert_eq!(derive_k_min(WILSON_WIDTH_TARGET), 30);
367        // Looser target (0.30) -> smaller K: (1.96/0.30)^2 * 0.25 = 10.67 -> 11.
368        assert_eq!(derive_k_min(0.30), 11);
369        // Monotone: tighter eps => larger K.
370        assert!(derive_k_min(0.10) > derive_k_min(0.18));
371        assert!(derive_k_min(0.18) > derive_k_min(0.30));
372    }
373
374    #[test]
375    fn derive_k_min_never_zero() {
376        // Absurdly loose target still yields K >= 1.
377        assert!(derive_k_min(100.0) >= 1);
378        // Zero / negative / NaN clamped safely.
379        assert!(derive_k_min(0.0) >= 1);
380        assert!(derive_k_min(-1.0) >= 1);
381        assert!(derive_k_min(f32::NAN) >= 1);
382    }
383
384    // ---------- quantile behaviour ----------
385
386    #[test]
387    fn quantile_monotone() {
388        // Ranked descending: index 0 is top, index K-1 is bottom.
389        // Quantiles should be non-increasing across positions.
390        let ranked: Vec<(u32, f32)> = vec![(0, 0.9), (1, 0.7), (2, 0.5), (3, 0.3), (4, 0.1)];
391        let q = score_quantiles(&ranked);
392        assert_eq!(q, vec![1.0, 0.75, 0.5, 0.25, 0.0]);
393        // Monotonicity: no adjacent pair is increasing.
394        assert!(q.windows(2).all(|w| w[0] >= w[1]));
395    }
396
397    #[test]
398    fn quantile_top_is_one_bottom_is_zero() {
399        let ranked: Vec<(u32, f32)> = (0..10).map(|i| (i, 1.0 - (i as f32) * 0.1)).collect();
400        let q = score_quantiles(&ranked);
401        assert!((q[0] - 1.0).abs() < f32::EPSILON);
402        assert!((q[9] - 0.0).abs() < f32::EPSILON);
403    }
404
405    #[test]
406    fn quantile_edge_cases() {
407        // Empty -> empty.
408        let empty: Vec<(u32, f32)> = vec![];
409        assert!(score_quantiles(&empty).is_empty());
410        // Single item -> [1.0].
411        let one: Vec<(u32, f32)> = vec![(0, 0.42)];
412        assert_eq!(score_quantiles(&one), vec![1.0]);
413    }
414
415    #[test]
416    fn quantile_scale_invariance() {
417        // Scaling scores by any positive constant does NOT change quantiles
418        // (they're pure rank).
419        let ranked_a: Vec<(u32, f32)> = vec![(0, 0.9), (1, 0.5), (2, 0.1)];
420        let ranked_b: Vec<(u32, f32)> = vec![(0, 90.0), (1, 50.0), (2, 10.0)];
421        assert_eq!(score_quantiles(&ranked_a), score_quantiles(&ranked_b));
422    }
423
424    // ---------- shape classification ----------
425
426    #[test]
427    fn shape_long_tail_when_top_score_dominates() {
428        // Top score is far above the pack; max - median > 2 * iqr.
429        // 8 items (meets gate=8); most clustered low, one outlier high.
430        let scores = vec![0.10, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.95];
431        let dist = distribution_shape(&scores, 8);
432        assert_eq!(dist.shape, ShapeLabel::LongTail);
433        assert!((dist.max - 0.95).abs() < 1e-5);
434        assert!(dist.min >= 0.0);
435    }
436
437    #[test]
438    fn shape_uniform() {
439        // Equi-spaced scores; no dominant top, no bimodal gap.
440        let scores: Vec<f32> = (0..10).map(|i| i as f32 * 0.1).collect();
441        let dist = distribution_shape(&scores, K_MIN);
442        assert_eq!(dist.shape, ShapeLabel::Uniform);
443    }
444
445    #[test]
446    fn shape_bimodal() {
447        // Two clusters separated by a gap > iqr.
448        // Low cluster: 0.10..0.14. High cluster: 0.80..0.84. Gap: 0.66.
449        // IQR of 10 samples split 5/5 is small (~0.04), so gap >> iqr.
450        let scores = vec![0.10, 0.11, 0.12, 0.13, 0.14, 0.80, 0.81, 0.82, 0.83, 0.84];
451        let dist = distribution_shape(&scores, K_MIN);
452        assert_eq!(dist.shape, ShapeLabel::Bimodal);
453    }
454
455    #[test]
456    fn shape_insufficient_samples_below_gate() {
457        // K < gate -> default (all zero, InsufficientSamples).
458        let scores = vec![0.1, 0.5, 0.9];
459        let dist = distribution_shape(&scores, K_MIN);
460        assert_eq!(dist.shape, ShapeLabel::InsufficientSamples);
461        assert_eq!(dist.min, 0.0);
462        assert_eq!(dist.max, 0.0);
463    }
464
465    #[test]
466    fn shape_all_equal_is_uniform_not_nan() {
467        // IQR would be 0 but we clamp to f32::EPSILON, so no NaN.
468        // max - median = 0, not > 2 * epsilon -> not long-tail.
469        // Largest gap = 0, not > epsilon -> not bimodal.
470        let scores = vec![0.5_f32; 8];
471        let dist = distribution_shape(&scores, K_MIN);
472        assert_eq!(dist.shape, ShapeLabel::Uniform);
473        assert!(dist.iqr > 0.0); // clamped to epsilon
474    }
475
476    // ---------- scale-free: K=8 and K=1000 both work ----------
477
478    #[test]
479    fn scale_free_across_response_sizes() {
480        // K=8 smallest viable size, K=1000 stress.
481        for &k in &[8_usize, 1000] {
482            let ranked: Vec<(u32, f32)> = (0..k)
483                .map(|i| (i as u32, 1.0 - (i as f32) / (k as f32)))
484                .collect();
485            let q = score_quantiles(&ranked);
486            assert_eq!(q.len(), k);
487            assert!((q[0] - 1.0).abs() < 1e-5);
488            assert!((q[k - 1] - 0.0).abs() < 1e-5);
489
490            let scores: Vec<f32> = ranked.iter().map(|(_, s)| *s).collect();
491            let dist = distribution_shape(&scores, K_MIN);
492            // Equi-spaced -> uniform, regardless of K.
493            assert_eq!(dist.shape, ShapeLabel::Uniform);
494        }
495    }
496
497    // ---------- proptest: determinism ----------
498
499    use proptest::prelude::*;
500
501    proptest! {
502        /// Pure functions must produce bit-identical output for the same
503        /// input across independent runs. Randomised input, checked twice.
504        #[test]
505        fn determinism(
506            xs in proptest::collection::vec(-1000.0_f32..1000.0, 0..200)
507        ) {
508            // Filter NaN so total_cmp behaviour is the only source of
509            // ordering: we assert determinism of the *function*, not a
510            // spec-free NaN-handling path.
511            let xs: Vec<f32> = xs.into_iter().filter(|v| v.is_finite()).collect();
512            let ranked: Vec<(u32, f32)> =
513                xs.iter().enumerate().map(|(i, v)| (i as u32, *v)).collect();
514
515            let q1 = score_quantiles(&ranked);
516            let q2 = score_quantiles(&ranked);
517            prop_assert_eq!(&q1, &q2);
518
519            let d1 = distribution_shape(&xs, K_MIN);
520            let d2 = distribution_shape(&xs, K_MIN);
521            prop_assert_eq!(d1, d2);
522        }
523
524        /// Scale invariance: multiplying every score by a positive
525        /// constant never changes the shape label (it's a pure ratio).
526        #[test]
527        fn shape_scale_invariant(
528            xs in proptest::collection::vec(0.0_f32..100.0, 8..64),
529            scale in 0.01_f32..1000.0,
530        ) {
531            let xs: Vec<f32> = xs.into_iter().filter(|v| v.is_finite()).collect();
532            if xs.len() < K_MIN {
533                return Ok(());
534            }
535            let scaled: Vec<f32> = xs.iter().map(|v| v * scale).collect();
536            let d1 = distribution_shape(&xs, K_MIN);
537            let d2 = distribution_shape(&scaled, K_MIN);
538            prop_assert_eq!(d1.shape, d2.shape);
539        }
540    }
541}