mnem_graphrag/calibration.rs
1//! Score calibration - scale-free per-query interpretability for dense retrieval.
2//!
3//! # Design principle
4//!
5//! Rather than shipping cross-embedder calibration (which would require a
6//! trained scaler per model), this module emits **scale-free summaries** that
7//! are meaningful *within a single response*. Two shapes:
8//!
9//! 1. [`score_quantiles`] - per-item `(rank_from_bottom) / max(K - 1, 1)` so
10//! the top item is 1.0, the bottom is 0.0. Identical meaning across
11//! embedders; a pure rank metric with no threshold.
12//! 2. [`distribution_shape`] - response-level
13//! [`ScoreDistribution`] block with min / max / median / IQR and a
14//! categorical `shape` label (`long_tail` / `uniform` / `bimodal` /
15//! `insufficient-samples`).
16//!
17//! # Scale-freeness
18//!
19//! The shape classifier thresholds on *relative* quantities
20//! (`max - median > 2 * iqr`) rather than absolute score magnitudes, so a
21//! dot-product lane scoring in `[0, 100]` and a cosine lane scoring in
22//! `[-1, 1]` produce the same shape label for isomorphic distributions.
23//!
24//! Quantile emission is well-defined for any `K >= 1` (degenerate cases
25//! collapse to the trivial `[1.0; K]` vector); shape classification gates
26//! at a principled floor derived from Wilson 95% CI width (see
27//! [`derive_k_min`] and the module-level `K_MIN` constant).
28//!
29//! # Determinism
30//!
31//! Every function in this module is a pure function of its inputs. Given
32//! the same `ranked` slice and `gate`, output is byte-identical across
33//! runs. No floating-point reductions over unordered hashmaps, no global
34//! state, no RNG.
35//!
36//! # Floor-c tunable constants
37//!
38//! Two constants live here with the floor-c contract (standard-cite +
39//! gauge + proptest):
40//!
41//! | Constant | Value | Standard |
42//! |---------------------|-------|----------------------------------------------------|
43//! | [`K_MIN`] | 8 | Wilson 95% CI width <= 0.18 requires K >= 8 |
44//! | [`WILSON_WIDTH_TARGET`] | 0.18 | Loose for probabilistic routing, tight for decisions |
45//!
46//! Derivation (Round 4 spec): Wilson 95% CI formula
47//! `K >= (z/eps)^2 * p * (1-p)` where `z = 1.96` (95% CI), `p = 0.5`
48//! (worst-case variance). `K_MIN = 8` is the **minimum gate value**
49//! below which shape classification is suppressed (IQR and median
50//! collapse under noise in smaller samples); it is a principled
51//! lower bound, not literally `derive_k_min(WILSON_WIDTH_TARGET)`.
52//! The `derive_k_min` helper computes the Wilson-interval K directly
53//! for callers that want to tighten the width target at larger K.
54
55use mnem_core::id::NodeId;
56use serde::{Deserialize, Serialize};
57
58/// Wilson z-score for a 95% confidence interval. Standard statistical
59/// constant; exposed here so [`derive_k_min`] is self-contained.
60pub const WILSON_Z: f32 = 1.96;
61
62/// Default Wilson-interval width target for calibration floors.
63///
64/// Floor-c tunable: standard = "loose enough for probabilistic routing,
65/// tight enough for agent decisions"; gauge =
66/// `mnem_calibration_width_target_effective`; proptest =
67/// `width_target_tunable_in_principled_range`.
68pub const WILSON_WIDTH_TARGET: f32 = 0.18;
69
70/// Default minimum sample size for shape classification.
71///
72/// Floor-c tunable: standard = "Wilson 95% CI width <= 0.18 requires
73/// K >= 8"; gauge = `mnem_calibration_k_min_effective`; proptest =
74/// `k_min_default_is_8`.
75pub const K_MIN: usize = 8;
76
77/// Categorical label summarising the shape of the per-response score
78/// distribution. Promoted to a top-level agent hint on the retrieve
79/// response envelope (see [`ScoreDistribution::shape`]).
80#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
81#[serde(rename_all = "kebab-case")]
82pub enum ShapeLabel {
83 /// Top score dominates the tail; `max - median > 2 * iqr`. Agents
84 /// can trust top-1 as a confident match.
85 LongTail,
86 /// Scores are roughly equi-spaced; no single item dominates. Dense
87 /// ranking is inconclusive; consider a rerank or graph expansion.
88 Uniform,
89 /// Two distinct score clusters separated by a gap larger than
90 /// `iqr`. Often a hybrid-lane artefact; look at per-lane scores.
91 Bimodal,
92 /// Fewer than [`K_MIN`] samples; shape cannot be distinguished from
93 /// noise. `score_quantile` still emits (pure rank, well-defined
94 /// for `K >= 2`), but distribution statistics are all zero.
95 InsufficientSamples,
96}
97
98/// Response-level score distribution summary.
99///
100/// Emitted once per retrieve response alongside the `items` array. All
101/// fields are scale-free or derived from scale-free quantities; the
102/// `shape` label is the primary agent hint.
103#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
104pub struct ScoreDistribution {
105 /// Minimum score across ranked items (0.0 when
106 /// [`ShapeLabel::InsufficientSamples`]).
107 pub min: f32,
108 /// Maximum score across ranked items.
109 pub max: f32,
110 /// Median (50th percentile) of the score vector.
111 pub median: f32,
112 /// Inter-quartile range (`Q3 - Q1`). Clamped to `f32::EPSILON`
113 /// before any downstream ratio so shape classification stays
114 /// defined for degenerate all-equal distributions.
115 pub iqr: f32,
116 /// Categorical shape label.
117 pub shape: ShapeLabel,
118}
119
120impl Default for ScoreDistribution {
121 fn default() -> Self {
122 Self {
123 min: 0.0,
124 max: 0.0,
125 median: 0.0,
126 iqr: 0.0,
127 shape: ShapeLabel::InsufficientSamples,
128 }
129 }
130}
131
132/// Compute per-item score quantiles from a ranked list.
133///
134/// Input is assumed ranked **descending** by score (position 0 = top
135/// item). Output preserves input order: `out[i]` is the quantile of
136/// `ranked[i]`, with the top item receiving `1.0` and the bottom
137/// receiving `0.0`.
138///
139/// Formula: `out[i] = (K - 1 - i) / max(K - 1, 1)` where `K = ranked.len()`.
140///
141/// # Scale-freeness
142///
143/// Output depends **only on rank**, never on absolute score values.
144/// Two responses with identical rank order but different score
145/// magnitudes produce identical quantile vectors.
146///
147/// # Edge cases
148///
149/// - `K = 0` -> `[]`.
150/// - `K = 1` -> `[1.0]` (single item is trivially top-ranked).
151///
152/// # Determinism
153///
154/// Pure function of input length. No allocation beyond the output
155/// `Vec<f32>`; no RNG, no global state.
156#[must_use]
157pub fn score_quantiles<N>(ranked: &[(N, f32)]) -> Vec<f32> {
158 let k = ranked.len();
159 if k == 0 {
160 return Vec::new();
161 }
162 if k == 1 {
163 return vec![1.0];
164 }
165 let denom = (k - 1) as f32;
166 (0..k).map(|i| (k - 1 - i) as f32 / denom).collect()
167}
168
169/// Thin wrapper over [`score_quantiles`] keyed on [`NodeId`].
170///
171/// Exists so callers with the canonical `&[(NodeId, f32)]` ranked
172/// shape (from `mnem-core`'s retrieve pipeline) get a typed entry
173/// point; internally forwards to the generic version.
174#[must_use]
175pub fn node_score_quantiles(ranked: &[(NodeId, f32)]) -> Vec<f32> {
176 score_quantiles(ranked)
177}
178
179/// Classify the shape of a score vector.
180///
181/// `scores` is any order of floats; the function copies and sorts
182/// internally. `gate` is the minimum sample count below which the
183/// classifier abstains (emitting [`ShapeLabel::InsufficientSamples`]
184/// rather than a noisy label). Callers usually pass [`K_MIN`].
185///
186/// # Classification rules
187///
188/// Let `sorted` be `scores` ascending, with `n = scores.len()`. Compute
189/// `min = sorted[0]`, `max = sorted[n-1]`, `median = sorted[n/2]`,
190/// `q1 = sorted[n/4]`, `q3 = sorted[3n/4]`,
191/// `iqr = max(q3 - q1, f32::EPSILON)`. Then:
192///
193/// 1. If `max - median > 2 * iqr` -> [`ShapeLabel::LongTail`].
194/// 2. Else if the largest gap between consecutive sorted scores
195/// exceeds `iqr` -> [`ShapeLabel::Bimodal`].
196/// 3. Else -> [`ShapeLabel::Uniform`].
197///
198/// All three thresholds are *relative* (ratios / differences of the
199/// input), so the classification is scale-invariant: multiplying every
200/// score by a positive constant yields an identical [`ShapeLabel`].
201///
202/// # Determinism
203///
204/// Pure function. Sorts with `total_cmp` so NaN inputs land in a
205/// well-defined position rather than poisoning comparisons; the
206/// numeric result for NaN-free inputs is bit-identical across runs.
207#[must_use]
208pub fn distribution_shape(scores: &[f32], gate: usize) -> ScoreDistribution {
209 if scores.len() < gate {
210 return ScoreDistribution::default();
211 }
212 let mut sorted: Vec<f32> = scores.to_vec();
213 sorted.sort_by(f32::total_cmp);
214 let n = sorted.len();
215 let min = sorted[0];
216 let max = sorted[n - 1];
217 let median = sorted[n / 2];
218 let q1 = sorted[n / 4];
219 let q3 = sorted[(3 * n) / 4];
220 let iqr = (q3 - q1).max(f32::EPSILON);
221
222 // Classification rule order matters. We check bimodal *before*
223 // long-tail because a cluster-separated distribution often also
224 // satisfies `max - median > 2 * iqr` (one cluster sits above the
225 // median of the whole vector), but the correct agent hint is
226 // "two peaks" not "one outlier".
227 //
228 // Bimodal criterion: largest consecutive gap is a sizeable
229 // fraction of the full range. Using `(max - min) / 2` keeps the
230 // criterion scale-free (ratio, not magnitude) and robust for
231 // near-degenerate vectors where `iqr` collapses to
232 // `f32::EPSILON`.
233 let range = max - min;
234 let shape = if bimodal_gap(&sorted, range) {
235 ShapeLabel::Bimodal
236 } else if (max - median) > 2.0 * iqr {
237 ShapeLabel::LongTail
238 } else {
239 ShapeLabel::Uniform
240 };
241
242 ScoreDistribution {
243 min,
244 max,
245 median,
246 iqr,
247 shape,
248 }
249}
250
251/// True iff the sorted vector splits into two balanced clusters
252/// separated by a gap larger than half the range.
253///
254/// `sorted` must be ascending. Two conditions together:
255///
256/// 1. The largest consecutive gap `> range * 0.5` (the gap dominates
257/// the span, i.e. a structural separation, not sampling noise).
258/// 2. The gap is **interior**: at least two items on each side. A
259/// gap at position `n-1` (one item above, `n-1` below) is a
260/// single outlier - that's long-tail, not bimodal. Requiring
261/// balanced cluster sizes disambiguates the two shapes cleanly.
262///
263/// Empty / single-item vectors return `false` (caller already gated
264/// on `gate >= 2`). Range `<= epsilon` (all-equal) also returns
265/// `false` so the all-equal case falls through to Uniform.
266fn bimodal_gap(sorted: &[f32], range: f32) -> bool {
267 if sorted.len() < 4 || range <= f32::EPSILON {
268 return false;
269 }
270 let threshold = range * 0.5;
271 // Walk interior windows only (skip first and last position so we
272 // guarantee >=2 items per cluster). `sorted.windows(2)` gives
273 // pairs (a, b) where a = sorted[i], b = sorted[i+1].
274 for (i, w) in sorted.windows(2).enumerate() {
275 // Interior: cluster below has `i+1` items, cluster above has
276 // `n - i - 1` items. Require both >= 2.
277 let below = i + 1;
278 let above = sorted.len() - i - 1;
279 if below < 2 || above < 2 {
280 continue;
281 }
282 let gap = w[1] - w[0];
283 if gap > threshold {
284 return true;
285 }
286 }
287 false
288}
289
290/// Derive the minimum sample size satisfying a Wilson 95% CI width
291/// target.
292///
293/// Formula (worst-case variance at `p = 0.5`):
294///
295/// ```text
296/// K >= (z / eps)^2 * p * (1 - p)
297/// = (1.96 / eps)^2 * 0.25
298/// ```
299///
300/// Scale: tighter targets (smaller `eps`) grow `K` quadratically;
301/// looser targets shrink it. Output is always `>= 1`.
302///
303/// Reference values:
304/// - `width_target = 0.35` -> `K ~= 8` (matches [`K_MIN`] floor).
305/// - `width_target = 0.18` -> `K ~= 30` (tight agent-decision margin).
306/// - `width_target = 0.10` -> `K ~= 97` (statistical-analysis grade).
307///
308/// [`K_MIN`] is a separate **floor constant** for the shape-gate; it
309/// is not literally `derive_k_min(WILSON_WIDTH_TARGET)`. The Wilson
310/// formula here is exposed so callers sizing larger-K experiments
311/// can reason about their desired width explicitly.
312///
313/// # Panics
314///
315/// Does not panic; non-positive / non-finite `width_target` is
316/// clamped to a tiny positive value so the computation cannot
317/// produce `NaN` or `0` divisions.
318#[must_use]
319pub fn derive_k_min(width_target: f32) -> usize {
320 // Guard: clamp pathological inputs to avoid NaN / Inf. A zero or
321 // negative width target is nonsensical (width is non-negative by
322 // definition); we treat it as "as tight as possible" by clamping
323 // to f32::EPSILON, which drives K to a very large number rather
324 // than panicking.
325 let eps = if width_target.is_finite() && width_target > 0.0 {
326 width_target
327 } else {
328 f32::EPSILON
329 };
330 let raw = (WILSON_Z / eps).powi(2) * 0.25;
331 // `ceil` so we never *undershoot* the target width; `max(1)` so
332 // we never return 0 for absurdly loose targets.
333 #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
334 let k = raw.ceil() as usize;
335 k.max(1)
336}
337
338#[cfg(test)]
339mod tests {
340 use super::*;
341
342 // ---------- floor-c tunable assertions ----------
343
344 #[test]
345 fn k_min_default_is_8() {
346 // Floor-c constant: K_MIN = 8 is the shape-gate floor below
347 // which IQR / median estimators collapse under noise. This is
348 // a principled *floor*, not literally the Wilson derivation
349 // at the default width target.
350 assert_eq!(K_MIN, 8);
351 // Wilson formula: the K that satisfies width <= 0.35 is the
352 // floor value 8 via (1.96/0.35)^2 * 0.25 = 7.84 -> ceil = 8.
353 // This links the floor to a defensible width target even
354 // though the tighter default (0.18) demands more samples.
355 assert_eq!(derive_k_min(0.35), 8);
356 }
357
358 #[test]
359 fn width_target_tunable_in_principled_range() {
360 // Default width target lies between "loose routing" (0.35,
361 // matches floor K=8) and "tight analysis" (0.10, K~97).
362 assert!((0.10..=0.35).contains(&WILSON_WIDTH_TARGET));
363 // Tighter target (0.10) -> larger K: (1.96/0.10)^2 * 0.25 = 96.04 -> 97.
364 assert_eq!(derive_k_min(0.10), 97);
365 // Default target (0.18) -> K = 30 by the formula.
366 assert_eq!(derive_k_min(WILSON_WIDTH_TARGET), 30);
367 // Looser target (0.30) -> smaller K: (1.96/0.30)^2 * 0.25 = 10.67 -> 11.
368 assert_eq!(derive_k_min(0.30), 11);
369 // Monotone: tighter eps => larger K.
370 assert!(derive_k_min(0.10) > derive_k_min(0.18));
371 assert!(derive_k_min(0.18) > derive_k_min(0.30));
372 }
373
374 #[test]
375 fn derive_k_min_never_zero() {
376 // Absurdly loose target still yields K >= 1.
377 assert!(derive_k_min(100.0) >= 1);
378 // Zero / negative / NaN clamped safely.
379 assert!(derive_k_min(0.0) >= 1);
380 assert!(derive_k_min(-1.0) >= 1);
381 assert!(derive_k_min(f32::NAN) >= 1);
382 }
383
384 // ---------- quantile behaviour ----------
385
386 #[test]
387 fn quantile_monotone() {
388 // Ranked descending: index 0 is top, index K-1 is bottom.
389 // Quantiles should be non-increasing across positions.
390 let ranked: Vec<(u32, f32)> = vec![(0, 0.9), (1, 0.7), (2, 0.5), (3, 0.3), (4, 0.1)];
391 let q = score_quantiles(&ranked);
392 assert_eq!(q, vec![1.0, 0.75, 0.5, 0.25, 0.0]);
393 // Monotonicity: no adjacent pair is increasing.
394 assert!(q.windows(2).all(|w| w[0] >= w[1]));
395 }
396
397 #[test]
398 fn quantile_top_is_one_bottom_is_zero() {
399 let ranked: Vec<(u32, f32)> = (0..10).map(|i| (i, 1.0 - (i as f32) * 0.1)).collect();
400 let q = score_quantiles(&ranked);
401 assert!((q[0] - 1.0).abs() < f32::EPSILON);
402 assert!((q[9] - 0.0).abs() < f32::EPSILON);
403 }
404
405 #[test]
406 fn quantile_edge_cases() {
407 // Empty -> empty.
408 let empty: Vec<(u32, f32)> = vec![];
409 assert!(score_quantiles(&empty).is_empty());
410 // Single item -> [1.0].
411 let one: Vec<(u32, f32)> = vec![(0, 0.42)];
412 assert_eq!(score_quantiles(&one), vec![1.0]);
413 }
414
415 #[test]
416 fn quantile_scale_invariance() {
417 // Scaling scores by any positive constant does NOT change quantiles
418 // (they're pure rank).
419 let ranked_a: Vec<(u32, f32)> = vec![(0, 0.9), (1, 0.5), (2, 0.1)];
420 let ranked_b: Vec<(u32, f32)> = vec![(0, 90.0), (1, 50.0), (2, 10.0)];
421 assert_eq!(score_quantiles(&ranked_a), score_quantiles(&ranked_b));
422 }
423
424 // ---------- shape classification ----------
425
426 #[test]
427 fn shape_long_tail_when_top_score_dominates() {
428 // Top score is far above the pack; max - median > 2 * iqr.
429 // 8 items (meets gate=8); most clustered low, one outlier high.
430 let scores = vec![0.10, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.95];
431 let dist = distribution_shape(&scores, 8);
432 assert_eq!(dist.shape, ShapeLabel::LongTail);
433 assert!((dist.max - 0.95).abs() < 1e-5);
434 assert!(dist.min >= 0.0);
435 }
436
437 #[test]
438 fn shape_uniform() {
439 // Equi-spaced scores; no dominant top, no bimodal gap.
440 let scores: Vec<f32> = (0..10).map(|i| i as f32 * 0.1).collect();
441 let dist = distribution_shape(&scores, K_MIN);
442 assert_eq!(dist.shape, ShapeLabel::Uniform);
443 }
444
445 #[test]
446 fn shape_bimodal() {
447 // Two clusters separated by a gap > iqr.
448 // Low cluster: 0.10..0.14. High cluster: 0.80..0.84. Gap: 0.66.
449 // IQR of 10 samples split 5/5 is small (~0.04), so gap >> iqr.
450 let scores = vec![0.10, 0.11, 0.12, 0.13, 0.14, 0.80, 0.81, 0.82, 0.83, 0.84];
451 let dist = distribution_shape(&scores, K_MIN);
452 assert_eq!(dist.shape, ShapeLabel::Bimodal);
453 }
454
455 #[test]
456 fn shape_insufficient_samples_below_gate() {
457 // K < gate -> default (all zero, InsufficientSamples).
458 let scores = vec![0.1, 0.5, 0.9];
459 let dist = distribution_shape(&scores, K_MIN);
460 assert_eq!(dist.shape, ShapeLabel::InsufficientSamples);
461 assert_eq!(dist.min, 0.0);
462 assert_eq!(dist.max, 0.0);
463 }
464
465 #[test]
466 fn shape_all_equal_is_uniform_not_nan() {
467 // IQR would be 0 but we clamp to f32::EPSILON, so no NaN.
468 // max - median = 0, not > 2 * epsilon -> not long-tail.
469 // Largest gap = 0, not > epsilon -> not bimodal.
470 let scores = vec![0.5_f32; 8];
471 let dist = distribution_shape(&scores, K_MIN);
472 assert_eq!(dist.shape, ShapeLabel::Uniform);
473 assert!(dist.iqr > 0.0); // clamped to epsilon
474 }
475
476 // ---------- scale-free: K=8 and K=1000 both work ----------
477
478 #[test]
479 fn scale_free_across_response_sizes() {
480 // K=8 smallest viable size, K=1000 stress.
481 for &k in &[8_usize, 1000] {
482 let ranked: Vec<(u32, f32)> = (0..k)
483 .map(|i| (i as u32, 1.0 - (i as f32) / (k as f32)))
484 .collect();
485 let q = score_quantiles(&ranked);
486 assert_eq!(q.len(), k);
487 assert!((q[0] - 1.0).abs() < 1e-5);
488 assert!((q[k - 1] - 0.0).abs() < 1e-5);
489
490 let scores: Vec<f32> = ranked.iter().map(|(_, s)| *s).collect();
491 let dist = distribution_shape(&scores, K_MIN);
492 // Equi-spaced -> uniform, regardless of K.
493 assert_eq!(dist.shape, ShapeLabel::Uniform);
494 }
495 }
496
497 // ---------- proptest: determinism ----------
498
499 use proptest::prelude::*;
500
501 proptest! {
502 /// Pure functions must produce bit-identical output for the same
503 /// input across independent runs. Randomised input, checked twice.
504 #[test]
505 fn determinism(
506 xs in proptest::collection::vec(-1000.0_f32..1000.0, 0..200)
507 ) {
508 // Filter NaN so total_cmp behaviour is the only source of
509 // ordering: we assert determinism of the *function*, not a
510 // spec-free NaN-handling path.
511 let xs: Vec<f32> = xs.into_iter().filter(|v| v.is_finite()).collect();
512 let ranked: Vec<(u32, f32)> =
513 xs.iter().enumerate().map(|(i, v)| (i as u32, *v)).collect();
514
515 let q1 = score_quantiles(&ranked);
516 let q2 = score_quantiles(&ranked);
517 prop_assert_eq!(&q1, &q2);
518
519 let d1 = distribution_shape(&xs, K_MIN);
520 let d2 = distribution_shape(&xs, K_MIN);
521 prop_assert_eq!(d1, d2);
522 }
523
524 /// Scale invariance: multiplying every score by a positive
525 /// constant never changes the shape label (it's a pure ratio).
526 #[test]
527 fn shape_scale_invariant(
528 xs in proptest::collection::vec(0.0_f32..100.0, 8..64),
529 scale in 0.01_f32..1000.0,
530 ) {
531 let xs: Vec<f32> = xs.into_iter().filter(|v| v.is_finite()).collect();
532 if xs.len() < K_MIN {
533 return Ok(());
534 }
535 let scaled: Vec<f32> = xs.iter().map(|v| v * scale).collect();
536 let d1 = distribution_shape(&xs, K_MIN);
537 let d2 = distribution_shape(&scaled, K_MIN);
538 prop_assert_eq!(d1.shape, d2.shape);
539 }
540 }
541}