Skip to main content

vela_protocol/
calibration.rs

1//! v0.34: Calibration scoring over resolved predictions.
2//!
3//! A `Prediction` carries the predictor's confidence in the expected
4//! outcome (a number on `[0, 1]`). When a `Resolution` records what
5//! actually happened, the resolver also records `matched_expected`
6//! (a bool). Together those two facts let us compute, per actor:
7//!
8//! - **Hit rate**: fraction of resolved predictions that matched.
//! - **Brier score**: mean of `(confidence - matched)^2` across the
//!   resolved subset, where `matched ∈ {0, 1}`. Lower is better.
//!   Brier = 0 means every resolved prediction put full confidence
//!   on the outcome that actually occurred; 0.25 is what a constant
//!   0.5 (chance-level) forecast scores; 1.0 is maximally wrong.
13//! - **Log score**: mean of `log(p_assigned_to_actual_outcome)`. We
14//!   clip to `[1e-9, 1 - 1e-9]` to avoid `-∞`. Higher (closer to 0)
15//!   is better.
16//!
17//! These are derived signals — never written to disk, always
18//! recomputed from the canonical `predictions` and `resolutions`
19//! collections. That keeps the kernel ledger source-of-truth and
20//! avoids stale calibration cache concerns.
21//!
22//! Calibration is the move that makes Vela an epistemic ledger
23//! rather than a knowledge graph: every actor accumulates a public,
24//! reproducible track record of how well their stated beliefs match
25//! reality.
26
27use std::collections::HashMap;
28
29use chrono::{DateTime, Utc};
30use serde::{Deserialize, Serialize};
31use serde_json::json;
32
33use crate::bundle::{Prediction, Resolution};
34use crate::events::{self, FindingEventInput, NULL_HASH};
35use crate::project::Project;
36
37/// Per-actor calibration summary computed over the resolved subset of
38/// the actor's predictions.
39#[derive(Debug, Clone, Serialize, Deserialize)]
40pub struct CalibrationRecord {
41    /// Stable actor id (e.g. `reviewer:will-blair`).
42    pub actor: String,
43    /// Total predictions made by this actor in the frontier.
44    pub n_predictions: usize,
45    /// Predictions that have been resolved (have an associated
46    /// `Resolution`). Open predictions don't contribute to scoring.
47    pub n_resolved: usize,
48    /// Resolved predictions whose `matched_expected = true`.
49    pub n_hit: usize,
50    /// v0.40.1: predictions closed by the calibration runtime
51    /// without an explicit `Resolution` (deadline passed). Counted
52    /// separately from `n_resolved` so the predictor still answers
53    /// for the missing commitment without their Brier or log score
54    /// being moved by it.
55    #[serde(default)]
56    pub n_expired: usize,
57    /// Hit rate over resolved (or `None` if `n_resolved == 0`).
58    pub hit_rate: Option<f64>,
59    /// Brier score, lower is better. `None` if no resolutions.
60    pub brier_score: Option<f64>,
61    /// Log score, higher (closer to 0) is better. `None` if no resolutions.
62    pub log_score: Option<f64>,
63    /// Bucketed reliability diagram: for each predicted-confidence
64    /// band, the observed match rate. Empty bands are omitted.
65    /// Format: `(confidence_lower_bound, observed_hit_rate, n_in_band)`.
66    pub reliability_buckets: Vec<(f64, f64, usize)>,
67}
68
69/// Compute calibration records for every actor that has at least one
70/// prediction in the frontier.
71pub fn calibration_records(
72    predictions: &[Prediction],
73    resolutions: &[Resolution],
74) -> Vec<CalibrationRecord> {
75    // Index resolutions by prediction_id for cheap lookup.
76    let mut resolution_by_pred: HashMap<&str, &Resolution> = HashMap::new();
77    for r in resolutions {
78        resolution_by_pred.insert(r.prediction_id.as_str(), r);
79    }
80
81    // Group predictions by actor.
82    let mut by_actor: HashMap<String, Vec<&Prediction>> = HashMap::new();
83    for p in predictions {
84        by_actor.entry(p.made_by.clone()).or_default().push(p);
85    }
86
87    let mut out = Vec::with_capacity(by_actor.len());
88    for (actor, preds) in by_actor {
89        let n_predictions = preds.len();
90        let mut resolved: Vec<(&Prediction, &Resolution)> = Vec::new();
91        for p in &preds {
92            if let Some(r) = resolution_by_pred.get(p.id.as_str()) {
93                resolved.push((p, r));
94            }
95        }
96        let n_resolved = resolved.len();
97        let n_hit = resolved.iter().filter(|(_, r)| r.matched_expected).count();
98        let n_expired = preds.iter().filter(|p| p.expired_unresolved).count();
99        let hit_rate = if n_resolved > 0 {
100            Some(n_hit as f64 / n_resolved as f64)
101        } else {
102            None
103        };
104
105        // Brier: mean of (confidence - matched_int)^2.
106        let brier_score = if n_resolved > 0 {
107            let sum: f64 = resolved
108                .iter()
109                .map(|(p, r)| {
110                    let m = if r.matched_expected { 1.0 } else { 0.0 };
111                    (p.confidence - m).powi(2)
112                })
113                .sum();
114            Some(sum / n_resolved as f64)
115        } else {
116            None
117        };
118
119        // Log score: mean log(p_actual). For matched, p_actual = confidence;
120        // for not matched, p_actual = (1 - confidence). Clipped.
121        let log_score = if n_resolved > 0 {
122            let sum: f64 = resolved
123                .iter()
124                .map(|(p, r)| {
125                    let p_actual = if r.matched_expected {
126                        p.confidence
127                    } else {
128                        1.0 - p.confidence
129                    };
130                    p_actual.clamp(1e-9, 1.0 - 1e-9).ln()
131                })
132                .sum();
133            Some(sum / n_resolved as f64)
134        } else {
135            None
136        };
137
138        // Reliability buckets: 5 bands of width 0.2, omit empty bands.
139        let bands: [(f64, f64); 5] = [(0.0, 0.2), (0.2, 0.4), (0.4, 0.6), (0.6, 0.8), (0.8, 1.001)];
140        let mut reliability_buckets: Vec<(f64, f64, usize)> = Vec::new();
141        for (lo, hi) in bands {
142            let in_band: Vec<&(&Prediction, &Resolution)> = resolved
143                .iter()
144                .filter(|(p, _)| p.confidence >= lo && p.confidence < hi)
145                .collect();
146            if in_band.is_empty() {
147                continue;
148            }
149            let hits = in_band.iter().filter(|(_, r)| r.matched_expected).count();
150            let observed = hits as f64 / in_band.len() as f64;
151            reliability_buckets.push((lo, observed, in_band.len()));
152        }
153
154        out.push(CalibrationRecord {
155            actor,
156            n_predictions,
157            n_resolved,
158            n_hit,
159            n_expired,
160            hit_rate,
161            brier_score,
162            log_score,
163            reliability_buckets,
164        });
165    }
166
167    // Stable order: by actor id alphabetically.
168    out.sort_by(|a, b| a.actor.cmp(&b.actor));
169    out
170}
171
172/// v0.40.1: report from one expiration pass.
173#[derive(Debug, Clone, Serialize, Deserialize)]
174pub struct ExpirationReport {
175    pub now: String,
176    /// IDs of predictions that were already resolved (no action).
177    pub already_resolved: Vec<String>,
178    /// IDs of predictions that were already marked expired before
179    /// this pass (idempotent re-runs).
180    pub already_expired: Vec<String>,
181    /// IDs newly marked expired by this pass.
182    pub newly_expired: Vec<String>,
183    /// Open predictions whose deadline is still in the future
184    /// (or whose deadline is unset).
185    pub still_open: Vec<String>,
186}
187
188/// v0.40.1: walk every prediction in the project and mark as
189/// `expired_unresolved` any whose `resolves_by` is in the past *and*
190/// has no associated `Resolution`. Emits one
191/// `prediction.expired_unresolved` event per newly-expired prediction.
192///
193/// Idempotent: predictions already flagged are surfaced in
194/// `already_expired` rather than re-flagged or duplicated.
195///
196/// `now` is taken as a parameter (not `Utc::now()`) so unit tests can
197/// pin time deterministically. The `predictions expire` CLI passes
198/// the system clock by default but accepts `--now <rfc3339>` for
199/// reproducibility.
200pub fn expire_overdue_predictions(project: &mut Project, now: DateTime<Utc>) -> ExpirationReport {
201    let now_str = now.to_rfc3339();
202    let resolved_ids: std::collections::HashSet<String> = project
203        .resolutions
204        .iter()
205        .map(|r| r.prediction_id.clone())
206        .collect();
207
208    let mut report = ExpirationReport {
209        now: now_str.clone(),
210        already_resolved: Vec::new(),
211        already_expired: Vec::new(),
212        newly_expired: Vec::new(),
213        still_open: Vec::new(),
214    };
215
216    // Take an indexed snapshot to avoid borrow-checker churn against
217    // the mutable findings/events loop below.
218    let mut to_expire: Vec<usize> = Vec::new();
219    for (idx, p) in project.predictions.iter().enumerate() {
220        if resolved_ids.contains(&p.id) {
221            report.already_resolved.push(p.id.clone());
222            continue;
223        }
224        if p.expired_unresolved {
225            report.already_expired.push(p.id.clone());
226            continue;
227        }
228        let Some(deadline_str) = p.resolves_by.as_deref() else {
229            report.still_open.push(p.id.clone());
230            continue;
231        };
232        let Ok(deadline) = DateTime::parse_from_rfc3339(deadline_str) else {
233            // Malformed deadline: treat as still-open rather than
234            // silently expiring. The reviewer can fix the date.
235            report.still_open.push(p.id.clone());
236            continue;
237        };
238        if deadline.with_timezone(&Utc) <= now {
239            to_expire.push(idx);
240        } else {
241            report.still_open.push(p.id.clone());
242        }
243    }
244
245    // Mutate + emit events in a second pass to keep the borrow
246    // checker happy.
247    for idx in to_expire {
248        let pred_id = project.predictions[idx].id.clone();
249        let resolves_by = project.predictions[idx]
250            .resolves_by
251            .clone()
252            .unwrap_or_default();
253        project.predictions[idx].expired_unresolved = true;
254        let reason = format!("deadline {resolves_by} passed without resolution");
255        let event = events::new_finding_event(FindingEventInput {
256            kind: "prediction.expired_unresolved",
257            finding_id: &pred_id,
258            actor_id: "calibration",
259            actor_type: "system",
260            reason: &reason,
261            before_hash: NULL_HASH,
262            after_hash: NULL_HASH,
263            payload: json!({
264                "prediction_id": pred_id,
265                "resolves_by": resolves_by,
266                "expired_at": now_str,
267            }),
268            caveats: Vec::new(),
269        });
270        project.events.push(event);
271        report.newly_expired.push(pred_id);
272    }
273
274    report
275}
276
277/// Convenience: calibration for a single actor.
278pub fn calibration_for_actor(
279    actor: &str,
280    predictions: &[Prediction],
281    resolutions: &[Resolution],
282) -> Option<CalibrationRecord> {
283    calibration_records(predictions, resolutions)
284        .into_iter()
285        .find(|r| r.actor == actor)
286}
287
#[cfg(test)]
mod v0_40_1_expiration_tests {
    //! Tests for `expire_overdue_predictions` and the `n_expired`
    //! counter on `CalibrationRecord`. Every test pins the clock via
    //! `fixed_now()` so results never depend on the wall clock.

    use super::*;
    use crate::bundle::{Conditions, ExpectedOutcome, Prediction};
    use crate::project;

    /// The pinned "current time" shared by every test in this module.
    const FIXED_NOW: &str = "2026-04-27T00:00:00Z";

    /// Parse `FIXED_NOW` into the `DateTime<Utc>` the function expects.
    fn fixed_now() -> DateTime<Utc> {
        DateTime::parse_from_rfc3339(FIXED_NOW)
            .expect("FIXED_NOW is a valid RFC 3339 timestamp")
            .with_timezone(&Utc)
    }

    /// Number of `prediction.expired_unresolved` events on the project.
    fn expired_event_count(project: &Project) -> usize {
        project
            .events
            .iter()
            .filter(|e| e.kind == "prediction.expired_unresolved")
            .count()
    }

    /// Minimal, empty `Conditions` value for building test predictions.
    fn cond() -> Conditions {
        Conditions {
            text: String::new(),
            species_verified: vec![],
            species_unverified: vec![],
            in_vitro: false,
            in_vivo: false,
            human_data: false,
            clinical_trial: false,
            concentration_range: None,
            duration: None,
            age_group: None,
            cell_type: None,
        }
    }

    /// Build a prediction by `reviewer:test` with confidence 0.7 and
    /// the given deadline; the id is derived from `id_seed`.
    fn pred(id_seed: &str, resolves_by: Option<&str>) -> Prediction {
        let mut p = Prediction::new(
            format!("claim {id_seed}"),
            vec![],
            Some("2024-01-01T00:00:00Z".into()),
            resolves_by.map(|s| s.to_string()),
            "criterion".to_string(),
            ExpectedOutcome::Affirmed,
            "reviewer:test".to_string(),
            0.7,
            cond(),
        );
        // Ensure unique ids in tests by suffixing the seed.
        p.id = format!("vpred_test_{id_seed}");
        p
    }

    /// A project with no findings, predictions, or resolutions.
    fn empty_project() -> Project {
        project::assemble("test", vec![], 0, 0, "test")
    }

    #[test]
    fn overdue_unresolved_prediction_gets_expired() {
        let mut project = empty_project();
        project
            .predictions
            .push(pred("a", Some("2025-01-01T00:00:00Z")));
        let report = expire_overdue_predictions(&mut project, fixed_now());
        assert_eq!(report.newly_expired.len(), 1);
        assert!(project.predictions[0].expired_unresolved);
        // Event was appended.
        let last = project.events.last().unwrap();
        assert_eq!(last.kind, "prediction.expired_unresolved");
    }

    #[test]
    fn deadline_equal_to_now_expires() {
        // The comparison is inclusive: a deadline exactly at `now`
        // counts as passed.
        let mut project = empty_project();
        project.predictions.push(pred("a", Some(FIXED_NOW)));
        let report = expire_overdue_predictions(&mut project, fixed_now());
        assert_eq!(report.newly_expired.len(), 1);
        assert_eq!(report.still_open.len(), 0);
        assert!(project.predictions[0].expired_unresolved);
    }

    #[test]
    fn future_deadline_stays_open() {
        let mut project = empty_project();
        project
            .predictions
            .push(pred("a", Some("2099-01-01T00:00:00Z")));
        let report = expire_overdue_predictions(&mut project, fixed_now());
        assert_eq!(report.newly_expired.len(), 0);
        assert_eq!(report.still_open.len(), 1);
        assert!(!project.predictions[0].expired_unresolved);
    }

    #[test]
    fn unset_deadline_stays_open() {
        let mut project = empty_project();
        project.predictions.push(pred("a", None));
        // Pinned clock (previously `Utc::now()`): the outcome must not
        // depend on when the test suite happens to run.
        let report = expire_overdue_predictions(&mut project, fixed_now());
        assert_eq!(report.newly_expired.len(), 0);
        assert_eq!(report.still_open.len(), 1);
    }

    #[test]
    fn malformed_deadline_stays_open() {
        let mut project = empty_project();
        // Set the bad deadline after construction so the test only
        // exercises the parsing done by `expire_overdue_predictions`.
        let mut p = pred("a", None);
        p.resolves_by = Some("not-a-date".to_string());
        project.predictions.push(p);
        let report = expire_overdue_predictions(&mut project, fixed_now());
        assert_eq!(report.newly_expired.len(), 0);
        assert_eq!(report.still_open.len(), 1);
        assert!(!project.predictions[0].expired_unresolved);
        assert_eq!(expired_event_count(&project), 0);
    }

    #[test]
    fn already_resolved_prediction_does_not_expire() {
        let mut project = empty_project();
        project
            .predictions
            .push(pred("a", Some("2025-01-01T00:00:00Z")));
        let pid = project.predictions[0].id.clone();
        // Synthesize a resolution.
        project.resolutions.push(crate::bundle::Resolution {
            id: "vres_a".into(),
            prediction_id: pid.clone(),
            actual_outcome: "yes".into(),
            matched_expected: true,
            resolved_at: "2024-12-01T00:00:00Z".into(),
            resolved_by: "reviewer:test".into(),
            evidence: crate::bundle::Evidence {
                evidence_type: "experimental".into(),
                model_system: String::new(),
                species: None,
                method: String::new(),
                sample_size: None,
                effect_size: None,
                p_value: None,
                replicated: false,
                replication_count: None,
                evidence_spans: vec![],
            },
            confidence: 1.0,
        });
        let report = expire_overdue_predictions(&mut project, fixed_now());
        assert_eq!(report.newly_expired.len(), 0);
        assert_eq!(report.already_resolved.len(), 1);
        assert!(!project.predictions[0].expired_unresolved);
    }

    #[test]
    fn idempotent_re_run_lists_already_expired() {
        let mut project = empty_project();
        project
            .predictions
            .push(pred("a", Some("2025-01-01T00:00:00Z")));
        let now = fixed_now();
        let _ = expire_overdue_predictions(&mut project, now);
        let report2 = expire_overdue_predictions(&mut project, now);
        assert_eq!(report2.newly_expired.len(), 0);
        assert_eq!(report2.already_expired.len(), 1);
        // No second event should have been appended.
        assert_eq!(expired_event_count(&project), 1);
    }

    #[test]
    fn calibration_record_carries_n_expired() {
        let mut project = empty_project();
        let mut p = pred("a", Some("2025-01-01T00:00:00Z"));
        p.expired_unresolved = true;
        project.predictions.push(p);
        let records = calibration_records(&project.predictions, &project.resolutions);
        assert_eq!(records.len(), 1);
        assert_eq!(records[0].n_expired, 1);
        assert_eq!(records[0].n_resolved, 0);
    }
}