codetether_agent/session/
eval.rs

1//! Pareto evaluation harness for [`DerivePolicy`].
2//!
3//! ## Role (Phase B step 23)
4//!
5//! Liu et al. (arXiv:2512.22087), Lu et al. (arXiv:2510.06727), and
6//! ClawVM (arXiv:2604.10352) agree on one methodological point: a
7//! policy is never flipped on aggregate accuracy alone. Pareto over
8//! `(accuracy, context-cost, reuse-rate, oracle-gap)` is the minimum.
9//! This module is the *comparator* half of that evaluation loop. The
10//! *runner* half — actually executing [`derive_with_policy`] over a
11//! fixture corpus and collecting [`PolicyRunResult`]s — is a
12//! follow-up commit.
13//!
14//! ## Scope
15//!
16//! * [`PolicyRunResult`] — per-policy datapoint.
17//! * [`pareto_frontier`] — drop dominated points, keep frontier.
18//! * [`reuse_rate`] — Cognitive-Workspace-style warm-hit proxy.
19//!
20//! The ClawVM Tier-1 fault regression suite is a sibling concern;
21//! those tests live as `#[test]` functions on the subsystems they
22//! exercise (compression, journal, session_recall) rather than as
23//! harness inputs.
24//!
25//! ## Examples
26//!
27//! ```rust
28//! use codetether_agent::session::eval::{PolicyRunResult, pareto_frontier, reuse_rate};
29//!
30//! let results = vec![
31//!     PolicyRunResult {
32//!         policy: "legacy",
33//!         kept_messages: 30,
34//!         context_tokens: 24_000,
35//!         fault_count: 12,
36//!         oracle_gap: 4,
37//!         reuse_rate: 0.50,
38//!     },
39//!     PolicyRunResult {
40//!         policy: "reset",
41//!         kept_messages: 8,
42//!         context_tokens: 6_500,
43//!         fault_count: 5,
44//!         oracle_gap: 2,
45//!         reuse_rate: 0.62,
46//!     },
47//! ];
48//! let frontier = pareto_frontier(&results);
49//! assert!(frontier.iter().any(|r| r.policy == "reset"));
50//!
51//! // 4 of 8 context entries were already warm from the prior turn.
52//! assert!((reuse_rate(&[4, 8]) - 0.5).abs() < 1e-9);
53//! ```
54
/// A single policy's datapoint in the Pareto comparison.
///
/// Axis orientation, as applied by [`Self::is_dominated_by`]:
///
/// * **minimised** — [`Self::context_tokens`], [`Self::fault_count`],
///   [`Self::oracle_gap`]
/// * **maximised** — [`Self::reuse_rate`]
/// * **not compared** — [`Self::policy`] and [`Self::kept_messages`]
///   (informational only)
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct PolicyRunResult {
    /// Policy identifier (matches
    /// [`DerivePolicy::kind`](super::derive_policy::DerivePolicy::kind)).
    pub policy: &'static str,
    /// Number of messages in the derived context.
    pub kept_messages: usize,
    /// Estimated token cost of the derived context.
    pub context_tokens: usize,
    /// Explicit faults raised during the run (ClawVM Tier-1).
    pub fault_count: usize,
    /// Online-minus-oracle fault delta from [`replay_oracle`].
    ///
    /// [`replay_oracle`]: super::oracle::replay_oracle
    pub oracle_gap: usize,
    /// Share of derived-context entries that were already present in
    /// the previous turn's derived context (Cognitive-Workspace
    /// warm-hit proxy). `0.0 ≤ reuse_rate ≤ 1.0`.
    pub reuse_rate: f64,
}

impl PolicyRunResult {
    /// Reports whether `other` Pareto-dominates `self`.
    ///
    /// Domination requires both of the following:
    ///
    /// 1. `other` is no worse than `self` on every compared axis, and
    /// 2. `other` is strictly better on at least one of them.
    ///
    /// Two points that tie on every axis therefore never dominate each
    /// other. Because every float comparison against NaN is false, a
    /// point whose `reuse_rate` is NaN neither dominates nor is
    /// dominated.
    pub fn is_dominated_by(&self, other: &PolicyRunResult) -> bool {
        // One verdict per axis: is `other` at least as good as `self`?
        let never_worse = [
            other.context_tokens <= self.context_tokens,
            other.fault_count <= self.fault_count,
            other.oracle_gap <= self.oracle_gap,
            other.reuse_rate >= self.reuse_rate,
        ];
        if !never_worse.iter().all(|&holds| holds) {
            return false;
        }

        // `other` must also win outright somewhere, otherwise it is a tie.
        let wins_outright = [
            other.context_tokens < self.context_tokens,
            other.fault_count < self.fault_count,
            other.oracle_gap < self.oracle_gap,
            other.reuse_rate > self.reuse_rate,
        ];
        wins_outright.iter().any(|&holds| holds)
    }
}
97
98/// Drop dominated points and return references to the frontier.
99///
100/// Stable in input order among surviving points so downstream callers
101/// can render the frontier with policy labels intact.
102pub fn pareto_frontier(results: &[PolicyRunResult]) -> Vec<&PolicyRunResult> {
103    results
104        .iter()
105        .filter(|candidate| {
106            !results
107                .iter()
108                .any(|other| !std::ptr::eq(*candidate, other) && candidate.is_dominated_by(other))
109        })
110        .collect()
111}
112
/// Compute the reuse rate from a `[warm_hits, total]` pair.
///
/// * `counts[0]` — number of warm hits.
/// * `counts[1]` — total number of entries considered.
///
/// The result is always a finite value in `0.0..=1.0`:
///
/// * `total == 0` yields `0.0` rather than NaN, so downstream
///   aggregations (averages, min/max, Pareto comparisons) stay clean.
/// * Warm hits are capped at `total`, so an inconsistent pair with
///   `warm_hits > total` yields `1.0` instead of a rate above one,
///   which would otherwise break the `0.0 ≤ reuse_rate ≤ 1.0`
///   invariant documented on `PolicyRunResult::reuse_rate`.
pub fn reuse_rate(counts: &[usize; 2]) -> f64 {
    let [warm, total] = *counts;
    if total == 0 {
        return 0.0;
    }
    // Cap in integer space before converting so the quotient cannot
    // exceed 1.0. The `as f64` casts are approximate above 2^53, far
    // beyond any realistic entry count.
    warm.min(total) as f64 / total as f64
}
124
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a datapoint from the four compared axes; `kept_messages`
    /// is fixed at zero because dominance does not look at it.
    fn sample(
        label: &'static str,
        context_tokens: usize,
        fault_count: usize,
        oracle_gap: usize,
        reuse_rate: f64,
    ) -> PolicyRunResult {
        PolicyRunResult {
            policy: label,
            kept_messages: 0,
            context_tokens,
            fault_count,
            oracle_gap,
            reuse_rate,
        }
    }

    #[test]
    fn strictly_better_on_all_axes_dominates() {
        let worse = sample("a", 1000, 5, 2, 0.5);
        let better = sample("b", 500, 1, 0, 0.9);
        assert!(worse.is_dominated_by(&better));
        assert!(!better.is_dominated_by(&worse));
    }

    #[test]
    fn tied_on_all_does_not_dominate() {
        let left = sample("a", 1000, 5, 2, 0.5);
        let right = sample("b", 1000, 5, 2, 0.5);
        assert!(!left.is_dominated_by(&right));
        assert!(!right.is_dominated_by(&left));
    }

    #[test]
    fn pareto_frontier_keeps_nondominated_points() {
        let results = vec![
            sample("legacy", 24_000, 12, 4, 0.50),
            sample("reset", 6_500, 5, 2, 0.62),
            sample("dominated", 30_000, 20, 10, 0.10),
        ];
        let frontier = pareto_frontier(&results);
        // Membership check by policy label.
        let on_frontier = |label: &str| frontier.iter().any(|r| r.policy == label);
        assert!(on_frontier("reset"));
        assert!(!on_frontier("dominated"));
    }

    #[test]
    fn reuse_rate_is_zero_when_total_is_zero() {
        assert_eq!(reuse_rate(&[0, 0]), 0.0);
    }

    #[test]
    fn reuse_rate_round_trips_typical_values() {
        // (input counts, expected rate)
        let cases = [([7, 10], 0.7), ([10, 10], 1.0)];
        for &(counts, expected) in &cases {
            assert!((reuse_rate(&counts) - expected).abs() < 1e-9);
        }
    }
}
codetether_agent/session/eval.rs

codetether_agent/session/
eval.rs