codetether_agent/session/eval.rs
1//! Pareto evaluation harness for [`DerivePolicy`].
2//!
3//! ## Role (Phase B step 23)
4//!
5//! Liu et al. (arXiv:2512.22087), Lu et al. (arXiv:2510.06727), and
6//! ClawVM (arXiv:2604.10352) agree on one methodological point: a
7//! policy is never flipped on aggregate accuracy alone. Pareto over
8//! `(accuracy, context-cost, reuse-rate, oracle-gap)` is the minimum.
9//! This module is the *comparator* half of that evaluation loop. The
10//! *runner* half — actually executing [`derive_with_policy`] over a
11//! fixture corpus and collecting [`PolicyRunResult`]s — is a
12//! follow-up commit.
13//!
14//! ## Scope
15//!
16//! * [`PolicyRunResult`] — per-policy datapoint.
17//! * [`pareto_frontier`] — drop dominated points, keep frontier.
18//! * [`reuse_rate`] — Cognitive-Workspace-style warm-hit proxy.
19//!
20//! The ClawVM Tier-1 fault regression suite is a sibling concern;
21//! those tests live as `#[test]` functions on the subsystems they
22//! exercise (compression, journal, session_recall) rather than as
23//! harness inputs.
24//!
25//! ## Examples
26//!
27//! ```rust
28//! use codetether_agent::session::eval::{PolicyRunResult, pareto_frontier, reuse_rate};
29//!
30//! let results = vec![
31//! PolicyRunResult {
32//! policy: "legacy",
33//! kept_messages: 30,
34//! context_tokens: 24_000,
35//! fault_count: 12,
36//! oracle_gap: 4,
37//! reuse_rate: 0.50,
38//! },
39//! PolicyRunResult {
40//! policy: "reset",
41//! kept_messages: 8,
42//! context_tokens: 6_500,
43//! fault_count: 5,
44//! oracle_gap: 2,
45//! reuse_rate: 0.62,
46//! },
47//! ];
48//! let frontier = pareto_frontier(&results);
49//! assert!(frontier.iter().any(|r| r.policy == "reset"));
50//!
51//! // 4 of 8 context entries were already warm from the prior turn.
52//! assert!((reuse_rate(&[4, 8]) - 0.5).abs() < 1e-9);
53//! ```
54
/// One Pareto sample for a single derivation policy run.
///
/// Axis orientation, as used by [`PolicyRunResult::is_dominated_by`]:
///
/// * lower is better — [`Self::context_tokens`], [`Self::fault_count`],
///   and [`Self::oracle_gap`];
/// * higher is better — [`Self::reuse_rate`];
/// * not compared — [`Self::kept_messages`] and [`Self::policy`]
///   (informational only).
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct PolicyRunResult {
    /// Policy identifier (matches
    /// [`DerivePolicy::kind`](super::derive_policy::DerivePolicy::kind)).
    pub policy: &'static str,
    /// Size of the derived context in messages.
    pub kept_messages: usize,
    /// Estimated token cost of the derived context.
    pub context_tokens: usize,
    /// Explicit faults raised during the run (ClawVM Tier-1).
    pub fault_count: usize,
    /// Online-minus-oracle fault delta from [`replay_oracle`].
    ///
    /// [`replay_oracle`]: super::oracle::replay_oracle
    pub oracle_gap: usize,
    /// Fraction of derived-context entries that were already present
    /// in the previous turn's derived context (Cognitive-Workspace
    /// warm-hit proxy). `0.0 ≤ reuse_rate ≤ 1.0`.
    ///
    /// A NaN value is tolerated by the comparator and ranked as the
    /// worst possible reuse rate.
    pub reuse_rate: f64,
}

impl PolicyRunResult {
    /// Returns `true` when `other` Pareto-dominates `self` — that is,
    /// `other` is at least as good on every compared axis and strictly
    /// better on at least one.
    ///
    /// Two points that tie on every compared axis do not dominate each
    /// other, and a point never dominates itself.
    ///
    /// A NaN [`Self::reuse_rate`] is ranked as the worst possible value
    /// (below every real number, tied with another NaN). Raw `f64`
    /// comparisons against NaN are always `false`, which would make a
    /// NaN sample incomparable and leave it stuck on every frontier.
    pub fn is_dominated_by(&self, other: &PolicyRunResult) -> bool {
        // Map NaN onto negative infinity so the ordering below is total.
        let rank = |rate: f64| if rate.is_nan() { f64::NEG_INFINITY } else { rate };
        let own_reuse = rank(self.reuse_rate);
        let other_reuse = rank(other.reuse_rate);

        let no_worse_anywhere = other.context_tokens <= self.context_tokens
            && other.fault_count <= self.fault_count
            && other.oracle_gap <= self.oracle_gap
            && other_reuse >= own_reuse;
        let better_somewhere = other.context_tokens < self.context_tokens
            || other.fault_count < self.fault_count
            || other.oracle_gap < self.oracle_gap
            || other_reuse > own_reuse;

        no_worse_anywhere && better_somewhere
    }
}
97
98/// Drop dominated points and return references to the frontier.
99///
100/// Stable in input order among surviving points so downstream callers
101/// can render the frontier with policy labels intact.
102pub fn pareto_frontier(results: &[PolicyRunResult]) -> Vec<&PolicyRunResult> {
103 results
104 .iter()
105 .filter(|candidate| {
106 !results
107 .iter()
108 .any(|other| !std::ptr::eq(*candidate, other) && candidate.is_dominated_by(other))
109 })
110 .collect()
111}
112
/// Compute the reuse rate from `[warm_hits, total]`.
///
/// The result is always a finite value in `0.0..=1.0`:
///
/// * `total == 0` yields `0.0` rather than NaN, so downstream
///   aggregations (averages, min/max, Pareto comparisons) stay clean;
/// * `warm_hits > total` is clamped to `1.0`, so an over-counted
///   numerator cannot produce a rate above the documented upper bound
///   and spuriously win the "higher is better" reuse axis.
pub fn reuse_rate(counts: &[usize; 2]) -> f64 {
    let [warm, total] = *counts;
    if total == 0 {
        0.0
    } else {
        // Clamp the numerator so the ratio never exceeds 1.0.
        warm.min(total) as f64 / total as f64
    }
}
124
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a datapoint with only the compared axes filled in;
    /// `kept_messages` is informational and fixed at zero.
    fn sample(
        policy: &'static str,
        tokens: usize,
        faults: usize,
        gap: usize,
        reuse: f64,
    ) -> PolicyRunResult {
        PolicyRunResult {
            policy,
            kept_messages: 0,
            context_tokens: tokens,
            fault_count: faults,
            oracle_gap: gap,
            reuse_rate: reuse,
        }
    }

    /// Policy labels of the frontier, in the order it was returned.
    fn frontier_labels(results: &[PolicyRunResult]) -> Vec<&'static str> {
        pareto_frontier(results).iter().map(|r| r.policy).collect()
    }

    #[test]
    fn strictly_better_on_all_axes_dominates() {
        let a = sample("a", 1000, 5, 2, 0.5);
        let b = sample("b", 500, 1, 0, 0.9);
        assert!(a.is_dominated_by(&b));
        assert!(!b.is_dominated_by(&a));
    }

    #[test]
    fn better_on_one_axis_and_tied_elsewhere_dominates() {
        let a = sample("a", 1000, 5, 2, 0.5);
        let b = sample("b", 1000, 5, 2, 0.6);
        assert!(a.is_dominated_by(&b));
        assert!(!b.is_dominated_by(&a));
    }

    #[test]
    fn tied_on_all_does_not_dominate() {
        let a = sample("a", 1000, 5, 2, 0.5);
        let b = sample("b", 1000, 5, 2, 0.5);
        assert!(!a.is_dominated_by(&b));
        assert!(!b.is_dominated_by(&a));
    }

    #[test]
    fn trade_off_points_do_not_dominate_each_other() {
        // One point is cheaper, the other reuses more: incomparable.
        let cheap = sample("cheap", 500, 5, 2, 0.5);
        let warm = sample("warm", 1000, 5, 2, 0.9);
        assert!(!cheap.is_dominated_by(&warm));
        assert!(!warm.is_dominated_by(&cheap));
    }

    #[test]
    fn pareto_frontier_keeps_nondominated_points() {
        let results = vec![
            sample("legacy", 24_000, 12, 4, 0.50),
            sample("reset", 6_500, 5, 2, 0.62),
            sample("dominated", 30_000, 20, 10, 0.10),
        ];
        // `reset` beats both other points on every axis, so it is the
        // only survivor.
        assert_eq!(frontier_labels(&results), vec!["reset"]);
    }

    #[test]
    fn pareto_frontier_preserves_input_order() {
        let results = vec![
            sample("warm", 1000, 5, 2, 0.9),
            sample("loser", 2000, 9, 9, 0.1),
            sample("cheap", 500, 5, 2, 0.5),
        ];
        assert_eq!(frontier_labels(&results), vec!["warm", "cheap"]);
    }

    #[test]
    fn pareto_frontier_keeps_exact_duplicates() {
        let results = vec![sample("a", 10, 1, 1, 0.5), sample("b", 10, 1, 1, 0.5)];
        assert_eq!(frontier_labels(&results), vec!["a", "b"]);
    }

    #[test]
    fn pareto_frontier_of_empty_input_is_empty() {
        assert!(pareto_frontier(&[]).is_empty());
    }

    #[test]
    fn reuse_rate_is_zero_when_total_is_zero() {
        assert_eq!(reuse_rate(&[0, 0]), 0.0);
    }

    #[test]
    fn reuse_rate_round_trips_typical_values() {
        assert!((reuse_rate(&[7, 10]) - 0.7).abs() < 1e-9);
        assert!((reuse_rate(&[10, 10]) - 1.0).abs() < 1e-9);
    }
}
185}