Skip to main content

sharpebench_core/
composite.rs

//! The composite score + leaderboard ranking — where the gates compose.
//!
//! An agent ranks **only if** every gate holds:
//! 1. its pooled Deflated Sharpe clears `dsr_bar` (survives multiple-testing),
//! 2. it passes the per-run bar on *every* seed×window (`pass^k`, mode All),
//! 3. it has zero block-severity process violations in any run,
//! 4. its bootstrap p-value beats `alpha` (the edge isn't noise),
//! 5. its pooled max drawdown respects the mandate cap (`mandate.max_drawdown`).
//!
//! Raw mean return is recorded but is **never** the rank key — that is the whole
//! point of SharpeBench. Run the included synthetic agents (see tests) to watch a
//! lucky agent with a higher raw return get demoted below a skilled one.
12
13use serde::{Deserialize, Serialize};
14
15use crate::calibration::brier_score;
16use crate::decay::edge_half_life;
17use crate::deflated_sharpe::{deflated_sharpe_ratio, probabilistic_sharpe_ratio};
18use crate::pass_k::{pass_k, PassMode};
19use crate::percentile::percentile_of;
20use crate::process::{process_score, ProcessEvent, Trace};
21use crate::rolling::rolling_sharpe;
22use crate::selection::{selection_robustness, SelectionRobustness};
23use crate::significance::bootstrap_pvalue;
24use crate::stats::mean;
25
26/// One seed×window run of an agent: its per-period returns plus the decision
27/// trace and (optionally) per-decision confidences/outcomes.
28#[derive(Clone, Debug, Serialize, Deserialize)]
29pub struct Run {
30    pub returns: Vec<f64>,
31    #[serde(default)]
32    pub trace: Trace,
33    #[serde(default)]
34    pub confidences: Vec<f64>,
35    #[serde(default)]
36    pub outcomes: Vec<bool>,
37    /// Compute/token cost incurred to produce this run (any consistent unit).
38    /// Used for cost-efficiency reporting; 0.0 = not reported.
39    #[serde(default)]
40    pub cost: f64,
41}
42
43/// An agent's full submission: many runs across seeds × windows.
44#[derive(Clone, Debug, Serialize, Deserialize)]
45pub struct AgentSubmission {
46    pub agent_id: String,
47    pub runs: Vec<Run>,
48    /// Number of in-sample backtests/configs the agent searched before submitting.
49    /// Folded into the deflation trial footprint so over-searching faces a higher
50    /// bar — records data-snooping up front. 0 = undeclared.
51    #[serde(default)]
52    pub in_sample_trials: u32,
53    /// Optional alternative candidate strategies the agent considered, each a
54    /// pooled return stream. Used for selection-robustness reporting (best vs
55    /// median candidate). Empty = not reported.
56    #[serde(default)]
57    pub candidates: Vec<Vec<f64>>,
58}
59
60/// What to rank eligible agents by.
61#[derive(Clone, Copy, Debug, Serialize, Deserialize, PartialEq, Eq, Default)]
62#[serde(rename_all = "snake_case")]
63pub enum RankKey {
64    /// Deflated Sharpe (the default — luck-robust risk-adjusted skill).
65    #[default]
66    DeflatedSharpe,
67    /// Alpha (skill net of market beta).
68    Alpha,
69}
70
71/// A trading mandate: constraints the agent must respect to be rank-eligible.
72#[derive(Clone, Debug, Serialize, Deserialize)]
73pub struct Mandate {
74    /// Max tolerable drawdown over the pooled track (e.g. 0.20). 1.0 = unconstrained.
75    pub max_drawdown: f64,
76}
77
78impl Default for Mandate {
79    fn default() -> Self {
80        Self { max_drawdown: 1.0 }
81    }
82}
83
/// Maximum drawdown of the equity curve implied by a return series, in [0, 1].
///
/// The curve starts at a NAV of 1.0 and compounds each period return
/// (`nav *= 1.0 + r`). The drawdown at any point is the fractional loss from
/// the running peak; the largest one seen is returned. An empty series yields
/// 0.0.
///
/// A period return of -100% or worse wipes the account out. That is reported
/// as a full drawdown of exactly 1.0 and ends the scan: compounding a
/// non-positive NAV is meaningless (two consecutive returns below -100% would
/// otherwise flip the NAV positive again), and the result must stay within the
/// documented [0, 1] range so a mandate cap of 1.0 really is unconstrained.
///
/// NaN returns poison the NAV; every comparison involving it is then false,
/// so the drawdown observed before the NaN is what gets returned.
fn max_drawdown(returns: &[f64]) -> f64 {
    let mut nav = 1.0_f64;
    let mut peak = 1.0_f64;
    let mut mdd = 0.0_f64;
    for &r in returns {
        nav *= 1.0 + r;
        if nav <= 0.0 {
            // Total loss: nothing left to compound and nothing can recover.
            return 1.0;
        }
        // `f64::max` ignores a NaN operand, matching plain `>` comparisons.
        peak = peak.max(nav);
        // `peak` starts at 1.0 and never decreases, so the division is safe.
        mdd = mdd.max(1.0 - nav / peak);
    }
    mdd
}
103
104/// Scoring configuration. `n_trials` / `trials_sr_std` are the multiple-testing
105/// footprint used for deflation (typically: how many agents/configs were tried).
106#[derive(Clone, Debug, Serialize, Deserialize)]
107pub struct ScoreConfig {
108    pub n_trials: u32,
109    pub trials_sr_std: f64,
110    /// Deflated-Sharpe bar an agent must clear to be rank-eligible (e.g. 0.95).
111    pub dsr_bar: f64,
112    /// Per-run PSR bar each individual run must clear for pass^k.
113    pub per_run_psr_bar: f64,
114    /// Significance level for the bootstrap edge test.
115    pub alpha: f64,
116    pub bootstrap_seed: u64,
117    pub n_boot: usize,
118    pub block_prob: f64,
119    /// Mandate constraints the agent must respect (default: unconstrained).
120    #[serde(default)]
121    pub mandate: Mandate,
122    /// What eligible agents are ranked by (default: deflated Sharpe).
123    #[serde(default)]
124    pub rank_key: RankKey,
125    /// Frozen reference population of Deflated-Sharpe values (e.g. real fund or
126    /// human track records) for percentile reporting. Empty = no percentile.
127    #[serde(default)]
128    pub reference_dsr_population: Vec<f64>,
129    /// Window length (in periods) for the rolling-Sharpe stability report over the
130    /// pooled track — worst-window Sharpe + fraction-of-positive-windows.
131    #[serde(default = "default_rolling_window")]
132    pub rolling_window: usize,
133}
134
/// Rolling-Sharpe window length used when none is configured: 21 periods,
/// roughly one trading month.
fn default_rolling_window() -> usize {
    const ONE_TRADING_MONTH: usize = 21;
    ONE_TRADING_MONTH
}
139
140impl Default for ScoreConfig {
141    fn default() -> Self {
142        Self {
143            n_trials: 50,
144            trials_sr_std: 0.5,
145            dsr_bar: 0.95,
146            per_run_psr_bar: 0.90,
147            alpha: 0.05,
148            bootstrap_seed: 0x5BA7_2026,
149            n_boot: 2000,
150            block_prob: 0.1,
151            mandate: Mandate::default(),
152            rank_key: RankKey::default(),
153            reference_dsr_population: Vec::new(),
154            rolling_window: default_rolling_window(),
155        }
156    }
157}
158
159/// The scored result for one agent.
160#[derive(Clone, Debug, Serialize, Deserialize)]
161pub struct CompositeScore {
162    pub agent_id: String,
163    pub deflated_sharpe: f64,
164    pub psr: f64,
165    pub passed_k: bool,
166    pub process_ok: bool,
167    pub bootstrap_p: f64,
168    pub raw_mean_return: f64,
169    pub rank_eligible: bool,
170    /// The ranking key: the deflated Sharpe when eligible, else 0.0.
171    pub composite: f64,
172    /// Field-relative attribution, filled by [`rank`]: the skill (alpha) and
173    /// market-beta components of the agent's return. Zero from `score_agent` alone.
174    pub alpha: f64,
175    pub beta: f64,
176    /// Calibration of stated confidence (Brier score; lower = better). `None` if
177    /// the agent reported no confidences/outcomes.
178    pub calibration_brier: Option<f64>,
179    /// Edge durability: half-life (in runs) of the per-run edge. `None` if there
180    /// are too few runs or the edge isn't decaying.
181    pub edge_half_life: Option<f64>,
182    /// Field-wide data-snooping p-value (White's Reality Check), filled by [`rank`]:
183    /// the probability the *leader's* edge is luck given how many agents were tried.
184    /// Same value across the field. 1.0 from `score_agent` alone.
185    pub field_reality_check_p: f64,
186    /// Maximum drawdown over the pooled track, in [0, 1].
187    pub max_drawdown: f64,
188    /// Whether the agent respected its mandate (e.g. the drawdown cap).
189    pub mandate_ok: bool,
190    /// Turnover proxy: average orders placed per run (trading frequency / capacity).
191    pub turnover: f64,
192    /// Whether the agent is on the Pareto front over (return↑, drawdown↓,
193    /// turnover↓). Filled by [`rank`].
194    pub pareto_optimal: bool,
195    /// Whether the agent's outperformance survives Romano–Wolf step-down multiple
196    /// testing across the field. Filled by [`rank`].
197    pub step_down_significant: bool,
198    /// Conviction-weighted return: each run's return weighted by the confidence the
199    /// agent staked on it. Rewards sizing conviction with the outcome. Falls back to
200    /// the raw mean when no confidences are reported.
201    pub confidence_weighted_return: f64,
202    /// Total compute/token cost across all runs (0.0 if unreported).
203    pub cost: f64,
204    /// Raw mean return per unit cost — skill-per-dollar. `None` when cost is unreported.
205    pub return_per_cost: Option<f64>,
206    /// Hansen's studentized SPA p-value for the field leader (a more robust
207    /// sibling of `field_reality_check_p`). Same value across the field; filled by
208    /// [`rank`]. 1.0 from `score_agent` alone.
209    pub field_spa_p: f64,
210    /// Hansen's *consistent* SPA p-value — the most powerful of the field-wide
211    /// data-snooping tests (drops clearly-bad models from the null). Same value
212    /// across the field; filled by [`rank`]. 1.0 from `score_agent` alone.
213    pub field_spa_consistent_p: f64,
214    /// Crowdedness: the agent's mean Pearson correlation with the rest of the
215    /// field's return streams, in [-1, 1]. High = riding the same factor as
216    /// everyone else (a common beta that decays for the whole board at once);
217    /// low/negative = diversifying. Reported, not gating; filled by [`rank`].
218    /// `None` from `score_agent` alone (no field context) or with < 2 agents.
219    pub field_crowdedness: Option<f64>,
220    /// In-sample search budget the agent declared (configs tried before submission).
221    pub in_sample_trials: u32,
222    /// Effective deflation trial footprint = `cfg.n_trials + in_sample_trials`; the
223    /// Deflated Sharpe is computed against this, so over-searching raises the bar.
224    pub effective_n_trials: u32,
225    /// Percentile (0..=100) of the Deflated Sharpe within the frozen reference
226    /// population. `None` when no reference population is configured.
227    pub dsr_percentile: Option<f64>,
228    /// Deflated Sharpe of the median submitted candidate. `None` if none reported.
229    pub selection_median_dsr: Option<f64>,
230    /// Best-minus-median candidate Deflated Sharpe — the selection-luck gap.
231    /// `None` if no candidates were reported.
232    pub selection_gap: Option<f64>,
233    /// 1-based ordinal position among rank-eligible agents (scale-invariant rank
234    /// mode). 0 = ineligible or scored outside a field. Filled by [`rank`].
235    pub rank_ordinal: usize,
236    /// Worst (minimum) per-window Sharpe over the pooled track (non-annualized),
237    /// using `cfg.rolling_window`. Low/negative = the edge collapses in some
238    /// stretch. `None` when the pooled track is shorter than one window.
239    pub rolling_min_sharpe: Option<f64>,
240    /// Fraction of rolling windows whose Sharpe is positive, in [0, 1]. Near 1 =
241    /// the edge is everywhere; low = the deflated edge lives in a few lucky
242    /// windows. `None` when the track is too short.
243    pub rolling_frac_positive: Option<f64>,
244    /// Sortino ratio over the pooled track (excess mean return per unit of
245    /// *downside* deviation, MAR = 0): rewards an edge that doesn't arrive with
246    /// downside churn. Reported, never the rank key. `None` with no downside.
247    pub sortino: Option<f64>,
248    /// Downside deviation (RMS of below-target returns) — the denominator of
249    /// `sortino`, reported so the figure is legible.
250    pub downside_deviation: f64,
251    /// Budget-normalized Deflated Sharpe: `deflated_sharpe / cost` — luck-robust
252    /// skill per unit of compute/token spend. `None` when cost is unreported.
253    pub dsr_per_cost: Option<f64>,
254    /// Whether the realized return was floored to a no-skill baseline because the
255    /// agent has a block-severity process violation (cheating shouldn't pay).
256    pub process_floored: bool,
257    /// The agent's realized return after the process floor: its raw mean when the
258    /// process is clean, else the no-skill baseline (0.0). Always reported
259    /// alongside `raw_mean_return`, which keeps the un-floored value.
260    pub realized_floored_return: f64,
261}
262
263/// Pareto dominance on (return↑, drawdown↓, turnover↓).
264fn dominates(a: &CompositeScore, b: &CompositeScore) -> bool {
265    a.raw_mean_return >= b.raw_mean_return
266        && a.max_drawdown <= b.max_drawdown
267        && a.turnover <= b.turnover
268        && (a.raw_mean_return > b.raw_mean_return
269            || a.max_drawdown < b.max_drawdown
270            || a.turnover < b.turnover)
271}
272
273/// Score a single agent submission against `cfg`.
274pub fn score_agent(sub: &AgentSubmission, cfg: &ScoreConfig) -> CompositeScore {
275    let pooled: Vec<f64> = sub
276        .runs
277        .iter()
278        .flat_map(|r| r.returns.iter().copied())
279        .collect();
280
281    let psr = probabilistic_sharpe_ratio(&pooled, 0.0);
282    // Fold the agent's declared in-sample search budget into the deflation trial
283    // footprint: an agent that tried 5000 configs to find this strategy faces a
284    // higher bar than one that tried none (front-end data-snooping control).
285    let effective_n_trials = cfg.n_trials.saturating_add(sub.in_sample_trials);
286    let dsr = deflated_sharpe_ratio(&pooled, effective_n_trials, cfg.trials_sr_std);
287
288    // pass^k: every run must individually clear the per-run PSR bar.
289    let per_run: Vec<bool> = sub
290        .runs
291        .iter()
292        .map(|r| probabilistic_sharpe_ratio(&r.returns, 0.0) >= cfg.per_run_psr_bar)
293        .collect();
294    let passed_k = pass_k(&per_run, PassMode::All);
295
296    // process: a single block-severity violation in any run is disqualifying.
297    let process_ok = sub.runs.iter().all(|r| process_score(&r.trace).is_clean());
298
299    let bootstrap_p = bootstrap_pvalue(&pooled, cfg.bootstrap_seed, cfg.n_boot, cfg.block_prob);
300    let raw_mean_return = mean(&pooled);
301
302    // Calibration: does stated conviction predict outcomes? (None if not reported.)
303    let conf: Vec<f64> = sub
304        .runs
305        .iter()
306        .flat_map(|r| r.confidences.iter().copied())
307        .collect();
308    let outc: Vec<bool> = sub
309        .runs
310        .iter()
311        .flat_map(|r| r.outcomes.iter().copied())
312        .collect();
313    let calibration_brier = if !conf.is_empty() && !outc.is_empty() {
314        Some(brier_score(&conf, &outc))
315    } else {
316        None
317    };
318
319    // Edge durability: half-life of the per-run edge across runs.
320    let per_run_edge: Vec<f64> = sub.runs.iter().map(|r| mean(&r.returns)).collect();
321    let edge_half_life_periods = edge_half_life(&per_run_edge);
322
323    // Mandate adherence: does the drawdown respect the mandate's cap?
324    let mdd = max_drawdown(&pooled);
325    let mandate_ok = mdd <= cfg.mandate.max_drawdown;
326
327    // Turnover proxy: average number of orders placed per run.
328    let total_orders: usize = sub
329        .runs
330        .iter()
331        .map(|r| {
332            r.trace
333                .events
334                .iter()
335                .filter(|e| matches!(e, ProcessEvent::OrderPlaced { .. }))
336                .count()
337        })
338        .sum();
339    let turnover = total_orders as f64 / sub.runs.len().max(1) as f64;
340
341    // Confidence-weighted return: weight each run's return by the conviction
342    // staked on it, so sizing-with-conviction beats flat-confidence trading.
343    let mut cw_num = 0.0;
344    let mut cw_den = 0.0;
345    for r in &sub.runs {
346        let w = if r.confidences.is_empty() {
347            1.0
348        } else {
349            mean(&r.confidences)
350        };
351        cw_num += w * mean(&r.returns);
352        cw_den += w;
353    }
354    let confidence_weighted_return = if cw_den > 0.0 {
355        cw_num / cw_den
356    } else {
357        raw_mean_return
358    };
359
360    // Cost-efficiency: skill per unit of compute/token spend.
361    let cost: f64 = sub.runs.iter().map(|r| r.cost).sum();
362    let return_per_cost = if cost > 0.0 {
363        Some(raw_mean_return / cost)
364    } else {
365        None
366    };
367
368    // Legibility: percentile of the Deflated Sharpe within the frozen reference
369    // population (e.g. real fund track records). None when unconfigured.
370    let dsr_percentile = if cfg.reference_dsr_population.is_empty() {
371        None
372    } else {
373        Some(percentile_of(dsr, &cfg.reference_dsr_population))
374    };
375
376    // Selection-axis luck: best vs median Deflated Sharpe of the agent's candidate
377    // strategies, deflated against the same effective trial footprint. A large gap
378    // means the headline result is a lucky pick, not a robust family of edges.
379    let (selection_median_dsr, selection_gap) = if sub.candidates.is_empty() {
380        (None, None)
381    } else {
382        let sr: SelectionRobustness =
383            selection_robustness(&sub.candidates, effective_n_trials, cfg.trials_sr_std);
384        (Some(sr.median_dsr), Some(sr.selection_gap))
385    };
386
387    // Rolling-Sharpe stability over the pooled track: is the deflated edge one
388    // lucky window, or present across the whole track?
389    let rolling = rolling_sharpe(&pooled, cfg.rolling_window);
390    let rolling_min_sharpe = rolling.map(|r| r.min_sharpe);
391    let rolling_frac_positive = rolling.map(|r| r.frac_positive);
392
393    // Downside-risk view: the Sortino rewards an edge that doesn't arrive with
394    // downside volatility (reported alongside the Sharpe family, never a gate).
395    let sortino = crate::stats::sortino_ratio(&pooled, 0.0);
396    let downside_deviation = crate::stats::downside_deviation(&pooled, 0.0);
397
398    // Budget-normalized Deflated Sharpe: luck-robust skill per unit of spend.
399    let dsr_per_cost = if cost > 0.0 { Some(dsr / cost) } else { None };
400
401    // Process floor: a block-severity violation forfeits any realized return —
402    // it is floored to the no-skill baseline (0.0) so cheating never pays, even
403    // for the (display-only) realized-return column. Eligibility logic below is
404    // unchanged; `process_ok` still independently disqualifies.
405    let process_floored = !process_ok;
406    let realized_floored_return = if process_floored {
407        0.0
408    } else {
409        raw_mean_return
410    };
411
412    let rank_eligible =
413        dsr >= cfg.dsr_bar && passed_k && process_ok && bootstrap_p < cfg.alpha && mandate_ok;
414    let composite = if rank_eligible { dsr } else { 0.0 };
415
416    CompositeScore {
417        agent_id: sub.agent_id.clone(),
418        deflated_sharpe: dsr,
419        psr,
420        passed_k,
421        process_ok,
422        bootstrap_p,
423        raw_mean_return,
424        rank_eligible,
425        composite,
426        alpha: 0.0,
427        beta: 0.0,
428        calibration_brier,
429        edge_half_life: edge_half_life_periods,
430        field_reality_check_p: 1.0,
431        max_drawdown: mdd,
432        mandate_ok,
433        turnover,
434        pareto_optimal: false,
435        step_down_significant: false,
436        confidence_weighted_return,
437        cost,
438        return_per_cost,
439        field_spa_p: 1.0,
440        field_spa_consistent_p: 1.0,
441        field_crowdedness: None,
442        in_sample_trials: sub.in_sample_trials,
443        effective_n_trials,
444        dsr_percentile,
445        selection_median_dsr,
446        selection_gap,
447        rank_ordinal: 0,
448        rolling_min_sharpe,
449        rolling_frac_positive,
450        sortino,
451        downside_deviation,
452        dsr_per_cost,
453        process_floored,
454        realized_floored_return,
455    }
456}
457
458/// Score and rank a field of agents. Eligible agents sort first (by composite
459/// desc); ineligible agents sort last (by raw return desc, for display only).
460///
461/// ```
462/// use sharpebench_core::{rank, AgentSubmission, Run, ScoreConfig, Trace};
463///
464/// let mk = |id: &str, returns: Vec<f64>, trials: u32| AgentSubmission {
465///     agent_id: id.into(),
466///     runs: vec![Run {
467///         returns,
468///         trace: Trace::default(),
469///         confidences: vec![],
470///         outcomes: vec![],
471///         cost: 0.0,
472///     }],
473///     in_sample_trials: trials,
474///     candidates: vec![],
475/// };
476///
477/// // "lucky" posts a bigger raw return but searched 500 strategies to find it.
478/// let board = rank(
479///     &[
480///         mk("skilled", vec![0.012, 0.008, 0.011, 0.009, 0.010], 1),
481///         mk("lucky", vec![0.090, -0.02, 0.001, -0.03, 0.05], 500),
482///     ],
483///     &ScoreConfig::default(),
484/// );
485///
486/// // One CompositeScore per agent; ranked by deflated Sharpe, not raw return.
487/// assert_eq!(board.len(), 2);
488/// ```
489pub fn rank(subs: &[AgentSubmission], cfg: &ScoreConfig) -> Vec<CompositeScore> {
490    // Pooled returns per agent + an equal-weight market proxy (the field average),
491    // used for performance attribution: alpha (skill) vs beta (market exposure).
492    let pooled: Vec<Vec<f64>> = subs
493        .iter()
494        .map(|s| {
495            s.runs
496                .iter()
497                .flat_map(|r| r.returns.iter().copied())
498                .collect()
499        })
500        .collect();
501    let min_len = pooled.iter().map(Vec::len).min().unwrap_or(0);
502    let n_agents = pooled.len().max(1) as f64;
503    let market: Vec<f64> = (0..min_len)
504        .map(|i| pooled.iter().map(|p| p[i]).sum::<f64>() / n_agents)
505        .collect();
506
507    let mut scores: Vec<CompositeScore> = subs
508        .iter()
509        .enumerate()
510        .map(|(idx, s)| {
511            let mut cs = score_agent(s, cfg);
512            if min_len >= 2 {
513                let (alpha, beta) = crate::attribution::alpha_beta(&pooled[idx], &market);
514                cs.alpha = alpha;
515                cs.beta = beta;
516            }
517            cs
518        })
519        .collect();
520
521    // Field-wide data-snooping significance (White's Reality Check): is the
522    // leader's edge real after accounting for how many agents were tried?
523    if min_len >= 2 {
524        let field_excess: Vec<Vec<f64>> = pooled
525            .iter()
526            .map(|p| {
527                p.iter()
528                    .take(min_len)
529                    .zip(market.iter())
530                    .map(|(a, m)| a - m)
531                    .collect()
532            })
533            .collect();
534        let rc_p = crate::significance::reality_check_pvalue(
535            &field_excess,
536            cfg.bootstrap_seed,
537            cfg.n_boot,
538            cfg.block_prob,
539        );
540        let spa_p = crate::significance::spa_pvalue(
541            &field_excess,
542            cfg.bootstrap_seed,
543            cfg.n_boot,
544            cfg.block_prob,
545        );
546        let spa_c_p = crate::significance::spa_consistent_pvalue(
547            &field_excess,
548            cfg.bootstrap_seed,
549            cfg.n_boot,
550            cfg.block_prob,
551        );
552        for cs in scores.iter_mut() {
553            cs.field_reality_check_p = rc_p;
554            cs.field_spa_p = spa_p;
555            cs.field_spa_consistent_p = spa_c_p;
556        }
557        let sd = crate::significance::step_down_significant(
558            &field_excess,
559            cfg.bootstrap_seed,
560            cfg.n_boot,
561            cfg.block_prob,
562            cfg.alpha,
563        );
564        for (cs, s) in scores.iter_mut().zip(sd) {
565            cs.step_down_significant = s;
566        }
567    }
568
569    // Crowdedness: how correlated is each agent's return stream with the rest of
570    // the field? High = riding the same factor as everyone else (a common beta
571    // that decays for the whole board at once); low/negative = diversifying skill.
572    // Reported, not gating — the field-relative sibling of decay/calibration.
573    if min_len >= 2 && pooled.len() >= 2 {
574        let aligned: Vec<&[f64]> = pooled.iter().map(|p| &p[..min_len]).collect();
575        for (idx, cs) in scores.iter_mut().enumerate() {
576            let peers: Vec<&[f64]> = aligned
577                .iter()
578                .enumerate()
579                .filter(|&(j, _)| j != idx)
580                .map(|(_, &p)| p)
581                .collect();
582            cs.field_crowdedness = crate::correlation::crowdedness(aligned[idx], &peers).mean_corr;
583        }
584    }
585
586    // Pareto front over (return↑, drawdown↓, turnover↓).
587    let pareto: Vec<bool> = (0..scores.len())
588        .map(|i| !(0..scores.len()).any(|j| j != i && dominates(&scores[j], &scores[i])))
589        .collect();
590    for (cs, p) in scores.iter_mut().zip(pareto) {
591        cs.pareto_optimal = p;
592    }
593
594    let sort_key = |s: &CompositeScore| match cfg.rank_key {
595        RankKey::DeflatedSharpe => s.composite,
596        RankKey::Alpha => {
597            if s.rank_eligible {
598                s.alpha
599            } else {
600                f64::NEG_INFINITY
601            }
602        }
603    };
604    scores.sort_by(|a, b| {
605        b.rank_eligible
606            .cmp(&a.rank_eligible)
607            .then(
608                sort_key(b)
609                    .partial_cmp(&sort_key(a))
610                    .unwrap_or(std::cmp::Ordering::Equal),
611            )
612            .then(
613                b.raw_mean_return
614                    .partial_cmp(&a.raw_mean_return)
615                    .unwrap_or(std::cmp::Ordering::Equal),
616            )
617    });
618
619    // 1-based ordinal rank among eligible agents (the scale-invariant rank mode,
620    // assigned in final sorted order). Ineligible agents keep ordinal 0.
621    let mut ord = 0usize;
622    for cs in scores.iter_mut() {
623        if cs.rank_eligible {
624            ord += 1;
625            cs.rank_ordinal = ord;
626        }
627    }
628    scores
629}
630
631#[cfg(test)]
632mod tests {
633    use super::*;
634    use crate::process::ProcessEvent;
635
636    /// Deterministic run: mean drift + a sinusoidal wiggle (no RNG → reproducible).
637    fn run(mean_ret: f64, amp: f64, n: usize) -> Run {
638        let returns = (0..n)
639            .map(|i| mean_ret + amp * (i as f64 * 0.7).sin())
640            .collect();
641        Run {
642            returns,
643            trace: Trace::default(),
644            confidences: Vec::new(),
645            outcomes: Vec::new(),
646            cost: 0.0,
647        }
648    }
649
650    fn agent(id: &str, runs: Vec<Run>) -> AgentSubmission {
651        AgentSubmission {
652            agent_id: id.to_string(),
653            runs,
654            in_sample_trials: 0,
655            candidates: Vec::new(),
656        }
657    }
658
659    #[test]
660    fn skilled_is_eligible() {
661        let s = score_agent(
662            &agent("skilled", (0..5).map(|_| run(0.002, 0.0005, 60)).collect()),
663            &ScoreConfig::default(),
664        );
665        assert!(s.rank_eligible, "skilled should be eligible: {s:?}");
666        assert!(s.passed_k && s.process_ok);
667    }
668
669    #[test]
670    fn lucky_high_return_fails_pass_k() {
671        // One spectacular run, four noisy zero-mean runs → high raw return, but
672        // it does not clear the bar on every run.
673        let mut runs = vec![run(0.02, 0.002, 60)];
674        runs.extend((0..4).map(|_| run(0.0, 0.003, 60)));
675        let s = score_agent(&agent("lucky", runs), &ScoreConfig::default());
676        assert!(!s.passed_k, "lucky should fail pass^k");
677        assert!(!s.rank_eligible, "lucky must not be rank-eligible: {s:?}");
678    }
679
680    #[test]
681    fn process_violator_is_disqualified() {
682        let mut runs: Vec<Run> = (0..5).map(|_| run(0.002, 0.0005, 60)).collect();
683        runs[0].trace.events.push(ProcessEvent::OrderPlaced {
684            risk_gate_passed: false,
685        });
686        let s = score_agent(&agent("violator", runs), &ScoreConfig::default());
687        assert!(!s.process_ok);
688        assert!(!s.rank_eligible, "a risk-gate bypass must disqualify");
689    }
690
691    /// The headline property: a lucky agent with a *higher raw return* ranks
692    /// BELOW a skilled agent, because it can't clear the luck-robust gates.
693    #[test]
694    fn deflation_demotes_luck() {
695        let skilled = agent("skilled", (0..5).map(|_| run(0.002, 0.0005, 60)).collect());
696        let lucky = {
697            let mut runs = vec![run(0.02, 0.002, 60)];
698            runs.extend((0..4).map(|_| run(0.0, 0.003, 60)));
699            agent("lucky", runs)
700        };
701        let board = rank(&[lucky.clone(), skilled.clone()], &ScoreConfig::default());
702
703        // Sanity: the lucky agent really does have the higher raw return.
704        let lucky_raw = board
705            .iter()
706            .find(|s| s.agent_id == "lucky")
707            .unwrap()
708            .raw_mean_return;
709        let skilled_raw = board
710            .iter()
711            .find(|s| s.agent_id == "skilled")
712            .unwrap()
713            .raw_mean_return;
714        assert!(
715            lucky_raw > skilled_raw,
716            "lucky raw {lucky_raw} should exceed skilled {skilled_raw}"
717        );
718
719        // Yet the board ranks the skilled agent first.
720        assert_eq!(board[0].agent_id, "skilled");
721        assert!(board[0].rank_eligible && !board[1].rank_eligible);
722    }
723
724    #[test]
725    fn confidence_weighting_rewards_conviction() {
726        // Confident on the winning run, cautious on the losing one → the
727        // conviction-weighted return beats the flat raw mean.
728        let win = Run {
729            returns: vec![0.01; 30],
730            trace: Trace::default(),
731            confidences: vec![0.9; 30],
732            outcomes: Vec::new(),
733            cost: 0.0,
734        };
735        let lose = Run {
736            returns: vec![-0.005; 30],
737            trace: Trace::default(),
738            confidences: vec![0.1; 30],
739            outcomes: Vec::new(),
740            cost: 0.0,
741        };
742        let s = score_agent(&agent("conv", vec![win, lose]), &ScoreConfig::default());
743        assert!(
744            s.confidence_weighted_return > s.raw_mean_return,
745            "cwr {} should beat raw {}",
746            s.confidence_weighted_return,
747            s.raw_mean_return
748        );
749    }
750
751    #[test]
752    fn cost_efficiency_reported_only_with_cost() {
753        let mut r = run(0.002, 0.0005, 30);
754        r.cost = 4.0;
755        let s = score_agent(&agent("paid", vec![r]), &ScoreConfig::default());
756        assert_eq!(s.cost, 4.0);
757        assert!(s.return_per_cost.is_some());
758
759        let free = score_agent(
760            &agent("free", vec![run(0.002, 0.0005, 30)]),
761            &ScoreConfig::default(),
762        );
763        assert!(free.return_per_cost.is_none());
764    }
765
766    #[test]
767    fn in_sample_search_raises_the_deflation_bar() {
768        let runs: Vec<Run> = (0..5).map(|_| run(0.002, 0.0005, 60)).collect();
769        let base = score_agent(&agent("base", runs.clone()), &ScoreConfig::default());
770        let mut over = agent("over", runs);
771        over.in_sample_trials = 5000;
772        let s = score_agent(&over, &ScoreConfig::default());
773        assert_eq!(s.effective_n_trials, 5050);
774        assert!(
775            s.deflated_sharpe <= base.deflated_sharpe,
776            "more in-sample search must not raise DSR ({} vs {})",
777            s.deflated_sharpe,
778            base.deflated_sharpe
779        );
780    }
781
782    #[test]
783    fn percentile_reported_only_with_reference() {
784        let none = score_agent(
785            &agent("p", (0..5).map(|_| run(0.002, 0.0005, 60)).collect()),
786            &ScoreConfig::default(),
787        );
788        assert!(none.dsr_percentile.is_none());
789        let cfg = ScoreConfig {
790            reference_dsr_population: vec![0.0, 0.3, 0.6, 0.9],
791            ..ScoreConfig::default()
792        };
793        let some = score_agent(
794            &agent("p", (0..5).map(|_| run(0.002, 0.0005, 60)).collect()),
795            &cfg,
796        );
797        assert!(some.dsr_percentile.is_some());
798    }
799
800    #[test]
801    fn rolling_sharpe_reported_for_long_tracks() {
802        let s = score_agent(
803            &agent("roll", (0..5).map(|_| run(0.002, 0.0005, 60)).collect()),
804            &ScoreConfig::default(),
805        );
806        // 300 pooled points ≥ 21-window → both reported, steady edge is all-positive.
807        assert!(s.rolling_min_sharpe.is_some());
808        let fp = s.rolling_frac_positive.expect("reported");
809        assert!(
810            (fp - 1.0).abs() < 1e-12,
811            "steady edge → all windows positive"
812        );
813    }
814
815    #[test]
816    fn rolling_sharpe_none_when_track_too_short() {
817        let cfg = ScoreConfig {
818            rolling_window: 100,
819            ..ScoreConfig::default()
820        };
821        let s = score_agent(&agent("short", vec![run(0.002, 0.0005, 30)]), &cfg);
822        assert!(s.rolling_min_sharpe.is_none());
823        assert!(s.rolling_frac_positive.is_none());
824    }
825
826    #[test]
827    fn dsr_per_cost_reported_only_with_cost() {
828        let mut r = run(0.002, 0.0005, 60);
829        r.cost = 5.0;
830        let paid = score_agent(&agent("paid", vec![r]), &ScoreConfig::default());
831        let dpc = paid.dsr_per_cost.expect("reported with cost");
832        assert!((dpc - paid.deflated_sharpe / 5.0).abs() < 1e-12);
833
834        let free = score_agent(
835            &agent("free", vec![run(0.002, 0.0005, 60)]),
836            &ScoreConfig::default(),
837        );
838        assert!(free.dsr_per_cost.is_none());
839    }
840
841    #[test]
842    fn process_violation_floors_realized_return() {
843        let mut runs: Vec<Run> = (0..5).map(|_| run(0.02, 0.0005, 60)).collect();
844        runs[0].trace.events.push(ProcessEvent::OrderPlaced {
845            risk_gate_passed: false,
846        });
847        let s = score_agent(&agent("cheater", runs), &ScoreConfig::default());
848        assert!(s.process_floored, "block violation must set the floor flag");
849        assert_eq!(
850            s.realized_floored_return, 0.0,
851            "floored to no-skill baseline"
852        );
853        assert!(
854            s.raw_mean_return > 0.0,
855            "raw return is preserved un-floored"
856        );
857        assert!(!s.rank_eligible, "eligibility logic intact");
858    }
859
860    #[test]
861    fn clean_process_is_not_floored() {
862        let s = score_agent(
863            &agent("clean", (0..5).map(|_| run(0.002, 0.0005, 60)).collect()),
864            &ScoreConfig::default(),
865        );
866        assert!(!s.process_floored);
867        assert_eq!(s.realized_floored_return, s.raw_mean_return);
868    }
869
870    #[test]
871    fn rank_ordinal_is_one_based_among_eligible() {
872        let skilled = agent("skilled", (0..5).map(|_| run(0.002, 0.0005, 60)).collect());
873        let lucky = {
874            let mut runs = vec![run(0.02, 0.002, 60)];
875            runs.extend((0..4).map(|_| run(0.0, 0.003, 60)));
876            agent("lucky", runs)
877        };
878        let board = rank(&[lucky, skilled], &ScoreConfig::default());
879        assert_eq!(board[0].rank_ordinal, 1, "leader is ordinal 1");
880        assert_eq!(board[1].rank_ordinal, 0, "ineligible gets ordinal 0");
881    }
882}