costroid_core/
bench.rs

1//! Cost-vs-quality benchmark frontier.
2//!
3//! Loads a bundled, dated, cited benchmark snapshot (`bench/benchmarks.v1.json`),
4//! computes the Pareto-efficient frontier *per benchmark*, and overlays the user's
5//! actual API-billed model mix and Costroid's own cache-correct spend. It **informs,
6//! never prescribes** (ARCHITECTURE.md §2, §9.6): it sees spend + benchmark scores but
7//! not task difficulty, so every figure is advisory and carries its sources + date.
8//!
9//! The benchmark `cost_per_task_usd` is a task-average used only for the reference
10//! frontier's cost axis — never for the user's bill. The dollar delta and the user's
11//! actual spend always use the pricing catalog / Costroid's cache-correct cost.
12
13use std::collections::{BTreeMap, BTreeSet};
14
15use chrono::{DateTime, NaiveDate, Utc};
16use rust_decimal::Decimal;
17use serde::{Deserialize, Serialize};
18
19use crate::{
20    decimal_to_u64, CoreError, CostLane, EngineSnapshot, PricingCatalog, ProviderStatus,
21    TokenTotals,
22};
23
/// The only `schema_version` value the bundled benchmark table may declare;
/// `BenchmarkTable::validate` rejects anything else.
const BENCH_SCHEMA_VERSION: &str = "1";

/// The hedge that travels with every re-pricing delta. Cost only, never quality.
const DISCLAIMER_NOTE: &str = "~ cost-only comparison at equal token volume; not a quality claim.";

/// The four token meters Costroid prices, in a stable order.
const METERS: [&str; 4] = ["input", "output", "cache_read", "cache_write"];
31
32fn bundled_benchmarks_json() -> &'static str {
33    // Bundled inside this crate (sibling of pricing/) so `cargo package` includes it
34    // and the crate publishes standalone — exactly like pricing.v1.json.
35    include_str!("../bench/benchmarks.v1.json")
36}
37
38// ---------------------------------------------------------------------------
39// Bundled-JSON parse structs (private; Deserialize only — mirror PricingTable).
40// ---------------------------------------------------------------------------
41
/// Top-level shape of the bundled benchmark JSON document.
#[derive(Debug, Deserialize)]
struct BenchmarkTable {
    /// Compared against `BENCH_SCHEMA_VERSION` during validation.
    schema_version: String,
    /// Benchmarks in file order. Defaults to empty when the key is absent, which
    /// validation then rejects.
    #[serde(default)]
    benchmarks: Vec<Benchmark>,
}
48
/// One benchmark entry as it appears in the bundled JSON.
#[derive(Debug, Deserialize)]
struct Benchmark {
    /// Benchmark name; used in validation error messages and copied to the output.
    name: String,
    /// Role string, copied to the output unchanged.
    role: String,
    /// Citation for the data; validation rejects a blank (whitespace-only) value.
    source: String,
    /// Snapshot date; validation requires it to parse with the `%Y-%m-%d` format.
    as_of: String,
    /// Optional `harness` value; `None` when the key is absent.
    #[serde(default)]
    harness: Option<String>,
    /// The entry's `cost_note` string, copied to the output unchanged.
    cost_note: String,
    /// Scored points; validation requires at least one and unique `model_id`s.
    #[serde(default)]
    points: Vec<BenchmarkPoint>,
}
61
/// One model's result on a benchmark, as it appears in the bundled JSON.
#[derive(Debug, Deserialize)]
struct BenchmarkPoint {
    /// Model identifier; must be unique within its benchmark.
    model_id: String,
    /// Display label for the model.
    label: String,
    /// Score as a percentage; validation requires `0..=100`.
    score_pct: Decimal,
    /// Per-task cost in USD. `None` when absent (the point is then `CostUnknown`);
    /// validation rejects negative values.
    #[serde(default)]
    cost_per_task_usd: Option<Decimal>,
    /// Optional free-text note, copied to the output unchanged.
    #[serde(default)]
    note: Option<String>,
}
72
73impl BenchmarkTable {
74    fn bundled() -> Result<Self, CoreError> {
75        Self::from_json(bundled_benchmarks_json())
76    }
77
78    fn from_json(value: &str) -> Result<Self, CoreError> {
79        // CoreError already owns `From<serde_json::Error>` for the pricing path, so we
80        // cannot add a second `#[from]`; map parse errors to BenchValidation by hand.
81        let table: BenchmarkTable = serde_json::from_str(value).map_err(|err| {
82            CoreError::BenchValidation(format!("benchmark JSON parse error: {err}"))
83        })?;
84        table.validate()?;
85        Ok(table)
86    }
87
88    /// Fail-closed validation. A missing/sentinel/unparseable `as_of` is rejected so a
89    /// stale or uncited date can never ship (the permanent guard).
90    fn validate(&self) -> Result<(), CoreError> {
91        if self.schema_version != BENCH_SCHEMA_VERSION {
92            return Err(CoreError::BenchValidation(format!(
93                "unsupported schema_version {}; expected {}",
94                self.schema_version, BENCH_SCHEMA_VERSION
95            )));
96        }
97        if self.benchmarks.is_empty() {
98            return Err(CoreError::BenchValidation(
99                "bundled benchmark table has no benchmarks".to_string(),
100            ));
101        }
102        for benchmark in &self.benchmarks {
103            if benchmark.source.trim().is_empty() {
104                return Err(CoreError::BenchValidation(format!(
105                    "benchmark {} has an empty source",
106                    benchmark.name
107                )));
108            }
109            if NaiveDate::parse_from_str(benchmark.as_of.trim(), "%Y-%m-%d").is_err() {
110                return Err(CoreError::BenchValidation(format!(
111                    "benchmark {} has an invalid as_of {:?}; expected YYYY-MM-DD",
112                    benchmark.name, benchmark.as_of
113                )));
114            }
115            if benchmark.points.is_empty() {
116                return Err(CoreError::BenchValidation(format!(
117                    "benchmark {} has no points",
118                    benchmark.name
119                )));
120            }
121            let mut seen = BTreeSet::new();
122            for point in &benchmark.points {
123                if !seen.insert(point.model_id.as_str()) {
124                    return Err(CoreError::BenchValidation(format!(
125                        "benchmark {} has duplicate model_id {}",
126                        benchmark.name, point.model_id
127                    )));
128                }
129                if point.score_pct < Decimal::ZERO || point.score_pct > Decimal::from(100) {
130                    return Err(CoreError::BenchValidation(format!(
131                        "benchmark {} model {} score_pct {} is outside 0..=100",
132                        benchmark.name, point.model_id, point.score_pct
133                    )));
134                }
135                if matches!(point.cost_per_task_usd, Some(cost) if cost < Decimal::ZERO) {
136                    return Err(CoreError::BenchValidation(format!(
137                        "benchmark {} model {} has a negative cost",
138                        benchmark.name, point.model_id
139                    )));
140                }
141            }
142        }
143        Ok(())
144    }
145}
146
147// ---------------------------------------------------------------------------
148// Public output types (Serialize; the CLI renders them). Decimal is not Eq, so
149// no type carrying a Decimal derives Eq (consistent with AggregateTotals).
150// ---------------------------------------------------------------------------
151
/// One benchmark after dominance is computed.
///
/// All metadata fields are copied unchanged from the bundled snapshot entry.
#[derive(Debug, Clone, PartialEq, Serialize)]
pub struct BenchFrontier {
    /// Benchmark name.
    pub name: String,
    /// The snapshot entry's `role` string.
    pub role: String,
    /// Citation for the data; guaranteed non-blank by load-time validation.
    pub source: String,
    /// Snapshot date string; guaranteed parseable as `%Y-%m-%d` by load-time validation.
    pub as_of: String,
    /// The snapshot entry's optional `harness` value.
    pub harness: Option<String>,
    /// The snapshot entry's `cost_note` string.
    pub cost_note: String,
    /// Every point of the benchmark, in snapshot order, each with its standing.
    pub points: Vec<FrontierPoint>,
}
163
/// A single benchmark point with its frontier standing.
#[derive(Debug, Clone, PartialEq, Serialize)]
pub struct FrontierPoint {
    /// Model identifier as written in the benchmark snapshot.
    pub model_id: String,
    /// Display label for the model.
    pub label: String,
    /// Benchmark score as a percentage (validated to `0..=100` at load time).
    pub score_pct: Decimal,
    /// `None` => no published cost (plotted by score only, cost "n/a").
    pub cost_per_task_usd: Option<Decimal>,
    /// Pareto standing of this point within its own benchmark.
    pub standing: FrontierStanding,
    /// `false` for a benchmark model with no pricing-catalog entry (e.g. composer-2.5)
    /// — never a re-pricing target.
    pub priced_in_catalog: bool,
    /// Optional availability caveat, e.g. "Cursor subscription only - no API access".
    pub note: Option<String>,
}
179
/// Where a point sits relative to its benchmark's cost-quality frontier.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
#[serde(rename_all = "snake_case")]
pub enum FrontierStanding {
    /// Has a known cost and no other costed point in the same benchmark dominates it.
    OnFrontier,
    /// Dominated by another point (cheaper-or-equal AND higher-or-equal, strictly
    /// better on one axis); carries that point's `model_id`.
    Dominated {
        /// `model_id` of the first dominating point found, scanning in snapshot order.
        by: String,
    },
    /// No published cost — cannot be placed on the cost axis; plotted by score only.
    CostUnknown,
}
193
/// An API-billed model the user actually used, overlaid on the frontiers.
#[derive(Debug, Clone, PartialEq, Serialize)]
pub struct OverlayModel {
    /// Resolved pricing-catalog key, or the raw model string when the catalog could
    /// not resolve it.
    pub model_id: String,
    /// The raw `x_model` string from the logs (for display). When several raw strings
    /// resolve to the same key, this is the first one encountered.
    pub raw_model: String,
    /// Costroid's cache-correct actual spend across this model's API rows.
    pub billed_cost: Decimal,
    /// Token totals accumulated across this model's API rows.
    pub tokens: TokenTotals,
    /// Per benchmark this model appears on. Empty => on no bundled benchmark (a gap).
    pub appearances: Vec<OverlayAppearance>,
    /// Equal-volume, cost-only re-pricing comparisons vs the frontier targets.
    pub repricing: Vec<RepricingDelta>,
}
209
/// One benchmark on which a used model has a point.
#[derive(Debug, Clone, PartialEq, Serialize)]
pub struct OverlayAppearance {
    /// Name of the benchmark the point belongs to.
    pub benchmark_name: String,
    /// The model's score on that benchmark.
    pub score_pct: Decimal,
    /// The model's frontier standing on that benchmark.
    pub standing: FrontierStanding,
}
216
/// "~$X cheaper/more at equal token volume" — cost only, never a quality claim.
#[derive(Debug, Clone, PartialEq, Serialize)]
pub struct RepricingDelta {
    /// `model_id` of the frontier model the volume was re-priced against.
    pub target_model_id: String,
    /// Display label of that target model.
    pub target_label: String,
    /// `repriced(target) - actual_billed(this model)`, USD. Negative => target cheaper
    /// at the same token volume. Zero (and ignorable) when `status != Computed`.
    pub delta_usd: Decimal,
    /// Whether `delta_usd` is a real comparison or a placeholder.
    pub status: RepricingStatus,
    /// Benchmarks where the target is on-frontier.
    pub on_frontier_in: Vec<String>,
}
229
/// Outcome of attempting one re-pricing comparison.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize)]
#[serde(rename_all = "snake_case")]
pub enum RepricingStatus {
    /// The target had a rate for every meter used, so the delta is a real number.
    Computed,
    /// Target lacks a catalog rate for a meter this model used → no number (never invented).
    TargetRateGap,
    /// Target is this same model → not a comparison.
    SameModel,
}
239
/// The hedge label + the pricing date the re-pricing math used. Per-benchmark sources
/// and dates live on each [`BenchFrontier`].
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
pub struct BenchDisclaimer {
    /// The fixed cost-only hedge text.
    pub note: &'static str,
    /// The `as_of` value of the pricing catalog used for re-pricing.
    pub pricing_as_of: String,
}
247
/// The full frontier view the CLI renders.
#[derive(Debug, Clone, PartialEq, Serialize)]
pub struct BenchView {
    /// Copied from the snapshot's `generated_at`.
    pub generated_at: DateTime<Utc>,
    /// One entry per bundled benchmark, in snapshot-file order.
    pub frontiers: Vec<BenchFrontier>,
    /// API-billed used models, BTreeMap-ordered. Empty when `no_api_usage`.
    pub overlay: Vec<OverlayModel>,
    /// True when the user has zero API-billed rows: frontiers still render as a
    /// reference, the overlay is empty, and no delta is fabricated.
    pub no_api_usage: bool,
    /// Cost-only hedge plus the pricing date behind the re-pricing numbers.
    pub disclaimer: BenchDisclaimer,
    /// Provider detection status (mirrors the snapshot) so the standalone surface can
    /// surface the same provider notes / no-providers guidance as now/trends.
    pub providers: Vec<ProviderStatus>,
}
263
264// ---------------------------------------------------------------------------
265// Computation.
266// ---------------------------------------------------------------------------
267
268/// Build the frontier view: bundled benchmarks (dominance computed) + an honest
269/// API-billed overlay drawn from the existing snapshot's `focus_rows`.
270pub fn bench_view(snapshot: &EngineSnapshot) -> Result<BenchView, CoreError> {
271    let table = BenchmarkTable::bundled()?;
272    let pricing = PricingCatalog::bundled()?;
273    let frontiers = build_frontiers(&table, &pricing);
274    let disclaimer = BenchDisclaimer {
275        note: DISCLAIMER_NOTE,
276        pricing_as_of: pricing.as_of.clone(),
277    };
278
279    // Group API-lane rows by resolved catalog key. Use the same lane classifier the
280    // now/trends summaries use so "your spend" reconciles exactly and can't drift.
281    let mut accum: BTreeMap<String, OverlayAccum> = BTreeMap::new();
282    for row in &snapshot.focus_rows {
283        if CostLane::from_access_path(&row.x_access_path) != CostLane::Api {
284            continue;
285        }
286        let key = pricing
287            .resolve_key(&row.x_model)
288            .map(str::to_string)
289            .unwrap_or_else(|| row.x_model.clone());
290        let entry = accum.entry(key).or_insert_with(|| OverlayAccum {
291            raw_model: row.x_model.clone(),
292            billed_cost: Decimal::ZERO,
293            tokens: TokenTotals::default(),
294        });
295        entry.billed_cost += row.billed_cost;
296        entry
297            .tokens
298            .add(&row.x_token_type, decimal_to_u64(row.x_consumed_tokens));
299    }
300
301    if accum.is_empty() {
302        return Ok(BenchView {
303            generated_at: snapshot.generated_at,
304            frontiers,
305            overlay: Vec::new(),
306            no_api_usage: true,
307            disclaimer,
308            providers: snapshot.providers.clone(),
309        });
310    }
311
312    let targets = repricing_targets(&frontiers);
313    let overlay = accum
314        .into_iter()
315        .map(|(model_id, acc)| {
316            let appearances = frontier_appearances(&frontiers, &model_id);
317            let repricing =
318                repricing_for(&model_id, &acc.tokens, acc.billed_cost, &targets, &pricing);
319            OverlayModel {
320                model_id,
321                raw_model: acc.raw_model,
322                billed_cost: acc.billed_cost,
323                tokens: acc.tokens,
324                appearances,
325                repricing,
326            }
327        })
328        .collect();
329
330    Ok(BenchView {
331        generated_at: snapshot.generated_at,
332        frontiers,
333        overlay,
334        no_api_usage: false,
335        disclaimer,
336        providers: snapshot.providers.clone(),
337    })
338}
339
/// Running per-model totals gathered while `bench_view` scans API-lane rows.
struct OverlayAccum {
    /// Raw `x_model` string of the first row that created this entry.
    raw_model: String,
    /// Sum of `billed_cost` over the rows grouped under this model.
    billed_cost: Decimal,
    /// Token totals accumulated over the rows grouped under this model.
    tokens: TokenTotals,
}
345
346fn build_frontiers(table: &BenchmarkTable, pricing: &PricingCatalog) -> Vec<BenchFrontier> {
347    table
348        .benchmarks
349        .iter()
350        .map(|benchmark| {
351            let points = benchmark
352                .points
353                .iter()
354                .enumerate()
355                .map(|(idx, point)| FrontierPoint {
356                    model_id: point.model_id.clone(),
357                    label: point.label.clone(),
358                    score_pct: point.score_pct,
359                    cost_per_task_usd: point.cost_per_task_usd,
360                    standing: standing_for(point, &benchmark.points, idx),
361                    priced_in_catalog: pricing.model(&point.model_id).is_some(),
362                    note: point.note.clone(),
363                })
364                .collect();
365            BenchFrontier {
366                name: benchmark.name.clone(),
367                role: benchmark.role.clone(),
368                source: benchmark.source.clone(),
369                as_of: benchmark.as_of.clone(),
370                harness: benchmark.harness.clone(),
371                cost_note: benchmark.cost_note.clone(),
372                points,
373            }
374        })
375        .collect()
376}
377
378/// Pareto dominance within one benchmark, over points with a *known* cost.
379/// `P` is dominated iff some other `Q` is cheaper-or-equal AND higher-or-equal AND
380/// strictly better on at least one axis. A point with no cost is `CostUnknown` and is
381/// excluded from the scan (it can't dominate or be dominated without a cost coordinate).
382fn standing_for(point: &BenchmarkPoint, points: &[BenchmarkPoint], idx: usize) -> FrontierStanding {
383    let Some(cost) = point.cost_per_task_usd else {
384        return FrontierStanding::CostUnknown;
385    };
386    for (other_idx, other) in points.iter().enumerate() {
387        if other_idx == idx {
388            continue;
389        }
390        let Some(other_cost) = other.cost_per_task_usd else {
391            continue;
392        };
393        let cheaper_or_equal = other_cost <= cost;
394        let higher_or_equal = other.score_pct >= point.score_pct;
395        let strictly_better = other_cost < cost || other.score_pct > point.score_pct;
396        if cheaper_or_equal && higher_or_equal && strictly_better {
397            return FrontierStanding::Dominated {
398                by: other.model_id.clone(),
399            };
400        }
401    }
402    FrontierStanding::OnFrontier
403}
404
/// A model the user's token volume can be re-priced against.
struct RepricingTarget {
    /// Benchmark `model_id` of the target.
    model_id: String,
    /// Display label, taken from the first benchmark where the target qualified.
    label: String,
    /// Names of the benchmarks on which the target is on-frontier.
    on_frontier_in: Vec<String>,
}
410
411/// The re-pricing targets: models that are on-frontier on at least one benchmark AND
412/// have a pricing-catalog entry (so composer-2.5, on CursorBench's frontier but absent
413/// from the catalog, is never a target — it would have no rates to re-price against).
414fn repricing_targets(frontiers: &[BenchFrontier]) -> Vec<RepricingTarget> {
415    let mut by_model: BTreeMap<String, RepricingTarget> = BTreeMap::new();
416    for frontier in frontiers {
417        for point in &frontier.points {
418            if point.priced_in_catalog && point.standing == FrontierStanding::OnFrontier {
419                by_model
420                    .entry(point.model_id.clone())
421                    .or_insert_with(|| RepricingTarget {
422                        model_id: point.model_id.clone(),
423                        label: point.label.clone(),
424                        on_frontier_in: Vec::new(),
425                    })
426                    .on_frontier_in
427                    .push(frontier.name.clone());
428            }
429        }
430    }
431    by_model.into_values().collect()
432}
433
434fn frontier_appearances(frontiers: &[BenchFrontier], model_id: &str) -> Vec<OverlayAppearance> {
435    frontiers
436        .iter()
437        .flat_map(|frontier| {
438            frontier
439                .points
440                .iter()
441                .filter(move |point| point.model_id == model_id)
442                .map(move |point| OverlayAppearance {
443                    benchmark_name: frontier.name.clone(),
444                    score_pct: point.score_pct,
445                    standing: point.standing.clone(),
446                })
447        })
448        .collect()
449}
450
451fn repricing_for(
452    model_id: &str,
453    tokens: &TokenTotals,
454    billed_cost: Decimal,
455    targets: &[RepricingTarget],
456    pricing: &PricingCatalog,
457) -> Vec<RepricingDelta> {
458    targets
459        .iter()
460        .map(|target| {
461            if target.model_id == model_id {
462                return RepricingDelta {
463                    target_model_id: target.model_id.clone(),
464                    target_label: target.label.clone(),
465                    delta_usd: Decimal::ZERO,
466                    status: RepricingStatus::SameModel,
467                    on_frontier_in: target.on_frontier_in.clone(),
468                };
469            }
470            match repriced_total(tokens, &target.model_id, pricing) {
471                Some(repriced) => RepricingDelta {
472                    target_model_id: target.model_id.clone(),
473                    target_label: target.label.clone(),
474                    delta_usd: repriced - billed_cost,
475                    status: RepricingStatus::Computed,
476                    on_frontier_in: target.on_frontier_in.clone(),
477                },
478                None => RepricingDelta {
479                    target_model_id: target.model_id.clone(),
480                    target_label: target.label.clone(),
481                    delta_usd: Decimal::ZERO,
482                    status: RepricingStatus::TargetRateGap,
483                    on_frontier_in: target.on_frontier_in.clone(),
484                },
485            }
486        })
487        .collect()
488}
489
490/// Re-price the user's token volume at the target's catalog rates (same per-token
491/// formula as the pricing engine). `None` if the target lacks a rate for a meter the
492/// user actually used — surfaced as a gap, never invented.
493fn repriced_total(tokens: &TokenTotals, target: &str, pricing: &PricingCatalog) -> Option<Decimal> {
494    let million = Decimal::from(1_000_000_u64);
495    let mut total = Decimal::ZERO;
496    for meter in METERS {
497        let volume = meter_volume(tokens, meter);
498        if volume == 0 {
499            continue;
500        }
501        let price = pricing.meter_price(target, meter)?;
502        total += Decimal::from(volume) * price / million;
503    }
504    Some(total)
505}
506
507fn meter_volume(tokens: &TokenTotals, meter: &str) -> u64 {
508    match meter {
509        "input" => tokens.input,
510        "output" => tokens.output,
511        "cache_read" => tokens.cache_read,
512        "cache_write" => tokens.cache_write,
513        _ => 0,
514    }
515}
516
517#[cfg(test)]
518mod tests {
519    use super::*;
520    use crate::focus_records_from_usage;
521    use chrono::TimeZone;
522    use costroid_providers::{AccessPath, ProviderId, UsageEvent};
523
524    fn ts() -> DateTime<Utc> {
525        // A Wednesday noon — safely inside one ISO week in any timezone, so the
526        // now-summary week filter and the (unfiltered) overlay see the same rows.
527        match Utc.with_ymd_and_hms(2026, 1, 7, 12, 0, 0) {
528            chrono::LocalResult::Single(value) => value,
529            _ => panic!("fixed test timestamp should be valid"),
530        }
531    }
532
533    fn event(model: &str, access: AccessPath, input: u64, output: u64) -> UsageEvent {
534        UsageEvent {
535            tool: ProviderId::Codex,
536            model: model.to_string(),
537            timestamp: ts(),
538            input_tokens: input,
539            output_tokens: output,
540            cache_read_tokens: 0,
541            cache_write_tokens: 0,
542            project: Some("/work/proj".to_string()),
543            access_path: access,
544        }
545    }
546
547    fn snapshot(events: &[UsageEvent]) -> EngineSnapshot {
548        let focus_rows = match focus_records_from_usage(events) {
549            Ok(rows) => rows,
550            Err(err) => panic!("events should price: {err}"),
551        };
552        EngineSnapshot {
553            generated_at: ts(),
554            usage_events: Vec::new(),
555            focus_rows,
556            limit_windows: Vec::new(),
557            providers: Vec::new(),
558        }
559    }
560
561    fn frontier<'a>(view: &'a BenchView, name: &str) -> &'a BenchFrontier {
562        match view.frontiers.iter().find(|f| f.name == name) {
563            Some(f) => f,
564            None => panic!("benchmark {name} should be present"),
565        }
566    }
567
568    fn point<'a>(frontier: &'a BenchFrontier, model_id: &str) -> &'a FrontierPoint {
569        match frontier.points.iter().find(|p| p.model_id == model_id) {
570            Some(p) => p,
571            None => panic!("point {model_id} should be present on {}", frontier.name),
572        }
573    }
574
575    fn benchmark_point(model_id: &str, score_pct: i64, cost: Option<i64>) -> BenchmarkPoint {
576        BenchmarkPoint {
577            model_id: model_id.to_string(),
578            label: model_id.to_string(),
579            score_pct: Decimal::from(score_pct),
580            cost_per_task_usd: cost.map(Decimal::from),
581            note: None,
582        }
583    }
584
585    // 1 — bundled data parses + validates, with both benchmarks and real dates.
586    #[test]
587    fn bundled_benchmarks_parse_and_validate() {
588        let table = match BenchmarkTable::bundled() {
589            Ok(table) => table,
590            Err(err) => panic!("bundled benchmarks should validate: {err}"),
591        };
592        assert_eq!(table.benchmarks.len(), 2);
593        assert_eq!(table.benchmarks[0].name, "DeepSWE");
594        assert_eq!(table.benchmarks[0].as_of, "2026-05-30");
595        assert_eq!(table.benchmarks[1].name, "CursorBench v3.1");
596        assert_eq!(table.benchmarks[1].as_of, "2026-05-18");
597    }
598
599    // 2 — the as_of guard is fail-closed: sentinel / empty / impossible date all reject.
600    #[test]
601    fn as_of_guard_is_fail_closed() {
602        let body = |as_of: &str| {
603            format!(
604                r#"{{"schema_version":"1","benchmarks":[{{"name":"X","role":"primary","source":"https://x","as_of":"{as_of}","cost_note":"n","points":[{{"model_id":"gpt-5.5","label":"g","score_pct":"70.0","cost_per_task_usd":"1.0"}}]}}]}}"#
605            )
606        };
607        for bad in ["FILL_ME", "", "2026-13-99", "May 30 2026"] {
608            match BenchmarkTable::from_json(&body(bad)) {
609                Err(CoreError::BenchValidation(_)) => {}
610                other => panic!("as_of {bad:?} should be rejected, got {other:?}"),
611            }
612        }
613        // A real date is accepted.
614        assert!(BenchmarkTable::from_json(&body("2026-05-30")).is_ok());
615    }
616
617    // 3 — frontier correctness on the seeded DeepSWE data (the DoD assertion).
618    #[test]
619    fn deepswe_opus47_is_dominated() {
620        let view = match bench_view(&snapshot(&[])) {
621            Ok(view) => view,
622            Err(err) => panic!("bench_view should build: {err}"),
623        };
624        let deepswe = frontier(&view, "DeepSWE");
625        assert_eq!(
626            point(deepswe, "claude-opus-4-7").standing,
627            FrontierStanding::Dominated {
628                by: "gpt-5.5".to_string()
629            }
630        );
631        assert_eq!(
632            point(deepswe, "gpt-5.5").standing,
633            FrontierStanding::OnFrontier
634        );
635        // sonnet-4.6 is the cheapest point at $5.52 → on-frontier (not CostUnknown).
636        assert_eq!(
637            point(deepswe, "claude-sonnet-4-6").standing,
638            FrontierStanding::OnFrontier
639        );
640        assert!(point(deepswe, "claude-sonnet-4-6")
641            .cost_per_task_usd
642            .is_some());
643    }
644
645    // 3b — a synthetic null-cost point exercises the (seed-unused) CostUnknown path.
646    #[test]
647    fn cost_unknown_point_is_score_only() {
648        let points = vec![
649            benchmark_point("gpt-5.5", 70, Some(6)),
650            benchmark_point("mystery", 40, None),
651        ];
652        assert_eq!(
653            standing_for(&points[1], &points, 1),
654            FrontierStanding::CostUnknown
655        );
656        // The priced point is unaffected by the cost-unknown one.
657        assert_eq!(
658            standing_for(&points[0], &points, 0),
659            FrontierStanding::OnFrontier
660        );
661    }
662
663    // 4 — tie handling: equal cost+score keep both on-frontier; strict beats on one axis.
664    #[test]
665    fn dominance_tie_handling() {
666        let tied = vec![
667            benchmark_point("a", 50, Some(5)),
668            benchmark_point("b", 50, Some(5)),
669        ];
670        assert_eq!(
671            standing_for(&tied[0], &tied, 0),
672            FrontierStanding::OnFrontier
673        );
674        assert_eq!(
675            standing_for(&tied[1], &tied, 1),
676            FrontierStanding::OnFrontier
677        );
678
679        // equal cost, higher score dominates the lower.
680        let same_cost = vec![
681            benchmark_point("hi", 60, Some(5)),
682            benchmark_point("lo", 50, Some(5)),
683        ];
684        assert_eq!(
685            standing_for(&same_cost[1], &same_cost, 1),
686            FrontierStanding::Dominated {
687                by: "hi".to_string()
688            }
689        );
690
691        // equal score, cheaper dominates the pricier.
692        let same_score = vec![
693            benchmark_point("cheap", 50, Some(3)),
694            benchmark_point("dear", 50, Some(8)),
695        ];
696        assert_eq!(
697            standing_for(&same_score[1], &same_score, 1),
698            FrontierStanding::Dominated {
699                by: "cheap".to_string()
700            }
701        );
702    }
703
704    // 5 — API rows only: a subscription row for the same model is excluded from spend.
705    #[test]
706    fn api_rows_only_excludes_subscription() {
707        let view = match bench_view(&snapshot(&[
708            event("gpt-5.5", AccessPath::Api, 1_000_000, 0),
709            event("gpt-5.5", AccessPath::Subscription, 1_000_000, 0),
710        ])) {
711            Ok(view) => view,
712            Err(err) => panic!("bench_view should build: {err}"),
713        };
714        assert!(!view.no_api_usage);
715        assert_eq!(view.overlay.len(), 1);
716        // Only the API row's input tokens count; gpt-5.5 input is $5.00 / 1M.
717        assert_eq!(view.overlay[0].tokens.input, 1_000_000);
718        assert_eq!(view.overlay[0].billed_cost, Decimal::new(500, 2));
719    }
720
721    // 5b — note 2: the overlay's API total reconciles with the now-summary API total.
722    #[test]
723    fn overlay_api_total_reconciles_with_now_summary() {
724        let snap = snapshot(&[
725            event("gpt-5.5", AccessPath::Api, 1_000_000, 500_000),
726            event("claude-opus-4-7", AccessPath::Api, 200_000, 0),
727            event("gpt-5.5", AccessPath::Subscription, 999_999, 0),
728        ]);
729        let view = match bench_view(&snap) {
730            Ok(view) => view,
731            Err(err) => panic!("bench_view should build: {err}"),
732        };
733        let overlay_total: Decimal = view.overlay.iter().map(|m| m.billed_cost).sum();
734
735        let now = crate::now_summary(&snap, crate::NowOptions::default());
736        let now_api_total: Decimal = now
737            .current_costs
738            .iter()
739            .filter(|c| c.lane == CostLane::Api)
740            .map(|c| c.totals.billed_cost)
741            .sum();
742
743        assert_eq!(overlay_total, now_api_total);
744    }
745
746    // 6 — no API usage: reference frontier, empty overlay, zero delta, no fabrication.
747    #[test]
748    fn no_api_usage_zero_delta_reference() {
749        let view = match bench_view(&snapshot(&[event(
750            "gpt-5.5",
751            AccessPath::Subscription,
752            1_000_000,
753            0,
754        )])) {
755            Ok(view) => view,
756            Err(err) => panic!("bench_view should build: {err}"),
757        };
758        assert!(view.no_api_usage);
759        assert!(view.overlay.is_empty());
760        assert_eq!(view.frontiers.len(), 2);
761    }
762
763    // 7 — re-pricing math: opus volume re-priced at gpt-5.5 catalog rates, exact.
764    #[test]
765    fn repricing_delta_on_known_volume() {
766        let view = match bench_view(&snapshot(&[event(
767            "claude-opus-4-7",
768            AccessPath::Api,
769            1_000_000,
770            500_000,
771        )])) {
772            Ok(view) => view,
773            Err(err) => panic!("bench_view should build: {err}"),
774        };
775        let opus = &view.overlay[0];
776        let gpt = match opus
777            .repricing
778            .iter()
779            .find(|d| d.target_model_id == "gpt-5.5")
780        {
781            Some(delta) => delta,
782            None => panic!("gpt-5.5 should be a re-pricing target"),
783        };
784        assert_eq!(gpt.status, RepricingStatus::Computed);
785        // gpt-5.5: input $5.00/1M, output $30.00/1M → 5.00 + 15.00 = $20.00 at this volume.
786        assert_eq!(gpt.delta_usd + opus.billed_cost, Decimal::new(2000, 2));
787        // opus-4-7 is itself an on-frontier target → SameModel (not a comparison).
788        let self_delta = opus
789            .repricing
790            .iter()
791            .find(|d| d.target_model_id == "claude-opus-4-7");
792        assert_eq!(
793            self_delta.map(|d| d.status),
794            Some(RepricingStatus::SameModel)
795        );
796    }
797
798    // 8 — note 3: composer-2.5 is a gap (no catalog price), never a re-pricing target,
799    // and carries its Cursor-only note.
800    #[test]
801    fn composer_is_a_gap_not_a_target() {
802        let view = match bench_view(&snapshot(&[event(
803            "claude-opus-4-7",
804            AccessPath::Api,
805            10,
806            0,
807        )])) {
808            Ok(view) => view,
809            Err(err) => panic!("bench_view should build: {err}"),
810        };
811        let cursorbench = frontier(&view, "CursorBench v3.1");
812        let composer = point(cursorbench, "composer-2.5");
813        assert!(!composer.priced_in_catalog);
814        assert_eq!(
815            composer.note.as_deref(),
816            Some("Cursor subscription only - no API access")
817        );
818        for overlay in &view.overlay {
819            assert!(
820                overlay
821                    .repricing
822                    .iter()
823                    .all(|d| d.target_model_id != "composer-2.5"),
824                "composer-2.5 must never be a re-pricing target"
825            );
826        }
827    }
828
829    // 9 — a used model on no benchmark surfaces as a gap (empty appearances), not an error.
830    #[test]
831    fn missing_model_is_a_gap() {
832        let view = match bench_view(&snapshot(&[event(
833            "claude-haiku-4-5",
834            AccessPath::Api,
835            10,
836            0,
837        )])) {
838            Ok(view) => view,
839            Err(err) => panic!("bench_view should build: {err}"),
840        };
841        let haiku = match view
842            .overlay
843            .iter()
844            .find(|m| m.model_id == "claude-haiku-4-5")
845        {
846            Some(model) => model,
847            None => panic!("haiku should be in the overlay"),
848        };
849        assert!(haiku.appearances.is_empty());
850    }
851
852    // 10 — a target missing a rate for a meter the user used is a gap, no number.
853    #[test]
854    fn repricing_skips_target_rate_gap() {
855        // gpt-5.5 (OpenAI) has no cache_write rate; a model that used cache_write
856        // cannot be re-priced against it → TargetRateGap, never a fabricated number.
857        let mut cache_write_event = event("claude-opus-4-7", AccessPath::Api, 0, 0);
858        cache_write_event.cache_write_tokens = 1_000_000;
859        let view = match bench_view(&snapshot(&[cache_write_event])) {
860            Ok(view) => view,
861            Err(err) => panic!("bench_view should build: {err}"),
862        };
863        let opus = &view.overlay[0];
864        let gpt = match opus
865            .repricing
866            .iter()
867            .find(|d| d.target_model_id == "gpt-5.5")
868        {
869            Some(delta) => delta,
870            None => panic!("gpt-5.5 should appear as a target"),
871        };
872        assert_eq!(gpt.status, RepricingStatus::TargetRateGap);
873    }
874
875    // 11 — the disclaimer carries the cost-only hedge and the pricing date.
876    #[test]
877    fn disclaimer_carries_hedge_and_pricing_date() {
878        let view = match bench_view(&snapshot(&[])) {
879            Ok(view) => view,
880            Err(err) => panic!("bench_view should build: {err}"),
881        };
882        assert!(view.disclaimer.note.starts_with('~'));
883        assert!(view.disclaimer.note.contains("not a quality claim"));
884        assert!(!view.disclaimer.pricing_as_of.is_empty());
885    }
886}
costroid_core/bench.rs

costroid_core/
bench.rs