Skip to main content

skilltest_core/
report.rs

1//! Run results and the JSON report. The serialized shape here is the **stable
2//! contract** the language SDKs parse. These types are the source of truth:
3//! their JSON Schemas (via `skilltest schema`, goldens in `schemas/`) are what
4//! the SDK contract tests compare their Pydantic/Zod models against.
5
6use schemars::JsonSchema;
7use serde::{Deserialize, Serialize};
8
9use crate::conversation::Transcript;
10use crate::eval::EvalOutcome;
11use crate::provider::Usage;
12use crate::skill::Finding;
13
// Serialized-contract type (see the module docs): the field names and serde
// attributes below define the JSON that is written and read back.
// NOTE(review): schemars normally copies `///` text into the generated schema
// as `description`s, so rewording these doc comments may change the goldens in
// `schemas/` — confirm before editing them.
/// The result of running one test case on one (platform, model) pair.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, JsonSchema)]
pub struct CaseRun {
    // `Report::new` counts the distinct values of this field to fill
    // `Summary::cases`.
    /// The test case name.
    pub case: String,
    /// Absolute-ish path to the skill that was exercised.
    pub skill: String,
    /// The harness platform this run used.
    pub platform: String,
    /// The model this run used.
    pub model: String,
    // `Report::new` tallies this flag into `Summary::passed` / `Summary::failed`,
    // and `Report::to_human` renders it as `PASS` / `FAIL`.
    /// True iff every eval in this run passed.
    pub passed: bool,
    /// Number of assistant turns produced.
    pub turns: usize,
    // `Report::to_human` prints one detail line per outcome whose `passed` is
    // false; passing outcomes are not printed.
    /// Per-eval outcomes, in declaration order.
    pub evals: Vec<EvalOutcome>,
    /// The full conversation, for debugging and deterministic mix-in checks.
    pub transcript: Transcript,
    /// Aggregated token/cost usage across every provider call in this run
    /// (skill turns + simulated-user turns + judge calls). Omitted when no
    /// usage was reported (e.g. the fake provider or a harness that doesn't
    /// surface usage).
    // `default`: a document with no `usage` key deserializes to `None`.
    // `skip_serializing_if`: the key is left out of the output when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub usage: Option<Usage>,
}
40
// Serialized-contract type (see the module docs).
// NOTE(review): schemars normally copies `///` text into the generated schema
// as `description`s, so rewording these doc comments may change the goldens in
// `schemas/` — confirm before editing them.
/// Aggregate pass/fail counts for a report.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, JsonSchema)]
pub struct Summary {
    // Filled by `Report::new` with the number of distinct `CaseRun::case`
    // strings among the runs.
    /// Distinct test cases represented.
    pub cases: usize,
    // Filled by `Report::new` with the length of the run list.
    /// Total (case × platform × model) runs.
    pub runs: usize,
    // Filled by `Report::new` with the count of runs whose `CaseRun::passed`
    // is true.
    /// Runs that passed.
    pub passed: usize,
    // Filled by `Report::new` as `runs - passed`, so for summaries built there
    // `passed + failed == runs`.
    /// Runs that failed.
    pub failed: usize,
    /// Aggregated token/cost usage across every run in the report. Omitted
    /// when no run reported usage.
    // `Report::new` folds each run's `usage` into one total with `Usage::add`
    // and stores `None` when `Usage::is_empty` is true for that total.
    // `default`: a document with no `usage` key deserializes to `None`.
    // `skip_serializing_if`: the key is left out of the output when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub usage: Option<Usage>,
}
57
// Serialized-contract type (see the module docs).
// NOTE(review): schemars normally copies `///` text into the generated schema
// as `description`s, so rewording these doc comments may change the goldens in
// `schemas/` — confirm before editing them.
/// The top-level report for a `skilltest run` invocation.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, JsonSchema)]
pub struct Report {
    // NOTE: `Report::new` also requires at least one run, so a report built
    // from an empty run list has `passed == false` rather than being
    // vacuously true.
    /// True iff every run passed.
    pub passed: bool,
    // Computed from `runs` by `Report::new`; nothing in this type re-checks the
    // two against each other after construction or deserialization.
    /// Aggregate counts.
    pub summary: Summary,
    // Stored in the order given to `Report::new`; `Report::to_human` prints
    // them in this order.
    /// Every individual run.
    pub runs: Vec<CaseRun>,
}
68
69impl Report {
70    /// Build a report from runs, computing the summary and overall pass.
71    #[must_use]
72    pub fn new(runs: Vec<CaseRun>) -> Self {
73        let mut case_names: Vec<&str> = runs.iter().map(|r| r.case.as_str()).collect();
74        case_names.sort_unstable();
75        case_names.dedup();
76        let passed_runs = runs.iter().filter(|r| r.passed).count();
77        let mut total_usage = Usage::default();
78        for run in &runs {
79            if let Some(u) = &run.usage {
80                total_usage.add(u);
81            }
82        }
83        let usage = (!total_usage.is_empty()).then_some(total_usage);
84        let summary = Summary {
85            cases: case_names.len(),
86            runs: runs.len(),
87            passed: passed_runs,
88            failed: runs.len() - passed_runs,
89            usage,
90        };
91        Report {
92            passed: summary.failed == 0 && !runs.is_empty(),
93            summary,
94            runs,
95        }
96    }
97
98    /// Serialize to pretty JSON (the `--format json` output).
99    ///
100    /// # Errors
101    /// [`serde_json::Error`] only if a contained value cannot serialize, which
102    /// should not happen for these types.
103    pub fn to_json(&self) -> Result<String, serde_json::Error> {
104        serde_json::to_string_pretty(self)
105    }
106
107    /// A compact, human-readable summary line per run plus a total. Quiet by
108    /// design: this is context the next reader has to parse.
109    #[must_use]
110    pub fn to_human(&self) -> String {
111        let mut out = String::new();
112        for run in &self.runs {
113            let mark = if run.passed { "PASS" } else { "FAIL" };
114            out.push_str(&format!(
115                "{mark}  {} [{}/{}]\n",
116                run.case, run.platform, run.model
117            ));
118            for eval in &run.evals {
119                if !eval.passed {
120                    out.push_str(&format!(
121                        "      - {}: {} ({})\n",
122                        eval.label,
123                        eval.detail.summary(),
124                        eval.reason
125                    ));
126                }
127            }
128        }
129        out.push_str(&format!(
130            "{}/{} runs passed\n",
131            self.summary.passed, self.summary.runs
132        ));
133        if let Some(usage) = &self.summary.usage {
134            let mut parts = Vec::new();
135            if let Some(cost) = usage.cost_usd {
136                parts.push(format!("${cost:.4}"));
137            }
138            if let (Some(i), Some(o)) = (usage.input_tokens, usage.output_tokens) {
139                parts.push(format!("{} in / {} out tokens", i, o));
140            } else {
141                if let Some(i) = usage.input_tokens {
142                    parts.push(format!("{i} input tokens"));
143                }
144                if let Some(o) = usage.output_tokens {
145                    parts.push(format!("{o} output tokens"));
146                }
147            }
148            if !parts.is_empty() {
149                out.push_str(&format!("usage: {}\n", parts.join(", ")));
150            }
151        }
152        out
153    }
154}
155
// Serialized-contract type (see the module docs).
// NOTE(review): schemars normally copies `///` text into the generated schema
// as `description`s, so rewording these doc comments may change the goldens in
// `schemas/` — confirm before editing them.
/// One problem found while validating a skill, as serialized in the
/// `skilltest validate --format json` output.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, JsonSchema)]
pub struct ValidationFinding {
    // `ValidationReport::new` fills this from `Finding::skill` through
    // `to_string_lossy`. NOTE(review): presumably that field is a path, in
    // which case non-UTF-8 parts are rendered lossily — verify against
    // `crate::skill::Finding`.
    /// The skill directory the finding is about.
    pub skill: String,
    // `ValidationReport::new` copies this verbatim from `Finding::message`.
    /// What is wrong and how to fix it.
    pub message: String,
}
165
// Serialized-contract type (see the module docs).
// NOTE(review): schemars normally copies `///` text into the generated schema
// as `description`s, so rewording these doc comments may change the goldens in
// `schemas/` — confirm before editing them.
/// The top-level report for a `skilltest validate` invocation.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, JsonSchema)]
pub struct ValidationReport {
    // `ValidationReport::new` sets this to `findings.is_empty()`; nothing in
    // this type re-checks it against `findings` after deserialization.
    /// True iff no findings were produced.
    pub valid: bool,
    // `ValidationReport::new` keeps the order of the slice it is given.
    /// Every finding, in discovery order.
    pub findings: Vec<ValidationFinding>,
}
174
175impl ValidationReport {
176    /// Build a validation report from raw findings.
177    #[must_use]
178    pub fn new(findings: &[Finding]) -> Self {
179        ValidationReport {
180            valid: findings.is_empty(),
181            findings: findings
182                .iter()
183                .map(|f| ValidationFinding {
184                    skill: f.skill.to_string_lossy().into_owned(),
185                    message: f.message.clone(),
186                })
187                .collect(),
188        }
189    }
190
191    /// Serialize to pretty JSON (the `--format json` output).
192    ///
193    /// # Errors
194    /// [`serde_json::Error`] only if a contained value cannot serialize, which
195    /// should not happen for these types.
196    pub fn to_json(&self) -> Result<String, serde_json::Error> {
197        serde_json::to_string_pretty(self)
198    }
199}