// assay_core/report/summary.rs

//! `summary.json` output per SPEC-PR-Gate-Outputs-v1.
//!
//! This module defines the machine-readable summary format for `assay ci` and `assay run`.
//! The summary includes schema versioning, exit/reason codes, provenance, and results.
5
6use serde::{Deserialize, Serialize};
7use std::path::Path;
8
9/// Serde helpers: serialize Option<u64> as JSON string or null to avoid precision loss (u64 > 2^53 in JS).
10mod serde_seed {
11    use serde::{Deserialize, Deserializer, Serializer};
12
13    pub fn serialize_opt_u64_as_str<S>(v: &Option<u64>, s: S) -> Result<S::Ok, S::Error>
14    where
15        S: Serializer,
16    {
17        match v {
18            Some(n) => s.serialize_str(&n.to_string()),
19            None => s.serialize_none(),
20        }
21    }
22
23    pub fn deserialize_opt_u64_from_str<'de, D>(d: D) -> Result<Option<u64>, D::Error>
24    where
25        D: Deserializer<'de>,
26    {
27        let opt: Option<serde_json::Value> = Option::deserialize(d)?;
28        match opt {
29            None | Some(serde_json::Value::Null) => Ok(None),
30            Some(serde_json::Value::String(s)) => {
31                let n = s.parse::<u64>().map_err(serde::de::Error::custom)?;
32                Ok(Some(n))
33            }
34            Some(serde_json::Value::Number(num)) => {
35                // Legacy only; write path always emits string.
36                let n = num
37                    .as_u64()
38                    .ok_or_else(|| serde::de::Error::custom("seed number must be u64"))?;
39                Ok(Some(n))
40            }
41            Some(other) => Err(serde::de::Error::custom(format!(
42                "seed must be string or null, got: {other}"
43            ))),
44        }
45    }
46}
47
/// Schema version written to `summary.json`.
pub const SCHEMA_VERSION: u32 = 1;

/// Version of the reason code registry (kept stable so downstream can branch on it).
/// Downstream MUST branch on `(reason_code_version, reason_code)` rather than the exit code.
pub const REASON_CODE_VERSION: u32 = 1;

/// Version of the seed schema used for deterministic replay (E7.2); follows the same
/// versioning philosophy as `reason_code_version`.
pub const SEED_VERSION: u32 = 1;
57
58/// Machine-readable summary for the PR gate
59///
60/// See: SPEC-PR-Gate-Outputs-v1.md for the full contract.
61/// Downstream MUST branch on (reason_code_version, reason_code) rather than exit code.
62#[derive(Debug, Clone, Serialize, Deserialize)]
63pub struct Summary {
64    /// Schema version for compatibility detection
65    pub schema_version: u32,
66
67    /// Version of the reason code registry. MUST be 1 for Outputs-v1. Downstream MUST branch on (reason_code_version, reason_code) rather than exit code.
68    pub reason_code_version: u32,
69
70    /// Exit code: 0=pass, 1=test failure, 2=config error, 3=infra error
71    pub exit_code: i32,
72
73    /// Stable machine-readable reason code (e.g., "E_TRACE_NOT_FOUND")
74    pub reason_code: String,
75
76    /// Human-readable message describing the outcome
77    #[serde(skip_serializing_if = "Option::is_none")]
78    pub message: Option<String>,
79
80    /// Suggested next step when exit_code != 0
81    #[serde(skip_serializing_if = "Option::is_none")]
82    pub next_step: Option<String>,
83
84    /// Provenance information for auditability
85    pub provenance: Provenance,
86
87    /// Results summary
88    #[serde(skip_serializing_if = "Option::is_none")]
89    pub results: Option<ResultsSummary>,
90
91    /// Performance metrics (optional)
92    #[serde(skip_serializing_if = "Option::is_none")]
93    pub performance: Option<PerformanceMetrics>,
94
95    /// Seeds for deterministic replay (E7.2). Always present for schema stability (order_seed/judge_seed null when unknown).
96    pub seeds: Seeds,
97
98    /// Judge reliability metrics (E7.3). Present when run had judge evaluations.
99    #[serde(skip_serializing_if = "Option::is_none")]
100    pub judge_metrics: Option<JudgeMetrics>,
101
102    /// SARIF truncation (E2.3). Present when SARIF was truncated (N results omitted).
103    #[serde(skip_serializing_if = "Option::is_none")]
104    pub sarif: Option<SarifOutputInfo>,
105}
106
107/// SARIF output metadata (E2.3). Written when SARIF was truncated.
108#[derive(Debug, Clone, Serialize, Deserialize)]
109pub struct SarifOutputInfo {
110    /// Number of results omitted from SARIF due to max_results limit.
111    pub omitted: u64,
112}
113
114/// Seeds used in the run (replay determinism). Always present in Summary; order_seed/judge_seed encoded as string or null to avoid JSON number precision loss (u64 > 2^53).
115#[derive(Debug, Clone, Serialize, Deserialize)]
116pub struct Seeds {
117    /// Version of the seed schema; consumers MUST branch on this.
118    pub seed_version: u32,
119    /// Seed used for test execution order (shuffle). Serialized as decimal string or null (schema stability + consumer-safe).
120    #[serde(
121        serialize_with = "serde_seed::serialize_opt_u64_as_str",
122        deserialize_with = "serde_seed::deserialize_opt_u64_from_str"
123    )]
124    pub order_seed: Option<u64>,
125    /// Seed used for judge randomization (per-test seed derived from suite seed when present). MAY be null until implemented; consumers MUST handle null.
126    #[serde(
127        serialize_with = "serde_seed::serialize_opt_u64_as_str",
128        deserialize_with = "serde_seed::deserialize_opt_u64_from_str"
129    )]
130    pub judge_seed: Option<u64>,
131    /// Optional: determinism for telemetry sampling (future use).
132    #[serde(skip_serializing_if = "Option::is_none")]
133    pub sampling_seed: Option<u64>,
134}
135
136impl Default for Seeds {
137    fn default() -> Self {
138        Self {
139            seed_version: SEED_VERSION,
140            order_seed: None,
141            judge_seed: None,
142            sampling_seed: None,
143        }
144    }
145}
146
147/// Judge reliability metrics (low cardinality, E8-consistent)
148#[derive(Debug, Clone, Serialize, Deserialize)]
149pub struct JudgeMetrics {
150    /// Fraction of judge evaluations that returned Abstain (uncertain).
151    #[serde(skip_serializing_if = "Option::is_none")]
152    pub abstain_rate: Option<f64>,
153    /// Fraction of evaluations where order was swapped and outcome differed (flip).
154    #[serde(skip_serializing_if = "Option::is_none")]
155    pub flip_rate: Option<f64>,
156    /// Fraction of evaluations where all samples agreed (consensus).
157    #[serde(skip_serializing_if = "Option::is_none")]
158    pub consensus_rate: Option<f64>,
159    /// Count of runs where judge was unavailable (infra/transport); do not count toward abstain_rate.
160    #[serde(skip_serializing_if = "Option::is_none")]
161    pub unavailable_count: Option<u32>,
162}
163
164/// Provenance fields for artifact auditability (ADR-019 P0.4)
165#[derive(Debug, Clone, Serialize, Deserialize)]
166pub struct Provenance {
167    /// Assay CLI version that produced this run
168    pub assay_version: String,
169
170    /// Verification mode: "enabled" or "disabled"
171    pub verify_mode: String,
172
173    /// Digest of policy/pack used
174    #[serde(skip_serializing_if = "Option::is_none")]
175    pub policy_pack_digest: Option<String>,
176
177    /// Digest of baseline used for comparison
178    #[serde(skip_serializing_if = "Option::is_none")]
179    pub baseline_digest: Option<String>,
180
181    /// Digest of trace input (optional for privacy)
182    #[serde(skip_serializing_if = "Option::is_none")]
183    pub trace_digest: Option<String>,
184
185    /// True when output is from replaying a bundle.
186    #[serde(skip_serializing_if = "Option::is_none")]
187    pub replay: Option<bool>,
188
189    /// SHA256 digest of replay bundle archive.
190    #[serde(skip_serializing_if = "Option::is_none")]
191    pub bundle_digest: Option<String>,
192
193    /// Replay mode: offline|live.
194    #[serde(skip_serializing_if = "Option::is_none")]
195    pub replay_mode: Option<String>,
196
197    /// Optional original run id from source run.
198    #[serde(skip_serializing_if = "Option::is_none")]
199    pub source_run_id: Option<String>,
200}
201
202/// Test results summary
203#[derive(Debug, Clone, Serialize, Deserialize)]
204pub struct ResultsSummary {
205    /// Count of tests passed
206    pub passed: usize,
207
208    /// Count of tests failed
209    pub failed: usize,
210
211    /// Count of tests with warnings/flaky
212    #[serde(skip_serializing_if = "Option::is_none")]
213    pub warned: Option<usize>,
214
215    /// Count of tests skipped (e.g., cache hit)
216    #[serde(skip_serializing_if = "Option::is_none")]
217    pub skipped: Option<usize>,
218
219    /// Total test count
220    pub total: usize,
221}
222
223/// Performance metrics for observability
224#[derive(Debug, Clone, Serialize, Deserialize)]
225pub struct PerformanceMetrics {
226    /// Total run duration in milliseconds
227    pub total_duration_ms: u64,
228
229    /// Evidence verify duration in milliseconds (Wave C trigger surface).
230    #[serde(skip_serializing_if = "Option::is_none")]
231    pub verify_ms: Option<u64>,
232
233    /// Evidence lint duration in milliseconds (Wave C trigger surface).
234    #[serde(skip_serializing_if = "Option::is_none")]
235    pub lint_ms: Option<u64>,
236
237    /// Runner clone overhead estimate in milliseconds (Wave C trigger surface).
238    #[serde(skip_serializing_if = "Option::is_none")]
239    pub runner_clone_ms: Option<u64>,
240
241    /// Number of runner clone operations performed during suite execution.
242    #[serde(skip_serializing_if = "Option::is_none")]
243    pub runner_clone_count: Option<u64>,
244
245    /// Profile store phase duration in milliseconds (Wave C trigger surface).
246    #[serde(skip_serializing_if = "Option::is_none")]
247    pub profile_store_ms: Option<u64>,
248
249    /// Run-id tracker memory footprint estimate in bytes (Wave C trigger surface).
250    #[serde(skip_serializing_if = "Option::is_none")]
251    pub run_id_memory_bytes: Option<u64>,
252
253    /// Cache hit rate (0.0 to 1.0)
254    #[serde(skip_serializing_if = "Option::is_none")]
255    pub cache_hit_rate: Option<f64>,
256
257    /// Slowest tests (up to 5)
258    #[serde(skip_serializing_if = "Option::is_none")]
259    pub slowest_tests: Option<Vec<SlowestTest>>,
260
261    /// Phase timings (optional)
262    #[serde(skip_serializing_if = "Option::is_none")]
263    pub phase_timings: Option<PhaseTimings>,
264}
265
266/// Entry for slowest tests list
267#[derive(Debug, Clone, Serialize, Deserialize)]
268pub struct SlowestTest {
269    pub test_id: String,
270    pub duration_ms: u64,
271}
272
273/// Phase timing breakdown
274#[derive(Debug, Clone, Serialize, Deserialize)]
275pub struct PhaseTimings {
276    #[serde(skip_serializing_if = "Option::is_none")]
277    pub ingest_ms: Option<u64>,
278    #[serde(skip_serializing_if = "Option::is_none")]
279    pub eval_ms: Option<u64>,
280    #[serde(skip_serializing_if = "Option::is_none")]
281    pub judge_ms: Option<u64>,
282    #[serde(skip_serializing_if = "Option::is_none")]
283    pub report_ms: Option<u64>,
284}
285
286impl Provenance {
287    /// Create a new Provenance with version and verify mode
288    fn new(assay_version: &str, verify_enabled: bool) -> Self {
289        Self {
290            assay_version: assay_version.to_string(),
291            verify_mode: if verify_enabled {
292                "enabled".to_string()
293            } else {
294                "disabled".to_string()
295            },
296            policy_pack_digest: None,
297            baseline_digest: None,
298            trace_digest: None,
299            replay: None,
300            bundle_digest: None,
301            replay_mode: None,
302            source_run_id: None,
303        }
304    }
305}
306
307impl Summary {
308    /// Create a success summary
309    pub fn success(assay_version: &str, verify_enabled: bool) -> Self {
310        Self {
311            schema_version: SCHEMA_VERSION,
312            reason_code_version: REASON_CODE_VERSION,
313            exit_code: 0,
314            reason_code: String::new(),
315            message: Some("All tests passed".to_string()),
316            next_step: None,
317            provenance: Provenance::new(assay_version, verify_enabled),
318            results: None,
319            performance: None,
320            seeds: Seeds::default(),
321            judge_metrics: None,
322            sarif: None,
323        }
324    }
325
326    /// Create a failure summary with reason code and next step
327    pub fn failure(
328        exit_code: i32,
329        reason_code: &str,
330        message: &str,
331        next_step: &str,
332        assay_version: &str,
333        verify_enabled: bool,
334    ) -> Self {
335        Self {
336            schema_version: SCHEMA_VERSION,
337            reason_code_version: REASON_CODE_VERSION,
338            exit_code,
339            reason_code: reason_code.to_string(),
340            message: Some(message.to_string()),
341            next_step: Some(next_step.to_string()),
342            provenance: Provenance::new(assay_version, verify_enabled),
343            results: None,
344            performance: None,
345            seeds: Seeds::default(),
346            judge_metrics: None,
347            sarif: None,
348        }
349    }
350
351    /// Set results summary
352    pub fn with_results(mut self, passed: usize, failed: usize, total: usize) -> Self {
353        self.results = Some(ResultsSummary {
354            passed,
355            failed,
356            warned: None,
357            skipped: None,
358            total,
359        });
360        self
361    }
362
363    /// Set performance metrics
364    pub fn with_duration(mut self, duration_ms: u64) -> Self {
365        self.performance = Some(PerformanceMetrics {
366            total_duration_ms: duration_ms,
367            verify_ms: None,
368            lint_ms: None,
369            runner_clone_ms: None,
370            runner_clone_count: None,
371            profile_store_ms: None,
372            run_id_memory_bytes: None,
373            cache_hit_rate: None,
374            slowest_tests: None,
375            phase_timings: None,
376        });
377        self
378    }
379
380    /// Set full performance metrics payload.
381    pub fn with_performance(mut self, performance: PerformanceMetrics) -> Self {
382        self.performance = Some(performance);
383        self
384    }
385
386    /// Set provenance digests
387    pub fn with_digests(
388        mut self,
389        policy_digest: Option<String>,
390        baseline_digest: Option<String>,
391        trace_digest: Option<String>,
392    ) -> Self {
393        self.provenance.policy_pack_digest = policy_digest;
394        self.provenance.baseline_digest = baseline_digest;
395        self.provenance.trace_digest = trace_digest;
396        self
397    }
398
399    /// Set replay provenance fields (E9c).
400    pub fn with_replay_provenance(
401        mut self,
402        bundle_digest: String,
403        replay_mode: &str,
404        source_run_id: Option<String>,
405    ) -> Self {
406        self.provenance.replay = Some(true);
407        self.provenance.bundle_digest = Some(bundle_digest);
408        self.provenance.replay_mode = Some(replay_mode.to_string());
409        self.provenance.source_run_id = source_run_id;
410        self
411    }
412
413    /// Set seeds for replay determinism (E7.2). Keys always present in JSON (string or null).
414    pub fn with_seeds(mut self, order_seed: Option<u64>, judge_seed: Option<u64>) -> Self {
415        self.seeds.order_seed = order_seed;
416        self.seeds.judge_seed = judge_seed;
417        self
418    }
419
420    /// Set judge reliability metrics (E7.3)
421    pub fn with_judge_metrics(mut self, metrics: JudgeMetrics) -> Self {
422        self.judge_metrics = Some(metrics);
423        self
424    }
425
426    /// Set SARIF truncation info (E2.3). Call when omitted_count > 0.
427    pub fn with_sarif_omitted(mut self, omitted: u64) -> Self {
428        if omitted > 0 {
429            self.sarif = Some(SarifOutputInfo { omitted });
430        }
431        self
432    }
433}
434
435/// Compute judge reliability metrics from run results (E7.3).
436/// Returns None if no results have judge details.
437/// One test can contribute multiple evaluations (one per metric name, e.g. faithfulness + relevance); rates are per-evaluation.
438pub fn judge_metrics_from_results(results: &[crate::model::TestResultRow]) -> Option<JudgeMetrics> {
439    use crate::model::TestStatus;
440
441    let mut total_judge = 0u32;
442    let mut abstain_count = 0u32;
443    let mut consensus_count = 0u32;
444    let mut flip_count = 0u32;
445
446    for r in results {
447        let Some(metrics) = r.details.get("metrics").and_then(|m| m.as_object()) else {
448            continue;
449        };
450        for (_name, metric_val) in metrics {
451            let Some(details) = metric_val.get("details") else {
452                continue;
453            };
454            let verdict = details.get("verdict").and_then(|v| v.as_str());
455            let agreement = details.get("agreement").and_then(|v| v.as_f64());
456            let swapped = details
457                .get("swapped")
458                .and_then(|v| v.as_bool())
459                .unwrap_or(false);
460
461            if verdict.is_none() && agreement.is_none() {
462                continue;
463            }
464            total_judge += 1;
465
466            if verdict == Some("Abstain") {
467                abstain_count += 1;
468            }
469            if let Some(a) = agreement {
470                if a == 0.0 || a == 1.0 {
471                    consensus_count += 1;
472                }
473                // flip_rate: heuristic proxy for "order was swapped and outcome differed".
474                // We do not store the counterfactual verdict, so we use: swapped + non-unanimous
475                // (0 < agreement < 1). This does NOT guarantee the verdict actually flipped;
476                // it indicates order may have affected outcome. Strict definition would require
477                // the judge to record whether pass/fail differed under the other ordering.
478                if swapped && a > 0.0 && a < 1.0 {
479                    flip_count += 1;
480                }
481            }
482        }
483    }
484
485    if total_judge == 0 {
486        return None;
487    }
488
489    let total = total_judge as f64;
490    Some(JudgeMetrics {
491        abstain_rate: Some(abstain_count as f64 / total),
492        flip_rate: Some(flip_count as f64 / total),
493        consensus_rate: Some(consensus_count as f64 / total),
494        unavailable_count: Some(
495            results
496                .iter()
497                .filter(|r| matches!(r.status, TestStatus::Error))
498                .filter(|r| {
499                    let m = r.message.to_lowercase();
500                    m.contains("timeout")
501                        || m.contains("500")
502                        || m.contains("502")
503                        || m.contains("503")
504                        || m.contains("504")
505                        || m.contains("rate limit")
506                        || m.contains("network")
507                })
508                .count() as u32,
509        ),
510    })
511}
512
513/// Write summary.json to file
514pub fn write_summary(summary: &Summary, out: &Path) -> anyhow::Result<()> {
515    let json = serde_json::to_string_pretty(summary)?;
516    std::fs::write(out, json)?;
517    Ok(())
518}
519
#[cfg(test)]
mod tests {
    use super::*;

    /// CLI version string used by every fixture in this module.
    const VERSION: &str = "2.12.0";

    /// Re-parses serialized summary JSON into a generic value for key-level assertions.
    fn parse(json: &str) -> serde_json::Value {
        serde_json::from_str(json).unwrap()
    }

    #[test]
    fn test_success_summary() {
        let ok = Summary::success(VERSION, true)
            .with_results(10, 0, 10)
            .with_duration(1234);

        assert_eq!(ok.schema_version, 1);
        assert_eq!(ok.reason_code_version, 1);
        assert_eq!(ok.exit_code, 0);
        assert_eq!(ok.reason_code, "");
        assert_eq!(ok.provenance.verify_mode, "enabled");
    }

    #[test]
    fn test_failure_summary() {
        let failed = Summary::failure(
            2,
            "E_TRACE_NOT_FOUND",
            "Trace file not found: traces/ci.jsonl",
            "Run: assay doctor --config ci-eval.yaml",
            VERSION,
            true,
        );

        assert_eq!(failed.reason_code_version, 1);
        assert_eq!(failed.exit_code, 2);
        assert_eq!(failed.reason_code, "E_TRACE_NOT_FOUND");
        assert!(failed.next_step.is_some());
    }

    #[test]
    fn test_summary_serialization() {
        let summary = Summary::success(VERSION, true).with_results(5, 2, 7);
        let pretty = serde_json::to_string_pretty(&summary).unwrap();

        for expected in [
            "\"schema_version\": 1",
            "\"reason_code_version\": 1",
            "\"assay_version\": \"2.12.0\"",
        ] {
            assert!(pretty.contains(expected));
        }

        let value = parse(&pretty);
        assert_eq!(
            value["reason_code_version"], 1,
            "reason_code_version must be present and integer"
        );

        // E7.2: seeds are always present; order_seed/judge_seed keys exist (string or null).
        let seeds = &value["seeds"];
        assert_eq!(seeds["seed_version"], 1);
        assert!(
            seeds.get("order_seed").is_some(),
            "order_seed key must exist"
        );
        assert!(
            seeds.get("judge_seed").is_some(),
            "judge_seed key must exist"
        );
        assert!(seeds["order_seed"].is_null());
        assert!(seeds["judge_seed"].is_null());
    }

    #[test]
    fn test_seeds_serialize_as_string() {
        let summary = Summary::success(VERSION, true)
            .with_results(1, 0, 1)
            .with_seeds(Some(17390767342376325021), None);

        let value = parse(&serde_json::to_string(&summary).unwrap());
        let seeds = &value["seeds"];
        assert!(
            seeds["order_seed"].is_string(),
            "order_seed must be string to avoid precision loss"
        );
        assert_eq!(seeds["order_seed"].as_str(), Some("17390767342376325021"));
        assert!(seeds["judge_seed"].is_null());
    }

    #[test]
    fn test_judge_metrics_abstain_not_counted_as_unavailable() {
        use crate::model::{TestResultRow, TestStatus};

        // A row whose verdict is Abstain (uncertain) must NOT bump unavailable_count:
        // that counter is reserved for Error status plus an infra message
        // (timeout/5xx/rate limit/network).
        let abstaining_row = TestResultRow {
            test_id: "t1".into(),
            status: TestStatus::Pass,
            score: Some(0.5),
            cached: false,
            message: String::new(),
            details: serde_json::json!({
                "metrics": {
                    "m1": { "details": { "verdict": "Abstain", "agreement": 0.5 } }
                }
            }),
            duration_ms: None,
            fingerprint: None,
            skip_reason: None,
            attempts: None,
            error_policy_applied: None,
        };

        let metrics = judge_metrics_from_results(&[abstaining_row]).unwrap();
        assert_eq!(metrics.abstain_rate, Some(1.0));
        assert_eq!(metrics.unavailable_count, Some(0));
    }
}