Skip to main content

assay_core/baseline/
report.rs

1use serde::{Deserialize, Serialize};
2use std::cmp::Reverse;
3use std::collections::HashMap;
4
/// Top-level hygiene report for one test suite, as produced by `report_from_db`.
///
/// The struct is (de)serializable with serde; field names are part of the
/// serialized schema.
#[derive(Debug, Serialize, Deserialize)]
pub struct HygieneReport {
    /// Version of the report schema (`report_from_db` writes `1`).
    pub schema_version: u32,
    /// Name of the suite the report covers.
    pub suite: String,
    /// Where the underlying results came from (`report_from_db` writes `"eval.db"`).
    pub source: String,
    /// Which scores fed the statistics: "final_attempt" or "all_attempts".
    pub score_source: String,
    /// Generation timestamp (`report_from_db` writes the current UTC time as RFC 3339).
    pub generated_at: String,
    /// The run window the report was computed over.
    pub window: ReportWindow,
    /// Per-test hygiene summaries.
    pub tests: Vec<TestHygiene>,
    /// Suite-wide, human-readable observations.
    pub notes: Vec<String>,
}
16
/// The slice of history a [`HygieneReport`] was computed over.
#[derive(Debug, Serialize, Deserialize)]
pub struct ReportWindow {
    /// Number of most recent runs requested when the report was built.
    pub last_runs: u32,
}
21
/// Hygiene summary for a single test across the report window.
#[derive(Debug, Serialize, Deserialize)]
pub struct TestHygiene {
    /// Identifier of the test.
    pub test_id: String,
    /// Number of result rows aggregated for this test.
    pub n: u32,
    /// Outcome rates, each expressed as a fraction of `n`.
    pub rates: TestOutcomeRates,
    /// Score statistics keyed by metric name.
    pub scores: HashMap<String, MetricStats>,
    /// Most frequent reasons observed for this test (`report_from_db` keeps at most five).
    pub top_reasons: Vec<TopReason>,
    /// Heuristic recommendations; omitted from serialized output when empty
    /// and defaulted to empty when missing on deserialization.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub suggested_actions: Vec<String>,
}
32
/// Fraction of a test's result rows that ended in each outcome.
///
/// As computed by `report_from_db`, every rate is `count / n`. The rates are
/// not guaranteed to sum to 1: `skipped` is derived from the presence of a
/// skip reason rather than from the status, and there is no rate for the
/// `Error` status.
#[derive(Debug, Serialize, Deserialize)]
pub struct TestOutcomeRates {
    /// Fraction of rows with status `Pass`.
    pub pass: f64,
    /// Fraction of rows with status `Fail`.
    pub fail: f64,
    /// Fraction of rows with status `Warn`.
    pub warn: f64,
    /// Fraction of rows with status `Flaky`.
    pub flaky: f64,
    /// Placeholder; `report_from_db` currently always writes `0.0`.
    pub unstable: f64,
    /// Fraction of rows that carry a skip reason.
    pub skipped: f64,
}
42
/// Distribution summary of the scores collected for one metric.
///
/// As computed by `report_from_db`, each percentile is the element at index
/// `floor(q * (len - 1))` of the sorted scores (no interpolation), and `std`
/// is the population standard deviation (variance divided by `len`).
#[derive(Debug, Serialize, Deserialize)]
pub struct MetricStats {
    /// 10th percentile score.
    pub p10: f64,
    /// 50th percentile (median) score.
    pub p50: f64,
    /// 90th percentile score.
    pub p90: f64,
    /// Standard deviation of the scores.
    pub std: f64,
}
50
/// A reason observed for a test together with how often it occurred.
#[derive(Debug, Serialize, Deserialize)]
pub struct TopReason {
    /// Category of the reason. `report_from_db` emits "skip_reason",
    /// "failure", or "metric_reason".
    pub kind: String,
    /// The reason text (for "metric_reason" it is formatted as `"<metric>: <reason>"`).
    pub value: String,
    /// Number of result rows that reported this reason.
    pub count: u32,
}
57
58use crate::model::{TestResultRow, TestStatus};
59use crate::storage::Store;
60
61pub fn report_from_db(store: &Store, suite: &str, last_runs: u32) -> anyhow::Result<HygieneReport> {
62    let results = store.fetch_results_for_last_n_runs(suite, last_runs)?;
63
64    // Group by test_id
65    let mut test_groups: HashMap<String, Vec<&TestResultRow>> = HashMap::new();
66    for r in &results {
67        test_groups.entry(r.test_id.clone()).or_default().push(r);
68    }
69
70    let mut tests = Vec::new();
71    let mut notes = Vec::new();
72
73    for (test_id, rows) in test_groups {
74        let n = rows.len() as u32;
75        let mut counts = HashMap::new();
76        let mut reasons = HashMap::new(); // Key: (kind, value) -> Count
77        let mut scores: HashMap<String, Vec<f64>> = HashMap::new();
78
79        for r in &rows {
80            *counts.entry(r.status).or_insert(0) += 1;
81
82            // Collect reasons
83            if let Some(reason) = &r.skip_reason {
84                *reasons
85                    .entry(("skip_reason".to_string(), reason.clone()))
86                    .or_insert(0) += 1;
87            } else if r.status == TestStatus::Fail || r.status == TestStatus::Error {
88                // Primary failure reason
89                let msg = if r.message.is_empty() {
90                    "Undeclared failure".to_string()
91                } else {
92                    r.message.clone()
93                };
94                *reasons.entry(("failure".to_string(), msg)).or_insert(0) += 1;
95            }
96
97            // Extract granular metric reasons (regardless of status, often informative)
98            // Look at final result details first
99            if let Some(obj) = r.details.get("metrics").and_then(|m| m.as_object()) {
100                for (metric_name, mv) in obj {
101                    // If metric has a 'reason' string
102                    if let Some(reason) = mv.get("reason").and_then(|s| s.as_str()) {
103                        let key = format!("{}: {}", metric_name, reason);
104                        *reasons
105                            .entry(("metric_reason".to_string(), key))
106                            .or_insert(0) += 1;
107                    }
108                }
109            }
110
111            // Collect metrics scores: Use *all* attempts for robust statistics if available
112            if let Some(attempts) = &r.attempts {
113                if !attempts.is_empty() {
114                    for attempt in attempts {
115                        if let Some(obj) =
116                            attempt.details.get("metrics").and_then(|m| m.as_object())
117                        {
118                            for (metric_name, mv) in obj {
119                                if let Some(score) = mv.get("score").and_then(|s| s.as_f64()) {
120                                    scores.entry(metric_name.clone()).or_default().push(score);
121                                }
122                            }
123                        }
124                    }
125                } else {
126                    // Fallback to result details if attempts logical but empty (shouldn't happen with updated store query)
127                    // or if we decide to stick to final. Actually store.rs ensures attempts is populated.
128                    // But for safety:
129                    if let Some(obj) = r.details.get("metrics").and_then(|m| m.as_object()) {
130                        for (metric_name, mv) in obj {
131                            if let Some(score) = mv.get("score").and_then(|s| s.as_f64()) {
132                                scores.entry(metric_name.clone()).or_default().push(score);
133                            }
134                        }
135                    }
136                }
137            } else {
138                // Fallback for old records without attempts
139                if let Some(obj) = r.details.get("metrics").and_then(|m| m.as_object()) {
140                    for (metric_name, mv) in obj {
141                        if let Some(score) = mv.get("score").and_then(|s| s.as_f64()) {
142                            scores.entry(metric_name.clone()).or_default().push(score);
143                        }
144                    }
145                }
146            }
147        }
148
149        // Note: Skips are usually status=Pass/Fail but with skip_reason? Or is Skip a status?
150        // Assay core TestStatus enum: Pass, Fail, Error, Warn, Flaky.
151        // Skips are recorded but status might be the outcome of the skip? Usually Skip -> Pass in strict logic?
152        // Let's rely on skip_reason presence for "skipped" rate if status doesn't capture it.
153        // Actually, if skip_reason is present, status is what? usually Pass.
154
155        // Let's refine rates logic
156        let skipped_count = rows.iter().filter(|r| r.skip_reason.is_some()).count();
157        let rates = TestOutcomeRates {
158            pass: (*counts.get(&TestStatus::Pass).unwrap_or(&0) as f64) / n as f64,
159            fail: (*counts.get(&TestStatus::Fail).unwrap_or(&0) as f64) / n as f64,
160            warn: (*counts.get(&TestStatus::Warn).unwrap_or(&0) as f64) / n as f64,
161            flaky: (*counts.get(&TestStatus::Flaky).unwrap_or(&0) as f64) / n as f64,
162            unstable: 0.0, // Placeholder
163            skipped: skipped_count as f64 / n as f64,
164        };
165
166        // Aggregated Scores
167        let mut score_stats = HashMap::new();
168        for (metric, mut vals) in scores {
169            vals.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
170            let sn = vals.len() as f64;
171            if sn == 0.0 {
172                continue;
173            }
174
175            let sum: f64 = vals.iter().sum();
176            let mean = sum / sn;
177            let variance = vals.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / sn;
178            let std = variance.sqrt();
179
180            // Helper for percentile
181            let p = |q: f64| {
182                let idx = ((q * (sn - 1.0)).floor() as usize).min(vals.len() - 1);
183                vals[idx]
184            };
185
186            score_stats.insert(
187                metric,
188                MetricStats {
189                    p10: p(0.10),
190                    p50: p(0.50),
191                    p90: p(0.90),
192                    std,
193                },
194            );
195        }
196
197        // Top Reasons
198        let mut top_reasons: Vec<TopReason> = reasons
199            .into_iter()
200            .map(|((kind, value), count)| TopReason { kind, value, count })
201            .collect();
202        top_reasons.sort_by_key(|reason| Reverse(reason.count));
203        top_reasons.truncate(5);
204
205        // Suggested Actions
206        let mut actions = Vec::new();
207        if rates.skipped > 0.4 {
208            actions.push(
209                "High skip rate: Check for fingerprint drift or over-aggressive caching"
210                    .to_string(),
211            );
212        }
213        if rates.flaky > 0.1 {
214            actions.push(
215                "Flaky: Consider increasing retries or stabilizing the environment".to_string(),
216            );
217        }
218        if rates.fail > 0.2 {
219            actions.push("High failure rate: Investigate top reasons".to_string());
220        }
221        // Check for low P10 in key metrics
222        for (m, stats) in &score_stats {
223            if stats.p10 < 0.6 {
224                // Heuristic threshold
225                actions.push(format!(
226                    "Low {} scores (P10 < 0.6): Consider tuning min_score or improving prompts",
227                    m
228                ));
229            }
230        }
231
232        tests.push(TestHygiene {
233            test_id,
234            n,
235            rates,
236            scores: score_stats,
237            top_reasons,
238            suggested_actions: actions,
239        });
240    }
241
242    // Sort tests by fail rate descending (problematic first)
243    tests.sort_by(|a, b| {
244        b.rates
245            .fail
246            .partial_cmp(&a.rates.fail)
247            .unwrap_or(std::cmp::Ordering::Equal)
248    });
249
250    // Global notes
251    if tests.iter().any(|t| t.rates.skipped > 0.5) {
252        notes.push("High skip rate (>50%) detected in some tests. Check for over-aggressive fingerprinting.".to_string());
253    }
254
255    Ok(HygieneReport {
256        schema_version: 1,
257        suite: suite.to_string(),
258        source: "eval.db".to_string(),
259        score_source: "all_attempts".to_string(),
260        generated_at: chrono::Utc::now().to_rfc3339(),
261        window: ReportWindow { last_runs },
262        tests,
263        notes,
264    })
265}