assay_core/baseline/
report.rs

1use serde::{Deserialize, Serialize};
2use std::collections::HashMap;
3
4#[derive(Debug, Serialize, Deserialize)]
5pub struct HygieneReport {
6    pub schema_version: u32,
7    pub suite: String,
8    pub source: String,
9    pub score_source: String, // "final_attempt" or "all_attempts"
10    pub generated_at: String,
11    pub window: ReportWindow,
12    pub tests: Vec<TestHygiene>,
13    pub notes: Vec<String>,
14}
15
16#[derive(Debug, Serialize, Deserialize)]
17pub struct ReportWindow {
18    pub last_runs: u32,
19}
20
21#[derive(Debug, Serialize, Deserialize)]
22pub struct TestHygiene {
23    pub test_id: String,
24    pub n: u32,
25    pub rates: TestOutcomeRates,
26    pub scores: HashMap<String, MetricStats>,
27    pub top_reasons: Vec<TopReason>,
28    #[serde(default, skip_serializing_if = "Vec::is_empty")]
29    pub suggested_actions: Vec<String>,
30}
31
32#[derive(Debug, Serialize, Deserialize)]
33pub struct TestOutcomeRates {
34    pub pass: f64,
35    pub fail: f64,
36    pub warn: f64,
37    pub flaky: f64,
38    pub unstable: f64,
39    pub skipped: f64,
40}
41
42#[derive(Debug, Serialize, Deserialize)]
43pub struct MetricStats {
44    pub p10: f64,
45    pub p50: f64,
46    pub p90: f64,
47    pub std: f64,
48}
49
50#[derive(Debug, Serialize, Deserialize)]
51pub struct TopReason {
52    pub kind: String, // "skip_reason" or "error" or "failure"
53    pub value: String,
54    pub count: u32,
55}
56
57use crate::model::{TestResultRow, TestStatus};
58use crate::storage::Store;
59
60pub fn report_from_db(store: &Store, suite: &str, last_runs: u32) -> anyhow::Result<HygieneReport> {
61    let results = store.fetch_results_for_last_n_runs(suite, last_runs)?;
62
63    // Group by test_id
64    let mut test_groups: HashMap<String, Vec<&TestResultRow>> = HashMap::new();
65    for r in &results {
66        test_groups.entry(r.test_id.clone()).or_default().push(r);
67    }
68
69    let mut tests = Vec::new();
70    let mut notes = Vec::new();
71
72    for (test_id, rows) in test_groups {
73        let n = rows.len() as u32;
74        let mut counts = HashMap::new();
75        let mut reasons = HashMap::new(); // Key: (kind, value) -> Count
76        let mut scores: HashMap<String, Vec<f64>> = HashMap::new();
77
78        for r in &rows {
79            *counts.entry(r.status.clone()).or_insert(0) += 1;
80
81            // Collect reasons
82            if let Some(reason) = &r.skip_reason {
83                *reasons
84                    .entry(("skip_reason".to_string(), reason.clone()))
85                    .or_insert(0) += 1;
86            } else if r.status == TestStatus::Fail || r.status == TestStatus::Error {
87                // Primary failure reason
88                let msg = if r.message.is_empty() {
89                    "Undeclared failure".to_string()
90                } else {
91                    r.message.clone()
92                };
93                *reasons.entry(("failure".to_string(), msg)).or_insert(0) += 1;
94            }
95
96            // Extract granular metric reasons (regardless of status, often informative)
97            // Look at final result details first
98            if let Some(obj) = r.details.get("metrics").and_then(|m| m.as_object()) {
99                for (metric_name, mv) in obj {
100                    // If metric has a 'reason' string
101                    if let Some(reason) = mv.get("reason").and_then(|s| s.as_str()) {
102                        let key = format!("{}: {}", metric_name, reason);
103                        *reasons
104                            .entry(("metric_reason".to_string(), key))
105                            .or_insert(0) += 1;
106                    }
107                }
108            }
109
110            // Collect metrics scores: Use *all* attempts for robust statistics if available
111            if let Some(attempts) = &r.attempts {
112                if !attempts.is_empty() {
113                    for attempt in attempts {
114                        if let Some(obj) =
115                            attempt.details.get("metrics").and_then(|m| m.as_object())
116                        {
117                            for (metric_name, mv) in obj {
118                                if let Some(score) = mv.get("score").and_then(|s| s.as_f64()) {
119                                    scores.entry(metric_name.clone()).or_default().push(score);
120                                }
121                            }
122                        }
123                    }
124                } else {
125                    // Fallback to result details if attempts logical but empty (shouldn't happen with updated store query)
126                    // or if we decide to stick to final. Actually store.rs ensures attempts is populated.
127                    // But for safety:
128                    if let Some(obj) = r.details.get("metrics").and_then(|m| m.as_object()) {
129                        for (metric_name, mv) in obj {
130                            if let Some(score) = mv.get("score").and_then(|s| s.as_f64()) {
131                                scores.entry(metric_name.clone()).or_default().push(score);
132                            }
133                        }
134                    }
135                }
136            } else {
137                // Fallback for old records without attempts
138                if let Some(obj) = r.details.get("metrics").and_then(|m| m.as_object()) {
139                    for (metric_name, mv) in obj {
140                        if let Some(score) = mv.get("score").and_then(|s| s.as_f64()) {
141                            scores.entry(metric_name.clone()).or_default().push(score);
142                        }
143                    }
144                }
145            }
146        }
147
148        // Note: Skips are usually status=Pass/Fail but with skip_reason? Or is Skip a status?
149        // Assay core TestStatus enum: Pass, Fail, Error, Warn, Flaky.
150        // Skips are recorded but status might be the outcome of the skip? Usually Skip -> Pass in strict logic?
151        // Let's rely on skip_reason presence for "skipped" rate if status doesn't capture it.
152        // Actually, if skip_reason is present, status is what? usually Pass.
153
154        // Let's refine rates logic
155        let skipped_count = rows.iter().filter(|r| r.skip_reason.is_some()).count();
156        let rates = TestOutcomeRates {
157            pass: (*counts.get(&TestStatus::Pass).unwrap_or(&0) as f64) / n as f64,
158            fail: (*counts.get(&TestStatus::Fail).unwrap_or(&0) as f64) / n as f64,
159            warn: (*counts.get(&TestStatus::Warn).unwrap_or(&0) as f64) / n as f64,
160            flaky: (*counts.get(&TestStatus::Flaky).unwrap_or(&0) as f64) / n as f64,
161            unstable: 0.0, // Placeholder
162            skipped: skipped_count as f64 / n as f64,
163        };
164
165        // Aggregated Scores
166        let mut score_stats = HashMap::new();
167        for (metric, mut vals) in scores {
168            vals.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
169            let sn = vals.len() as f64;
170            if sn == 0.0 {
171                continue;
172            }
173
174            let sum: f64 = vals.iter().sum();
175            let mean = sum / sn;
176            let variance = vals.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / sn;
177            let std = variance.sqrt();
178
179            // Helper for percentile
180            let p = |q: f64| {
181                let idx = ((q * (sn - 1.0)).floor() as usize).min(vals.len() - 1);
182                vals[idx]
183            };
184
185            score_stats.insert(
186                metric,
187                MetricStats {
188                    p10: p(0.10),
189                    p50: p(0.50),
190                    p90: p(0.90),
191                    std,
192                },
193            );
194        }
195
196        // Top Reasons
197        let mut top_reasons: Vec<TopReason> = reasons
198            .into_iter()
199            .map(|((kind, value), count)| TopReason { kind, value, count })
200            .collect();
201        top_reasons.sort_by(|a, b| b.count.cmp(&a.count));
202        top_reasons.truncate(5);
203
204        // Suggested Actions
205        let mut actions = Vec::new();
206        if rates.skipped > 0.4 {
207            actions.push(
208                "High skip rate: Check for fingerprint drift or over-aggressive caching"
209                    .to_string(),
210            );
211        }
212        if rates.flaky > 0.1 {
213            actions.push(
214                "Flaky: Consider increasing retries or stabilizing the environment".to_string(),
215            );
216        }
217        if rates.fail > 0.2 {
218            actions.push("High failure rate: Investigate top reasons".to_string());
219        }
220        // Check for low P10 in key metrics
221        for (m, stats) in &score_stats {
222            if stats.p10 < 0.6 {
223                // Heuristic threshold
224                actions.push(format!(
225                    "Low {} scores (P10 < 0.6): Consider tuning min_score or improving prompts",
226                    m
227                ));
228            }
229        }
230
231        tests.push(TestHygiene {
232            test_id,
233            n,
234            rates,
235            scores: score_stats,
236            top_reasons,
237            suggested_actions: actions,
238        });
239    }
240
241    // Sort tests by fail rate descending (problematic first)
242    tests.sort_by(|a, b| {
243        b.rates
244            .fail
245            .partial_cmp(&a.rates.fail)
246            .unwrap_or(std::cmp::Ordering::Equal)
247    });
248
249    // Global notes
250    if tests.iter().any(|t| t.rates.skipped > 0.5) {
251        notes.push("High skip rate (>50%) detected in some tests. Check for over-aggressive fingerprinting.".to_string());
252    }
253
254    Ok(HygieneReport {
255        schema_version: 1,
256        suite: suite.to_string(),
257        source: "eval.db".to_string(),
258        score_source: "all_attempts".to_string(),
259        generated_at: chrono::Utc::now().to_rfc3339(),
260        window: ReportWindow { last_runs },
261        tests,
262        notes,
263    })
264}
assay_core/baseline/report.rs

assay_core/baseline/
report.rs