1mod executor;
14mod generator;
15mod git_revision;
16mod sandbox;
17mod tracer;
18
19use agentcarousel_core::{
20 new_run_id, Case, CaseResult, CaseStatus, EvalScores, FixtureFile, OverallStatus,
21 ProviderErrorMetrics, RubricScore, Run, RunSummary,
22};
23use agentcarousel_evaluators::{
24 Evaluator, EvaluatorError, EvaluatorKind, GoldenEvaluator, JudgeEvaluator, ProcessEvaluator,
25 RulesEvaluator,
26};
27use agentcarousel_fixtures::MockEngine;
28use chrono::Utc;
29use indicatif::{ProgressBar, ProgressStyle};
30use std::collections::{HashMap, HashSet};
31use std::path::PathBuf;
32use std::sync::Arc;
33use std::time::Duration;
34use tokio::sync::{Mutex, Semaphore};
35
36pub use executor::run_case;
37pub use generator::GeneratorProvider;
38pub use sandbox::SandboxError;
39pub use tracer::SecretScrubber;
40
/// Selects how case outputs are generated during a run.
///
/// Within this module the mode is only compared against
/// [`GenerationMode::MockOnly`] to populate `runner_mock_only` on the produced
/// `Run`; the behavior each mode triggers is decided elsewhere.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GenerationMode {
    /// NOTE(review): presumably restricts generation to recorded mocks — confirm in `executor`.
    MockOnly,
    /// NOTE(review): presumably permits calling a live generator — confirm in `executor`.
    Live,
}
49
/// Settings shared by every runner entry point in this module.
#[derive(Debug, Clone)]
pub struct RunnerConfig {
    /// Maximum number of cases executed at once; values below 1 are treated as 1.
    pub concurrency: usize,
    /// Per-case timeout in seconds, used when a case does not set its own `timeout_secs`.
    pub timeout_secs: u64,
    /// Recorded on the resulting `Run` as `runner_offline`.
    pub offline: bool,
    /// Directory handed to `MockEngine::load_dir`; a load failure falls back to a default engine.
    pub mock_dir: PathBuf,
    /// Generation mode; recorded on the `Run` as `runner_mock_only` when it is `MockOnly`.
    pub generation_mode: GenerationMode,
    /// Not read in this module. NOTE(review): presumably forwarded to the generator — confirm.
    pub generator_model: Option<String>,
    /// Not read in this module. NOTE(review): presumably forwarded to the generator — confirm.
    pub generator_max_tokens: Option<u32>,
    /// When set, `run_fixtures` runs cases one at a time and stops after the first
    /// case whose status is not `Passed`.
    pub fail_fast: bool,
    /// Recorded on the resulting `Run` as `runner_mock_strict`.
    pub mock_strict: bool,
    /// Copied verbatim into `Run::command`.
    pub command: String,
    /// Copied verbatim into `Run::agentcarousel_version`.
    pub agentcarousel_version: String,
    /// Copied verbatim into `Run::config_hash`.
    pub config_hash: String,
    /// Explicit run identifier; a fresh one is generated via `new_run_id` when `None`.
    pub run_id: Option<String>,
}
67
/// Settings for [`run_eval`]: the base runner settings plus scoring options.
#[derive(Debug, Clone)]
pub struct EvalConfig {
    /// Execution settings shared with [`run_fixtures`].
    pub runner: RunnerConfig,
    /// Number of times each case is executed; values below 1 are treated as 1.
    pub runs: u32,
    /// Base seed used for cases that do not set their own; run `i` uses `seed + i` (wrapping).
    pub seed: u64,
    /// Evaluator id (`"rules"`, `"golden"`, `"process"`, `"judge"`), or `"all"` to use each
    /// case's own evaluator configuration (falling back to the rules evaluator).
    pub evaluator: String,
    /// Must be `true` for the judge evaluator to run; otherwise selecting it is an error.
    pub judge: bool,
    /// Passed to `JudgeEvaluator::from_case` as the model override.
    pub judge_model: Option<String>,
    /// Passed to `JudgeEvaluator::from_case` as the token limit.
    pub judge_max_tokens: Option<u32>,
    /// A run whose effectiveness score is below this value is marked `Failed`.
    pub effectiveness_threshold: f32,
    /// Copied verbatim into `Run::certification_context`.
    pub certification_context: Option<agentcarousel_core::CertificationContext>,
    /// Copied verbatim into `Run::carousel_iteration`.
    pub carousel_iteration: Option<u32>,
    /// Copied verbatim into `Run::policy_version`.
    pub policy_version: Option<String>,
    /// When set (and there is at least one case), a progress bar is displayed.
    pub progress: bool,
}
86
87pub async fn run_fixtures(fixtures: Vec<FixtureFile>, config: RunnerConfig) -> Run {
89 let started_at = Utc::now();
90 let (fixture_bundle_id, fixture_bundle_version) = bundle_metadata(&fixtures);
91 let run_id = config
92 .run_id
93 .as_ref()
94 .map(|id| agentcarousel_core::RunId(id.clone()))
95 .unwrap_or_else(new_run_id);
96 let mock_engine = MockEngine::load_dir(&config.mock_dir).unwrap_or_default();
97 let skill_or_agent = skill_display_label(&fixtures);
98 let cases = flatten_cases(fixtures);
99
100 let results = if config.fail_fast {
101 run_sequential(cases, &mock_engine, &config).await
102 } else {
103 run_parallel(cases, &mock_engine, &config).await
104 };
105
106 let summary = build_summary(&results);
107 let git_sha = git_revision::resolve_git_sha();
108
109 Run {
110 id: run_id,
111 schema_version: 1,
112 started_at,
113 finished_at: Some(Utc::now()),
114 command: config.command,
115 git_sha,
116 agentcarousel_version: config.agentcarousel_version,
117 config_hash: config.config_hash,
118 cases: results,
119 summary,
120 fixture_bundle_id,
121 fixture_bundle_version,
122 carousel_iteration: None,
123 certification_context: None,
124 policy_version: None,
125 skill_or_agent,
126 runner_offline: config.offline,
127 runner_mock_strict: config.mock_strict,
128 runner_mock_only: config.generation_mode == GenerationMode::MockOnly,
129 }
130}
131
132pub async fn run_eval(fixtures: Vec<FixtureFile>, config: EvalConfig) -> Run {
135 let started_at = Utc::now();
136 let (fixture_bundle_id, fixture_bundle_version) = bundle_metadata(&fixtures);
137 let run_id = config
138 .runner
139 .run_id
140 .as_ref()
141 .map(|id| agentcarousel_core::RunId(id.clone()))
142 .unwrap_or_else(new_run_id);
143 let mock_engine = MockEngine::load_dir(&config.runner.mock_dir).unwrap_or_default();
144 let skill_or_agent = skill_display_label(&fixtures);
145 let cases = flatten_cases(fixtures);
146 let judge_cache = Arc::new(Mutex::new(HashMap::new()));
147
148 let results = run_eval_cases(cases, &mock_engine, &config, &run_id, judge_cache).await;
149 let summary = build_summary(&results);
150 let git_sha = git_revision::resolve_git_sha();
151
152 Run {
153 id: run_id,
154 schema_version: 1,
155 started_at,
156 finished_at: Some(Utc::now()),
157 command: config.runner.command,
158 git_sha,
159 agentcarousel_version: config.runner.agentcarousel_version,
160 config_hash: config.runner.config_hash,
161 cases: results,
162 summary,
163 fixture_bundle_id,
164 fixture_bundle_version,
165 carousel_iteration: config.carousel_iteration,
166 certification_context: config.certification_context,
167 policy_version: config.policy_version,
168 skill_or_agent,
169 runner_offline: config.runner.offline,
170 runner_mock_strict: config.runner.mock_strict,
171 runner_mock_only: config.runner.generation_mode == GenerationMode::MockOnly,
172 }
173}
174
175fn skill_display_label(fixtures: &[FixtureFile]) -> Option<String> {
176 let mut names: Vec<String> = fixtures.iter().map(|f| f.skill_or_agent.clone()).collect();
177 names.sort();
178 names.dedup();
179 match names.len() {
180 0 => None,
181 1 => Some(names[0].clone()),
182 _ => Some(names.join(", ")),
183 }
184}
185
186fn bundle_metadata(fixtures: &[FixtureFile]) -> (Option<String>, Option<String>) {
187 let mut bundle_ids = HashSet::new();
189 let mut bundle_versions = HashSet::new();
190 for fixture in fixtures {
191 if let Some(bundle_id) = fixture.bundle_id.as_ref() {
192 bundle_ids.insert(bundle_id.clone());
193 }
194 if let Some(bundle_version) = fixture.bundle_version.as_ref() {
195 bundle_versions.insert(bundle_version.clone());
196 }
197 }
198 let bundle_id = if bundle_ids.len() == 1 {
199 bundle_ids.into_iter().next()
200 } else {
201 None
202 };
203 let bundle_version = if bundle_versions.len() == 1 {
204 bundle_versions.into_iter().next()
205 } else {
206 None
207 };
208 (bundle_id, bundle_version)
209}
210
211async fn run_sequential(
212 cases: Vec<Case>,
213 mock_engine: &MockEngine,
214 config: &RunnerConfig,
215) -> Vec<CaseResult> {
216 let mut results = Vec::new();
217 for case in cases {
218 let case_id = case.id.clone();
219 let timeout = tokio::time::timeout(
220 std::time::Duration::from_secs(case.timeout_secs.unwrap_or(config.timeout_secs)),
221 executor::run_case(case, mock_engine, config),
222 )
223 .await;
224 let result = match timeout {
225 Ok(result) => result,
226 Err(_) => executor::timeout_result(case_id),
227 };
228 let should_stop = result.status != agentcarousel_core::CaseStatus::Passed;
229 results.push(result);
230 if config.fail_fast && should_stop {
231 break;
232 }
233 }
234 results
235}
236
237async fn run_parallel(
238 cases: Vec<Case>,
239 mock_engine: &MockEngine,
240 config: &RunnerConfig,
241) -> Vec<CaseResult> {
242 let concurrency = std::cmp::max(1, config.concurrency);
243 let semaphore = Arc::new(Semaphore::new(concurrency));
244 let mut handles: Vec<(agentcarousel_core::CaseId, _)> = Vec::new();
245
246 for case in cases {
247 let permit = semaphore.clone().acquire_owned().await.unwrap();
248 let mock_engine = mock_engine.clone();
249 let config = config.clone();
250 let case_id = case.id.clone();
251 let case_id_for_tuple = case_id.clone();
252 let handle = tokio::spawn(async move {
253 let _permit = permit;
254 let timeout = tokio::time::timeout(
255 std::time::Duration::from_secs(case.timeout_secs.unwrap_or(config.timeout_secs)),
256 executor::run_case(case, &mock_engine, &config),
257 )
258 .await;
259 match timeout {
260 Ok(result) => result,
261 Err(_) => executor::timeout_result(case_id),
262 }
263 });
264 handles.push((case_id_for_tuple, handle));
265 }
266
267 let mut results = Vec::new();
268 for (case_id, handle) in handles {
269 match handle.await {
270 Ok(result) => results.push(result),
271 Err(err) => results.push(CaseResult {
272 case_id,
273 status: CaseStatus::Error,
274 error: Some(format!("task panicked: {err}")),
275 trace: agentcarousel_core::ExecutionTrace {
276 steps: Vec::new(),
277 final_output: None,
278 redacted: false,
279 },
280 metrics: agentcarousel_core::Metrics::default(),
281 eval_scores: None,
282 }),
283 }
284 }
285 results
286}
287
288async fn run_eval_cases(
289 cases: Vec<Case>,
290 mock_engine: &MockEngine,
291 config: &EvalConfig,
292 run_id: &agentcarousel_core::RunId,
293 judge_cache: Arc<Mutex<HashMap<String, EvalScores>>>,
294) -> Vec<CaseResult> {
295 let progress_bar: Option<ProgressBar> = if config.progress && !cases.is_empty() {
296 let pb = ProgressBar::new(cases.len() as u64);
297 pb.set_style(
298 ProgressStyle::with_template(
299 "{spinner:.green} [{elapsed_precise}] [{wide_bar:.cyan/blue}] {pos}/{len} cases {msg}",
300 )
301 .expect("progress template")
302 .tick_chars("⠁⠂⠄⡀⢀⠠⠐⠈ "),
303 );
304 pb.set_message("");
305 pb.enable_steady_tick(Duration::from_millis(120));
306 Some(pb)
307 } else {
308 None
309 };
310
311 let concurrency = std::cmp::max(1, config.runner.concurrency);
312 let semaphore = Arc::new(Semaphore::new(concurrency));
313 let mut handles = Vec::new();
314
315 for case in cases {
316 let permit = semaphore.clone().acquire_owned().await.unwrap();
317 let mock_engine = mock_engine.clone();
318 let config = config.clone();
319 let run_id = run_id.clone();
320 let judge_cache = judge_cache.clone();
321 let pb = progress_bar.clone();
322 handles.push(tokio::spawn(async move {
323 let _permit = permit;
324 let result = run_case_eval(case, &mock_engine, &config, &run_id, judge_cache).await;
325 if let Some(pb) = pb {
326 pb.inc(1);
327 }
328 result
329 }));
330 }
331
332 let mut results = Vec::new();
333 for handle in handles {
334 if let Ok(result) = handle.await {
335 results.push(result);
336 }
337 }
338 if let Some(pb) = progress_bar {
339 pb.finish_and_clear();
340 }
341 results
342}
343
344async fn run_case_eval(
345 case: Case,
346 mock_engine: &MockEngine,
347 config: &EvalConfig,
348 run_id: &agentcarousel_core::RunId,
349 judge_cache: Arc<Mutex<HashMap<String, EvalScores>>>,
350) -> CaseResult {
351 let runs = std::cmp::max(1, config.runs);
352 let mut per_run_results = Vec::new();
353 let base_seed = case.seed.unwrap_or(config.seed);
354
355 for run_index in 0..runs {
356 let mut run_case = case.clone();
357 run_case.seed = Some(base_seed.wrapping_add(run_index as u64));
358 let mut result = executor::run_case_unscored(run_case, mock_engine, &config.runner).await;
359
360 if result.status == CaseStatus::Passed {
361 match evaluate_case_result(&case, &result, config, run_id, &judge_cache).await {
362 Ok(scores) => {
363 result.eval_scores = Some(scores.clone());
364 if scores.effectiveness_score < config.effectiveness_threshold {
365 result.status = CaseStatus::Failed;
366 result.error = Some(format!(
367 "effectiveness {:.2} below threshold {:.2}",
368 scores.effectiveness_score, config.effectiveness_threshold
369 ));
370 }
371 }
372 Err(err) => {
373 result.status = CaseStatus::Error;
374 result.error = Some(err.to_string());
375 }
376 }
377 }
378
379 apply_provider_error_metrics(&mut result);
380 per_run_results.push(result);
381 }
382
383 aggregate_case_results(
384 &case,
385 &per_run_results,
386 runs,
387 config.effectiveness_threshold,
388 )
389}
390
391async fn evaluate_case_result(
392 case: &Case,
393 result: &CaseResult,
394 config: &EvalConfig,
395 run_id: &agentcarousel_core::RunId,
396 judge_cache: &Arc<Mutex<HashMap<String, EvalScores>>>,
397) -> Result<EvalScores, EvaluatorError> {
398 let evaluator_id = resolve_evaluator_id(case, config);
399 match evaluator_id.as_str() {
400 "rules" => RulesEvaluator.evaluate(case, result),
401 "golden" => GoldenEvaluator::from_case(case)?.evaluate(case, result),
402 "process" => ProcessEvaluator::from_case(case)?.evaluate(case, result),
403 "judge" => {
404 if !config.judge {
405 return Err(EvaluatorError::MissingConfig(
406 "--judge must be enabled when judge evaluator is selected",
407 ));
408 }
409 let cache_key = format!("{}:{}", run_id.0, case.id.0);
410 if let Some(cached) = judge_cache.lock().await.get(&cache_key).cloned() {
411 return Ok(cached);
412 }
413 let evaluator = JudgeEvaluator::from_case(
414 case,
415 config.judge_model.as_deref(),
416 config.judge_max_tokens,
417 )?;
418 let scores = evaluator.evaluate(case, result)?;
419 judge_cache.lock().await.insert(cache_key, scores.clone());
420 Ok(scores)
421 }
422 other => Err(EvaluatorError::UnknownEvaluator(other.to_string())),
423 }
424}
425
426fn resolve_evaluator_id(case: &Case, config: &EvalConfig) -> String {
427 if config.evaluator == "all" {
428 case.evaluator_config
429 .as_ref()
430 .map(|config| config.evaluator.clone())
431 .unwrap_or_else(|| EvaluatorKind::Rules.as_str().to_string())
432 } else {
433 config.evaluator.clone()
434 }
435}
436
437fn aggregate_case_results(
438 case: &Case,
439 results: &[CaseResult],
440 runs: u32,
441 effectiveness_threshold: f32,
442) -> CaseResult {
443 let status = aggregate_status(results);
444 let metrics = aggregate_metrics(results, runs);
445 let eval_scores = aggregate_eval_scores(results, effectiveness_threshold);
446 let representative = results
447 .iter()
448 .find(|result| result.status == CaseStatus::Passed)
449 .unwrap_or_else(|| results.first().expect("at least one run"));
450
451 let error = if status == CaseStatus::Flaky {
452 Some("inconsistent results across runs".to_string())
453 } else {
454 representative.error.clone()
455 };
456
457 CaseResult {
458 case_id: case.id.clone(),
459 status,
460 error,
461 trace: representative.trace.clone(),
462 metrics,
463 eval_scores,
464 }
465}
466
467fn aggregate_status(results: &[CaseResult]) -> CaseStatus {
468 let unique: HashSet<CaseStatus> = results.iter().map(|result| result.status.clone()).collect();
469 if unique.len() == 1 {
470 unique.into_iter().next().unwrap_or(CaseStatus::Error)
471 } else {
472 CaseStatus::Flaky
473 }
474}
475
476fn aggregate_metrics(results: &[CaseResult], runs: u32) -> agentcarousel_core::Metrics {
477 let mut metrics = agentcarousel_core::Metrics::default();
478 let count = results.len() as u64;
479 if count == 0 {
480 return metrics;
481 }
482
483 let sum_latency: u64 = results
484 .iter()
485 .map(|result| result.metrics.total_latency_ms)
486 .sum();
487 let sum_llm: u32 = results.iter().map(|result| result.metrics.llm_calls).sum();
488 let sum_tool: u32 = results.iter().map(|result| result.metrics.tool_calls).sum();
489 let sum_steps: u32 = results
490 .iter()
491 .map(|result| result.metrics.total_steps)
492 .sum();
493
494 let (tokens_in_sum, tokens_in_count) = sum_optional_u64(results, |metrics| metrics.tokens_in);
495 let (tokens_out_sum, tokens_out_count) =
496 sum_optional_u64(results, |metrics| metrics.tokens_out);
497 let (cost_sum, cost_count) = sum_optional_f64(results, |metrics| metrics.estimated_cost_usd);
498
499 let mean_latency = sum_latency as f64 / count as f64;
500 metrics.total_latency_ms = mean_latency.round() as u64;
501 metrics.llm_calls = sum_llm / count as u32;
502 metrics.tool_calls = sum_tool / count as u32;
503 metrics.total_steps = sum_steps / count as u32;
504 metrics.tokens_in = tokens_in_count.map(|count| tokens_in_sum / count);
505 metrics.tokens_out = tokens_out_count.map(|count| tokens_out_sum / count);
506 metrics.estimated_cost_usd = cost_count.map(|count| cost_sum / count as f64);
507 if count > 1 {
508 let latency_variance = results
509 .iter()
510 .map(|result| {
511 let diff = result.metrics.total_latency_ms as f64 - mean_latency;
512 diff * diff
513 })
514 .sum::<f64>()
515 / count as f64;
516 metrics.latency_variance_ms2 = Some(latency_variance);
517 metrics.latency_stddev_ms = Some(latency_variance.sqrt());
518 }
519 let (effectiveness_variance, effectiveness_stddev) = effectiveness_variance_stats(results);
520 metrics.effectiveness_variance = effectiveness_variance;
521 metrics.effectiveness_stddev = effectiveness_stddev;
522 metrics.runs_attempted = runs;
523 metrics.runs_succeeded = results
524 .iter()
525 .filter(|result| result.status == CaseStatus::Passed)
526 .count() as u32;
527 if runs > 0 {
528 metrics.error_rate =
529 Some(1.0 - (metrics.runs_succeeded as f32 / metrics.runs_attempted as f32));
530 }
531 metrics.consistency_score = Some(consistency_score(results));
532 metrics.provider_errors = sum_provider_errors(results);
533 metrics
534}
535
536fn sum_optional_u64(
537 results: &[CaseResult],
538 getter: fn(&agentcarousel_core::Metrics) -> Option<u64>,
539) -> (u64, Option<u64>) {
540 let mut sum = 0;
541 let mut count = 0;
542 for result in results {
543 if let Some(value) = getter(&result.metrics) {
544 sum += value;
545 count += 1;
546 }
547 }
548 if count == 0 {
549 (0, None)
550 } else {
551 (sum, Some(count))
552 }
553}
554
555fn sum_optional_f64(
556 results: &[CaseResult],
557 getter: fn(&agentcarousel_core::Metrics) -> Option<f64>,
558) -> (f64, Option<u64>) {
559 let mut sum = 0.0;
560 let mut count = 0;
561 for result in results {
562 if let Some(value) = getter(&result.metrics) {
563 sum += value;
564 count += 1;
565 }
566 }
567 if count == 0 {
568 (0.0, None)
569 } else {
570 (sum, Some(count))
571 }
572}
573
574fn effectiveness_variance_stats(results: &[CaseResult]) -> (Option<f32>, Option<f32>) {
575 let mut sum = 0.0_f64;
576 let mut sum_sq = 0.0_f64;
577 let mut count = 0.0_f64;
578 for result in results {
579 if let Some(scores) = result.eval_scores.as_ref() {
580 let value = scores.effectiveness_score as f64;
581 sum += value;
582 sum_sq += value * value;
583 count += 1.0;
584 }
585 }
586 if count <= 1.0 {
587 return (None, None);
588 }
589 let mean = sum / count;
590 let variance = (sum_sq / count) - (mean * mean);
591 let variance = variance.max(0.0);
592 let stddev = variance.sqrt();
593 (Some(variance as f32), Some(stddev as f32))
594}
595
596fn sum_provider_errors(results: &[CaseResult]) -> ProviderErrorMetrics {
597 let mut metrics = ProviderErrorMetrics::default();
598 for result in results {
599 metrics.status_429 += result.metrics.provider_errors.status_429;
600 metrics.status_500 += result.metrics.provider_errors.status_500;
601 metrics.status_503 += result.metrics.provider_errors.status_503;
602 metrics.status_504 += result.metrics.provider_errors.status_504;
603 }
604 metrics
605}
606
607fn apply_provider_error_metrics(result: &mut CaseResult) {
608 let Some(error) = result.error.as_deref() else {
609 return;
610 };
611 let Some(status) = extract_http_status(error) else {
612 return;
613 };
614 match status {
615 429 => result.metrics.provider_errors.status_429 += 1,
616 500 => result.metrics.provider_errors.status_500 += 1,
617 503 => result.metrics.provider_errors.status_503 += 1,
618 504 => result.metrics.provider_errors.status_504 += 1,
619 _ => {}
620 }
621}
622
/// Finds a tracked HTTP status code (429, 500, 503 or 504) in an error message.
///
/// A code counts only when it appears as a standalone token: directly preceded
/// by a space or `(`, and followed by the end of the message or a character
/// that is not an ASCII letter or digit. This accepts forms such as
/// `"(429 Too Many Requests)"`, `"returned 503: ..."`, `"got 500, retrying"`
/// and `"... status 504"`, and rejects longer numbers such as `"(5000 ms)"`.
///
/// Codes are tried in the order 429, 500, 503, 504 and the first one present
/// is returned, regardless of its position in the message. Returns `None` when
/// no tracked code is found.
fn extract_http_status(error: &str) -> Option<u16> {
    const CANDIDATES: [u16; 4] = [429, 500, 503, 504];
    let bytes = error.as_bytes();

    CANDIDATES.iter().copied().find(|code| {
        let needle = code.to_string();
        error.match_indices(needle.as_str()).any(|(start, hit)| {
            // Left boundary: a space or an opening parenthesis.
            let opens = start > 0 && matches!(bytes[start - 1], b' ' | b'(');
            // Right boundary: end of input or anything that does not extend the
            // token (so "4290" and "500ms" are not status codes).
            let closes = bytes
                .get(start + hit.len())
                .map_or(true, |next| !next.is_ascii_alphanumeric());
            opens && closes
        })
    })
}
639
640fn aggregate_eval_scores(
641 results: &[CaseResult],
642 effectiveness_threshold: f32,
643) -> Option<EvalScores> {
644 let collected: Vec<&EvalScores> = results
645 .iter()
646 .filter_map(|result| result.eval_scores.as_ref())
647 .collect();
648 if collected.is_empty() {
649 return None;
650 }
651
652 let evaluator = collected
653 .first()
654 .map(|scores| scores.evaluator.clone())
655 .unwrap_or_else(|| EvaluatorKind::Rules.as_str().to_string());
656 let effectiveness_score = collected
657 .iter()
658 .map(|scores| scores.effectiveness_score)
659 .sum::<f32>()
660 / collected.len() as f32;
661
662 let mut rubric_map: HashMap<String, (f32, f32, u32, Option<String>)> = HashMap::new();
663 for scores in &collected {
664 for rubric in &scores.rubric_scores {
665 let entry =
666 rubric_map
667 .entry(rubric.rubric_id.clone())
668 .or_insert((0.0, rubric.weight, 0, None));
669 entry.0 += rubric.score;
670 entry.2 += 1;
671 if entry.3.is_none() {
672 entry.3 = rubric.rationale.clone();
673 }
674 }
675 }
676 let rubric_scores = rubric_map
677 .into_iter()
678 .map(
679 |(rubric_id, (sum_score, weight, count, rationale))| RubricScore {
680 rubric_id,
681 score: if count == 0 {
682 0.0
683 } else {
684 sum_score / count as f32
685 },
686 weight,
687 rationale,
688 },
689 )
690 .collect();
691
692 let judge_rationale = collected
693 .iter()
694 .find_map(|scores| scores.judge_rationale.clone());
695
696 Some(EvalScores {
697 evaluator,
698 rubric_scores,
699 effectiveness_score,
700 passed: effectiveness_score >= effectiveness_threshold,
701 judge_rationale,
702 })
703}
704
705fn consistency_score(results: &[CaseResult]) -> f32 {
706 if results.len() <= 1 {
707 return 1.0;
708 }
709 let mut counts: HashMap<String, u32> = HashMap::new();
710 for result in results {
711 let signature = format!(
712 "{:?}|{}",
713 result.status,
714 result.trace.final_output.clone().unwrap_or_default()
715 );
716 *counts.entry(signature).or_insert(0) += 1;
717 }
718 let max = counts.values().copied().max().unwrap_or(0) as f32;
719 max / results.len() as f32
720}
721
722fn flatten_cases(fixtures: Vec<FixtureFile>) -> Vec<Case> {
723 let mut cases = Vec::new();
724 for fixture in fixtures {
725 let defaults = fixture.defaults.clone();
726 for mut case in fixture.cases {
727 apply_defaults(&mut case, &defaults);
728 cases.push(case);
729 }
730 }
731 cases
732}
733
734fn apply_defaults(case: &mut Case, defaults: &Option<agentcarousel_core::CaseDefaults>) {
735 if let Some(defaults) = defaults {
736 if case.timeout_secs.is_none() {
737 case.timeout_secs = defaults.timeout_secs;
738 }
739 if case.tags.is_empty() {
740 if let Some(tags) = defaults.tags.as_ref() {
741 case.tags = tags.clone();
742 }
743 }
744 if case.evaluator_config.is_none() {
745 if let Some(evaluator) = defaults.evaluator.as_ref() {
746 case.evaluator_config = Some(agentcarousel_core::EvaluatorConfig {
747 evaluator: evaluator.clone(),
748 golden_path: None,
749 golden_threshold: None,
750 process_cmd: None,
751 judge_prompt: None,
752 });
753 }
754 }
755 }
756}
757
758fn build_summary(results: &[CaseResult]) -> RunSummary {
759 let total = results.len() as u32;
760 let mut passed = 0;
761 let mut failed = 0;
762 let mut skipped = 0;
763 let mut flaky = 0;
764 let mut errored = 0;
765 let mut timed_out = 0;
766 let mut latency_sum = 0u64;
767 let mut effectiveness_sum = 0.0;
768 let mut effectiveness_count = 0u32;
769 let mut provider_errors = ProviderErrorMetrics::default();
770
771 for result in results {
772 latency_sum += result.metrics.total_latency_ms;
773 provider_errors.status_429 += result.metrics.provider_errors.status_429;
774 provider_errors.status_500 += result.metrics.provider_errors.status_500;
775 provider_errors.status_503 += result.metrics.provider_errors.status_503;
776 provider_errors.status_504 += result.metrics.provider_errors.status_504;
777 if let Some(scores) = result.eval_scores.as_ref() {
778 effectiveness_sum += scores.effectiveness_score;
779 effectiveness_count += 1;
780 }
781 match result.status {
782 agentcarousel_core::CaseStatus::Passed => passed += 1,
783 agentcarousel_core::CaseStatus::Failed => failed += 1,
784 agentcarousel_core::CaseStatus::Skipped => skipped += 1,
785 agentcarousel_core::CaseStatus::Flaky => flaky += 1,
786 agentcarousel_core::CaseStatus::TimedOut => timed_out += 1,
787 agentcarousel_core::CaseStatus::Error => errored += 1,
788 }
789 }
790
791 let effective_total = total.saturating_sub(flaky);
792 let pass_rate = if effective_total == 0 {
793 0.0
794 } else {
795 passed as f32 / effective_total as f32
796 };
797 let mean_latency_ms = if total == 0 {
798 0.0
799 } else {
800 latency_sum as f64 / total as f64
801 };
802 let mean_effectiveness_score = if effectiveness_count == 0 {
803 None
804 } else {
805 Some(effectiveness_sum / effectiveness_count as f32)
806 };
807 let overall_status = if failed == 0 && timed_out == 0 && errored == 0 && flaky == 0 {
808 OverallStatus::Pass
809 } else {
810 OverallStatus::Fail
811 };
812
813 RunSummary {
814 total,
815 passed,
816 failed,
817 skipped,
818 flaky,
819 errored,
820 timed_out,
821 pass_rate,
822 mean_latency_ms,
823 mean_effectiveness_score,
824 provider_errors,
825 overall_status,
826 }
827}