agentics_domain/models/
evaluation.rs

1use std::cmp::Ordering;
2use std::collections::HashSet;
3use std::fmt;
4
5use serde::{Deserialize, Serialize};
6
7use super::challenge::{MetricDirection, MetricSchemaSpec, MetricVisibility};
8use super::hashes::Sha256Digest;
9use super::ids::{EvaluationId, EvaluationJobId};
10use super::names::{ChallengeName, MetricName, RunName, TargetName};
11use crate::storage::StorageKey;
12
/// Evaluation surface requested for a solution submission.
///
/// Each variant carries an explicit `serde(rename)`, so the JSON form is the
/// lowercase string `"validation"` or `"official"`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, schemars::JsonSchema)]
pub enum ScoringMode {
    /// Private validation scoring, backed by public challenge data.
    #[serde(rename = "validation")]
    Validation,
    /// Ranking-visible official scoring, backed by private benchmark data.
    #[serde(rename = "official")]
    Official,
}
23
24impl ScoringMode {
25    /// Canonical persisted and API value for this mode.
26    pub fn as_str(self) -> &'static str {
27        match self {
28            Self::Validation => "validation",
29            Self::Official => "official",
30        }
31    }
32
33    /// Parse canonical persisted values.
34    pub fn from_storage_value(value: &str) -> Option<Self> {
35        match value {
36            "validation" => Some(Self::Validation),
37            "official" => Some(Self::Official),
38            _ => None,
39        }
40    }
41
42    /// Argument passed to the evaluator protocol.
43    pub fn evaluator_mode_arg(self) -> &'static str {
44        match self {
45            Self::Validation => "validation",
46            Self::Official => "official",
47        }
48    }
49}
50
51impl fmt::Display for ScoringMode {
52    /// Format the mode as its stable persisted and wire value.
53    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
54        f.write_str(self.as_str())
55    }
56}
57
/// Controls how much per-case detail a dataset may expose.
// NOTE(review): the variant notes below are plain `//` comments on purpose.
// schemars copies `///` text into the generated JSON Schema, so doc comments on
// the variants would change the schema output — confirm before promoting them.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, schemars::JsonSchema)]
#[serde(rename_all = "snake_case")]
pub enum ScoreVisibility {
    // Serialized as "full". Presumably per-case detail may be shown — TODO confirm
    // against the code that consumes this value.
    Full,
    // Serialized as "score_only". Presumably only the score may be shown — TODO
    // confirm against the code that consumes this value.
    ScoreOnly,
}
65
/// Per-case evaluator outcome for public validation tests.
// NOTE(review): the variant notes below are plain `//` comments on purpose.
// schemars copies `///` text into the generated JSON Schema, so doc comments on
// the variants would change the schema output — confirm before promoting them.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, schemars::JsonSchema)]
#[serde(rename_all = "snake_case")]
pub enum EvaluatorCaseStatus {
    // Serialized as "passed".
    Passed,
    // Serialized as "failed".
    Failed,
    // Serialized as "error". Presumably the case could not be evaluated at all,
    // as opposed to evaluating to a failure — TODO confirm with the evaluator contract.
    Error,
}
74
/// Overall evaluator outcome emitted by `result.json`.
// NOTE(review): the variant notes below are plain `//` comments on purpose.
// schemars copies `///` text into the generated JSON Schema, so doc comments on
// the variants would change the schema output — confirm before promoting them.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, schemars::JsonSchema)]
#[serde(rename_all = "snake_case")]
pub enum EvaluatorRunStatus {
    // Serialized as "passed".
    Passed,
    // Serialized as "failed".
    Failed,
    // Serialized as "error". Presumably the run itself broke rather than the
    // solution scoring poorly — TODO confirm with the evaluator contract.
    Error,
}
83
/// Persistent lifecycle state for an evaluation job/result.
///
/// The serde (snake_case) names are the same strings that
/// `EvaluationStatus::as_str` returns for database storage.
// NOTE(review): the variant notes below are plain `//` comments on purpose.
// schemars copies `///` text into the generated JSON Schema, so doc comments on
// the variants would change the schema output — confirm before promoting them.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, schemars::JsonSchema)]
#[serde(rename_all = "snake_case")]
pub enum EvaluationStatus {
    // "queued"
    Queued,
    // "running"
    Running,
    // "completed"
    Completed,
    // "failed"
    Failed,
}
93
94impl EvaluationStatus {
95    /// Stable database string for an evaluation lifecycle state.
96    pub fn as_str(self) -> &'static str {
97        match self {
98            Self::Queued => "queued",
99            Self::Running => "running",
100            Self::Completed => "completed",
101            Self::Failed => "failed",
102        }
103    }
104
105    /// Parse a stable database string for an evaluation lifecycle state.
106    pub fn from_storage_value(value: &str) -> Option<Self> {
107        match value {
108            "queued" => Some(Self::Queued),
109            "running" => Some(Self::Running),
110            "completed" => Some(Self::Completed),
111            "failed" => Some(Self::Failed),
112            _ => None,
113        }
114    }
115}
116
117impl fmt::Display for EvaluationStatus {
118    /// Format the evaluation status as its stable persisted and wire value.
119    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
120        f.write_str(self.as_str())
121    }
122}
123
/// Persistent lifecycle state for an evaluation job.
///
/// The serde (snake_case) names are the same strings that
/// `EvaluationJobStatus::as_str` returns for database storage.
// NOTE(review): the variant notes below are plain `//` comments on purpose.
// schemars copies `///` text into the generated JSON Schema, so doc comments on
// the variants would change the schema output — confirm before promoting them.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, schemars::JsonSchema)]
#[serde(rename_all = "snake_case")]
pub enum EvaluationJobStatus {
    // "staged" — the only state this enum has that `EvaluationStatus` lacks.
    Staged,
    // "queued"
    Queued,
    // "running"
    Running,
    // "completed"
    Completed,
    // "failed"
    Failed,
}
134
135impl EvaluationJobStatus {
136    /// Stable database string for an evaluation job lifecycle state.
137    pub fn as_str(self) -> &'static str {
138        match self {
139            Self::Staged => "staged",
140            Self::Queued => "queued",
141            Self::Running => "running",
142            Self::Completed => "completed",
143            Self::Failed => "failed",
144        }
145    }
146
147    /// Parse a stable database string for an evaluation job lifecycle state.
148    pub fn from_storage_value(value: &str) -> Option<Self> {
149        match value {
150            "staged" => Some(Self::Staged),
151            "queued" => Some(Self::Queued),
152            "running" => Some(Self::Running),
153            "completed" => Some(Self::Completed),
154            "failed" => Some(Self::Failed),
155            _ => None,
156        }
157    }
158}
159
160impl fmt::Display for EvaluationJobStatus {
161    /// Format the job status as its stable persisted and wire value.
162    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
163        f.write_str(self.as_str())
164    }
165}
166
/// Persistent lifecycle state for a solution submission.
///
/// The serde (snake_case) names are the same strings that
/// `SolutionSubmissionStatus::as_str` returns for database storage.
// NOTE(review): the variant notes below are plain `//` comments on purpose.
// schemars copies `///` text into the generated JSON Schema, so doc comments on
// the variants would change the schema output — confirm before promoting them.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, schemars::JsonSchema)]
#[serde(rename_all = "snake_case")]
pub enum SolutionSubmissionStatus {
    // "pending"
    Pending,
    // "queued"
    Queued,
    // "running"
    Running,
    // "completed"
    Completed,
    // "failed"
    Failed,
}
177
178impl SolutionSubmissionStatus {
179    /// Stable database string for a solution-submission lifecycle state.
180    pub fn as_str(self) -> &'static str {
181        match self {
182            Self::Pending => "pending",
183            Self::Queued => "queued",
184            Self::Running => "running",
185            Self::Completed => "completed",
186            Self::Failed => "failed",
187        }
188    }
189
190    /// Parse a stable database string for a solution-submission lifecycle state.
191    pub fn from_storage_value(value: &str) -> Option<Self> {
192        match value {
193            "pending" => Some(Self::Pending),
194            "queued" => Some(Self::Queued),
195            "running" => Some(Self::Running),
196            "completed" => Some(Self::Completed),
197            "failed" => Some(Self::Failed),
198            _ => None,
199        }
200    }
201}
202
203impl fmt::Display for SolutionSubmissionStatus {
204    /// Format the submission status as its stable persisted and wire value.
205    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
206        f.write_str(self.as_str())
207    }
208}
209
/// Aggregate score summary for validation or official datasets.
///
/// `ScoreSummary::validate` enforces the invariants: `score` is finite, both
/// counts are non-negative, and `passed <= total`.
#[derive(Debug, Clone, Serialize, Deserialize, schemars::JsonSchema)]
pub struct ScoreSummary {
    /// Challenge-defined finite score summary.
    pub score: f64,
    /// Number of passed cases in the aggregate.
    pub passed: i64,
    /// Total number of cases in the aggregate.
    pub total: i64,
}
220
/// Public per-case result exposed for validation feedback.
///
/// `PublicCaseResult::validate` requires a non-blank `case_name` and a finite
/// `score`.
#[derive(Debug, Clone, Serialize, Deserialize, schemars::JsonSchema)]
pub struct PublicCaseResult {
    /// Name of the case; must not be empty or whitespace-only.
    pub case_name: String,
    /// Outcome the evaluator reported for this case.
    pub status: EvaluatorCaseStatus,
    /// Score for this case; must be finite.
    pub score: f64,
    /// Optional message for this case; left out of serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub message: Option<String>,
}
230
/// Numeric value for one declared metric.
#[derive(Debug, Clone, Serialize, Deserialize, schemars::JsonSchema)]
pub struct MetricValue {
    /// Name of the metric this value belongs to.
    pub metric_name: MetricName,
    /// Reported value; `MetricValue::validate` rejects NaN and infinities.
    pub value: f64,
}
237
/// Metric values for one evaluator-defined run, case, seed, shard, or scenario.
#[derive(Debug, Clone, Serialize, Deserialize, schemars::JsonSchema)]
pub struct RunMetricResult {
    /// Name identifying the run these metrics belong to.
    pub run_name: RunName,
    /// Metrics reported for the run.
    ///
    /// Deserialization falls back to an empty list when the field is missing
    /// (`serde(default)`), while the generated JSON Schema still marks it as
    /// required (`schemars(required)`).
    #[serde(default)]
    #[schemars(required)]
    pub metrics: Vec<MetricValue>,
}
246
/// API DTO for a persisted evaluation.
///
/// Every `Option` field is left out of the serialized output when it is `None`.
#[derive(Debug, Clone, Serialize, Deserialize, schemars::JsonSchema)]
pub struct EvaluationDto {
    /// Identifier of the evaluation.
    pub id: EvaluationId,
    /// Target the evaluation ran against.
    pub target: TargetName,
    /// Lifecycle state of the evaluation.
    pub status: EvaluationStatus,
    /// Scoring mode (validation or official) of the evaluation.
    pub eval_type: ScoringMode,
    /// Aggregate metric values.
    pub aggregate_metrics: Vec<MetricValue>,
    /// Metric values broken down per run.
    pub run_metrics: Vec<RunMetricResult>,
    /// Public per-case results.
    pub public_results: Vec<PublicCaseResult>,
    /// Score summary for validation scoring, when present.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub validation_summary: Option<ScoreSummary>,
    /// Score summary for official scoring, when present.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub official_summary: Option<ScoreSummary>,
    /// Storage key of the runner log, when present.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub runner_log_storage_key: Option<StorageKey>,
    /// Start time as a string (presumably a timestamp; the format is not
    /// defined in this file — TODO confirm).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub started_at: Option<String>,
    /// Finish time as a string (presumably a timestamp; the format is not
    /// defined in this file — TODO confirm).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub finished_at: Option<String>,
}
268
/// Raw evaluator output read from a runner container's `result.json`.
///
/// Optional fields match the relaxed JSON contract used by the rewrite:
/// absent nullable fields are accepted, but numeric scores and mode-specific
/// summaries are validated before the result is persisted.
///
/// Unknown JSON fields are rejected (`deny_unknown_fields`).
#[derive(Debug, Clone, Serialize, Deserialize, schemars::JsonSchema)]
#[serde(deny_unknown_fields)]
pub struct EvaluatorRunResult {
    /// Overall outcome reported by the evaluator.
    pub status: EvaluatorRunStatus,
    /// Mode the evaluator says it ran in; when present, `validate_for_mode`
    /// requires it to equal the mode of the job.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub mode: Option<ScoringMode>,
    /// Aggregate metric values; empty when the field is missing.
    #[serde(default)]
    pub aggregate_metrics: Vec<MetricValue>,
    /// Per-run metric values; empty when the field is missing.
    #[serde(default)]
    pub run_metrics: Vec<RunMetricResult>,
    /// Public per-case results; empty when the field is missing.
    #[serde(default)]
    pub public_results: Vec<PublicCaseResult>,
    /// Validation score summary; `validate_for_mode` requires it for validation runs.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub validation_summary: Option<ScoreSummary>,
    /// Official score summary; `validate_for_mode` requires it for official runs.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub official_summary: Option<ScoreSummary>,
    /// Log entries; `validate_size_limits` caps their combined byte length.
    #[serde(default)]
    pub logs: Vec<String>,
}
293
294impl ScoreSummary {
295    /// Validate finite score and aggregate case counts for a named summary field.
296    pub fn validate(&self, label: &str) -> Result<(), String> {
297        validate_finite_number(self.score, &format!("{label}.score"))?;
298        if self.passed < 0 {
299            return Err(format!("{label}.passed must be >= 0"));
300        }
301        if self.total < 0 {
302            return Err(format!("{label}.total must be >= 0"));
303        }
304        if self.passed > self.total {
305            return Err(format!("{label}.passed cannot be greater than total"));
306        }
307
308        Ok(())
309    }
310}
311
312impl PublicCaseResult {
313    /// Validate the public case name and finite challenge-defined score.
314    pub fn validate(&self) -> Result<(), String> {
315        if self.case_name.trim().is_empty() {
316            return Err("public_results.case_name must not be empty".to_string());
317        }
318        validate_finite_number(self.score, "public_results.score")
319    }
320}
321
322impl MetricValue {
323    /// Validate metric name shape and finite numeric value.
324    pub fn validate(&self, field: &str) -> Result<(), String> {
325        validate_finite_number(self.value, &format!("{field}.value"))
326    }
327
328    /// Find a metric value by name in an evaluator metric payload.
329    pub fn find_by_name(metrics: &[Self], metric_name: &MetricName) -> Option<Self> {
330        metrics
331            .iter()
332            .find(|metric| &metric.metric_name == metric_name)
333            .cloned()
334    }
335}
336
337impl RunMetricResult {
338    /// Validate one per-run metric record without checking challenge-specific names.
339    pub fn validate(&self) -> Result<(), String> {
340        let mut metric_names = HashSet::with_capacity(self.metrics.len());
341        for metric in &self.metrics {
342            metric.validate("run_metrics.metrics")?;
343            if !metric_names.insert(metric.metric_name.as_str()) {
344                return Err(format!(
345                    "run_metrics.metrics contains duplicate metric_name `{}` for run `{}`",
346                    metric.metric_name, self.run_name
347                ));
348            }
349        }
350
351        Ok(())
352    }
353}
354
355impl EvaluatorRunResult {
356    /// Validate platform-owned size limits before result persistence.
357    pub fn validate_size_limits(
358        &self,
359        max_public_results: u64,
360        max_result_log_bytes: u64,
361    ) -> Result<(), String> {
362        let public_result_count = u64::try_from(self.public_results.len())
363            .map_err(|_| "public_results count exceeds supported range".to_string())?;
364        if public_result_count > max_public_results {
365            return Err(format!(
366                "public_results contains too many entries: {public_result_count} > {max_public_results}"
367            ));
368        }
369
370        let mut log_bytes = 0u64;
371        for log in &self.logs {
372            let len = u64::try_from(log.len())
373                .map_err(|_| "result.logs byte length exceeds supported range".to_string())?;
374            log_bytes = log_bytes
375                .checked_add(len)
376                .ok_or_else(|| "result.logs byte length overflow".to_string())?;
377            if log_bytes > max_result_log_bytes {
378                return Err(format!(
379                    "result.logs exceeds byte limit: {log_bytes} > {max_result_log_bytes} bytes"
380                ));
381            }
382        }
383
384        Ok(())
385    }
386
387    /// Validate evaluator output against the evaluation mode that was actually run.
388    ///
389    /// If the evaluator included a `mode`, it must match `mode`.
390    pub fn validate_for_mode(&self, mode: ScoringMode) -> Result<(), String> {
391        if let Some(result_mode) = self.mode
392            && result_mode != mode
393        {
394            return Err("result mode does not match evaluation job type".to_string());
395        }
396
397        validate_metric_values(&self.aggregate_metrics, "aggregate_metrics")?;
398
399        let mut run_names = HashSet::with_capacity(self.run_metrics.len());
400        for run in &self.run_metrics {
401            run.validate()?;
402            if !run_names.insert(run.run_name.as_str()) {
403                return Err(format!(
404                    "run_metrics contains duplicate run_name `{}`",
405                    run.run_name
406                ));
407            }
408        }
409
410        for public_result in &self.public_results {
411            public_result.validate()?;
412        }
413
414        if let Some(validation) = &self.validation_summary {
415            validation.validate("validation_summary")?;
416        }
417        if let Some(official) = &self.official_summary {
418            official.validate("official_summary")?;
419        }
420
421        if self.validation_summary.is_none() && self.official_summary.is_none() {
422            return Err(
423                "validation_summary and official_summary cannot both be absent".to_string(),
424            );
425        }
426        if mode == ScoringMode::Validation && self.validation_summary.is_none() {
427            return Err("validation evaluation requires validation_summary".to_string());
428        }
429        if mode == ScoringMode::Official && self.official_summary.is_none() {
430            return Err("official evaluation requires official_summary".to_string());
431        }
432
433        Ok(())
434    }
435
436    /// Complete metric-derived evaluator fields after schema validation.
437    pub fn complete_metric_result(
438        &mut self,
439        schema: &MetricSchemaSpec,
440        mode: ScoringMode,
441    ) -> Result<(), String> {
442        self.validate_for_metric_schema(schema, mode)
443    }
444
445    /// Validate metric names against the challenge's declared metric schema.
446    pub fn validate_for_metric_schema(
447        &self,
448        schema: &MetricSchemaSpec,
449        mode: ScoringMode,
450    ) -> Result<(), String> {
451        let declared = schema
452            .metrics
453            .iter()
454            .map(|metric| (metric.name.as_str(), metric))
455            .collect::<std::collections::HashMap<_, _>>();
456        if declared.is_empty() {
457            return Err("metric schema must declare at least one metric".to_string());
458        }
459
460        for metric in &self.aggregate_metrics {
461            let Some(definition) = declared.get(metric.metric_name.as_str()) else {
462                return Err(format!(
463                    "aggregate_metrics references unknown metric `{}`",
464                    metric.metric_name
465                ));
466            };
467            validate_metric_visibility(mode, definition.visibility, &metric.metric_name)?;
468        }
469
470        for run in &self.run_metrics {
471            for metric in &run.metrics {
472                let Some(definition) = declared.get(metric.metric_name.as_str()) else {
473                    return Err(format!(
474                        "run_metrics references unknown metric `{}`",
475                        metric.metric_name
476                    ));
477                };
478                validate_metric_visibility(mode, definition.visibility, &metric.metric_name)?;
479            }
480        }
481
482        if mode == ScoringMode::Official
483            && !self
484                .aggregate_metrics
485                .iter()
486                .any(|metric| metric.metric_name == schema.ranking.primary_metric_name)
487        {
488            return Err(format!(
489                "aggregate_metrics missing primary metric `{}`",
490                schema.ranking.primary_metric_name
491            ));
492        }
493
494        Ok(())
495    }
496}
497
498/// Compare two aggregate metric payloads using a challenge's ranking schema.
499///
500/// Returns `Ordering::Less` when `a_metrics` should rank before `b_metrics`.
501pub fn compare_metric_payloads_by_ranking(
502    schema: &MetricSchemaSpec,
503    a_metrics: &[MetricValue],
504    b_metrics: &[MetricValue],
505) -> Ordering {
506    let Some(primary) = schema.primary_metric() else {
507        return Ordering::Equal;
508    };
509    let primary_order = compare_metric_by_direction(
510        primary.direction,
511        metric_value_by_name(a_metrics, &schema.ranking.primary_metric_name),
512        metric_value_by_name(b_metrics, &schema.ranking.primary_metric_name),
513    );
514    if primary_order != Ordering::Equal {
515        return primary_order;
516    }
517
518    for metric_name in &schema.ranking.tie_breaker_metric_names {
519        let Some(definition) = schema.metric(metric_name) else {
520            continue;
521        };
522        let ordering = compare_metric_by_direction(
523            definition.direction,
524            metric_value_by_name(a_metrics, metric_name),
525            metric_value_by_name(b_metrics, metric_name),
526        );
527        if ordering != Ordering::Equal {
528            return ordering;
529        }
530    }
531
532    Ordering::Equal
533}
534
535/// Return one metric value by name from an aggregate metric payload.
536pub fn metric_value_by_name(metrics: &[MetricValue], metric_name: &MetricName) -> Option<f64> {
537    metrics
538        .iter()
539        .find(|metric| &metric.metric_name == metric_name)
540        .map(|metric| metric.value)
541}
542
543/// Compare optional metric values according to the declared direction.
544fn compare_metric_by_direction(
545    direction: MetricDirection,
546    a: Option<f64>,
547    b: Option<f64>,
548) -> Ordering {
549    match (a, b) {
550        (Some(a), Some(b)) => match direction {
551            MetricDirection::Maximize => compare_f64_desc(a, b),
552            MetricDirection::Minimize => compare_f64_asc(a, b),
553        },
554        (Some(_), None) => Ordering::Less,
555        (None, Some(_)) => Ordering::Greater,
556        (None, None) => Ordering::Equal,
557    }
558}
559
/// Compare two values so that the larger one ranks first.
///
/// Returns `Ordering::Less` when `a` should come before `b`, i.e. when
/// `a > b`. Callers are expected to pass finite values, but the comparison
/// stays a consistent total order if a NaN slips through: NaN ranks after
/// every number and ties only with another NaN. Treating NaN as equal to
/// everything would be non-transitive, which can mis-rank entries or make
/// `sort_by` panic.
fn compare_f64_desc(a: f64, b: f64) -> Ordering {
    b.partial_cmp(&a)
        // `partial_cmp` is `None` only when at least one side is NaN;
        // `true > false`, so the NaN side sorts last.
        .unwrap_or_else(|| a.is_nan().cmp(&b.is_nan()))
}
564
/// Compare two values so that the smaller one ranks first.
///
/// Returns `Ordering::Less` when `a` should come before `b`, i.e. when
/// `a < b`. Callers are expected to pass finite values, but the comparison
/// stays a consistent total order if a NaN slips through: NaN ranks after
/// every number and ties only with another NaN. Treating NaN as equal to
/// everything would be non-transitive, which can mis-rank entries or make
/// `sort_by` panic.
fn compare_f64_asc(a: f64, b: f64) -> Ordering {
    a.partial_cmp(&b)
        // `partial_cmp` is `None` only when at least one side is NaN;
        // `true > false`, so the NaN side sorts last.
        .unwrap_or_else(|| a.is_nan().cmp(&b.is_nan()))
}
569
/// Reject NaN and infinite values.
///
/// Returns `Err("{field} must be finite")` when `value` is NaN or ±infinity,
/// and `Ok(())` otherwise.
fn validate_finite_number(value: f64, field: &str) -> Result<(), String> {
    if value.is_finite() {
        Ok(())
    } else {
        Err(format!("{field} must be finite"))
    }
}
578
579/// Validates metric values invariants for this contract.
580fn validate_metric_values(metrics: &[MetricValue], field: &str) -> Result<(), String> {
581    let mut metric_names = HashSet::with_capacity(metrics.len());
582    for metric in metrics {
583        metric.validate(field)?;
584        if !metric_names.insert(metric.metric_name.as_str()) {
585            return Err(format!(
586                "{field} contains duplicate metric_name `{}`",
587                metric.metric_name
588            ));
589        }
590    }
591
592    Ok(())
593}
594
595/// Validates metric visibility invariants for this contract.
596fn validate_metric_visibility(
597    mode: ScoringMode,
598    visibility: MetricVisibility,
599    metric_name: &MetricName,
600) -> Result<(), String> {
601    if mode == ScoringMode::Validation && visibility == MetricVisibility::Official {
602        return Err(format!(
603            "validation results cannot include official-only metric `{metric_name}`"
604        ));
605    }
606
607    Ok(())
608}
609
610#[cfg(test)]
611mod tests {
612    use crate::models::challenge::{
613        MetricDefinitionSpec, MetricDirection, MetricSchemaSpec, MetricVisibility, RankingSpec,
614    };
615    use crate::models::names::{MetricName, RunName};
616
617    use super::{
618        EvaluatorCaseStatus, EvaluatorRunResult, EvaluatorRunStatus, MetricValue, RunMetricResult,
619        ScoreSummary, ScoringMode,
620    };
621
622    /// Handles metric name for this module.
623    fn metric_name(value: &str) -> MetricName {
624        MetricName::try_new(value.to_string()).expect("test metric name is valid")
625    }
626
627    /// Handles run name for this module.
628    fn run_name(value: &str) -> RunName {
629        RunName::try_new(value.to_string()).expect("test run name is valid")
630    }
631
632    /// Handles valid validation result for this module.
633    fn valid_validation_result() -> EvaluatorRunResult {
634        EvaluatorRunResult {
635            status: EvaluatorRunStatus::Passed,
636            mode: Some(ScoringMode::Validation),
637            aggregate_metrics: vec![],
638            run_metrics: vec![],
639            public_results: vec![],
640            validation_summary: Some(ScoreSummary {
641                score: 1.0,
642                passed: 1,
643                total: 1,
644            }),
645            official_summary: None,
646            logs: vec![],
647        }
648    }
649
650    /// Verifies that evaluator mode mismatch is rejected.
651    #[test]
652    fn evaluator_mode_mismatch_is_rejected() {
653        let mut result = valid_validation_result();
654        result.mode = Some(ScoringMode::Official);
655        result.official_summary = Some(ScoreSummary {
656            score: 1.0,
657            passed: 1,
658            total: 1,
659        });
660
661        assert!(result.validate_for_mode(ScoringMode::Validation).is_err());
662    }
663
664    /// Verifies that evaluator mode can be absent.
665    #[test]
666    fn evaluator_mode_can_be_absent() {
667        let mut result = valid_validation_result();
668        result.mode = None;
669
670        assert!(result.validate_for_mode(ScoringMode::Validation).is_ok());
671    }
672
673    /// Verifies that evaluator output can rely on declared aggregate metrics.
674    #[test]
675    fn evaluator_output_with_declared_metrics_is_valid() {
676        let mut result = valid_validation_result();
677        result
678            .complete_metric_result(&MetricSchemaSpec::default(), ScoringMode::Validation)
679            .unwrap();
680
681        assert!(result.aggregate_metrics.is_empty());
682    }
683
684    /// Verifies minimized primary metrics rank smaller values first.
685    #[test]
686    fn minimized_primary_metric_ranks_smaller_values_first() {
687        let schema = MetricSchemaSpec {
688            metrics: vec![MetricDefinitionSpec {
689                name: metric_name("latency_ms"),
690                label: "Latency".to_string(),
691                unit: Some("ms".to_string()),
692                direction: MetricDirection::Minimize,
693                visibility: MetricVisibility::Public,
694                metric_description: None,
695            }],
696            ranking: RankingSpec {
697                primary_metric_name: metric_name("latency_ms"),
698                tie_breaker_metric_names: vec![],
699            },
700        };
701        let faster = vec![MetricValue {
702            metric_name: metric_name("latency_ms"),
703            value: 7.0,
704        }];
705        let slower = vec![MetricValue {
706            metric_name: metric_name("latency_ms"),
707            value: 42.0,
708        }];
709
710        assert_eq!(
711            super::compare_metric_payloads_by_ranking(&schema, &faster, &slower),
712            std::cmp::Ordering::Less
713        );
714    }
715
716    /// Verifies maximized primary metrics rank larger values first.
717    #[test]
718    fn maximized_primary_metric_ranks_larger_values_first() {
719        let schema = MetricSchemaSpec::default();
720        let better = vec![MetricValue {
721            metric_name: metric_name("score"),
722            value: 42.0,
723        }];
724        let worse = vec![MetricValue {
725            metric_name: metric_name("score"),
726            value: 7.0,
727        }];
728
729        assert_eq!(
730            super::compare_metric_payloads_by_ranking(&schema, &better, &worse),
731            std::cmp::Ordering::Less
732        );
733    }
734
735    /// Verifies declared tie-breakers are applied after equal primary metrics.
736    #[test]
737    fn ranking_uses_declared_tie_breakers() {
738        let schema = MetricSchemaSpec {
739            metrics: vec![
740                MetricDefinitionSpec {
741                    name: metric_name("score"),
742                    label: "Score".to_string(),
743                    unit: None,
744                    direction: MetricDirection::Maximize,
745                    visibility: MetricVisibility::Public,
746                    metric_description: None,
747                },
748                MetricDefinitionSpec {
749                    name: metric_name("passed_cases"),
750                    label: "Passed Cases".to_string(),
751                    unit: Some("cases".to_string()),
752                    direction: MetricDirection::Maximize,
753                    visibility: MetricVisibility::Public,
754                    metric_description: None,
755                },
756            ],
757            ranking: RankingSpec {
758                primary_metric_name: metric_name("score"),
759                tie_breaker_metric_names: vec![metric_name("passed_cases")],
760            },
761        };
762        let better = vec![
763            MetricValue {
764                metric_name: metric_name("score"),
765                value: 1.0,
766            },
767            MetricValue {
768                metric_name: metric_name("passed_cases"),
769                value: 3.0,
770            },
771        ];
772        let worse = vec![
773            MetricValue {
774                metric_name: metric_name("score"),
775                value: 1.0,
776            },
777            MetricValue {
778                metric_name: metric_name("passed_cases"),
779                value: 1.0,
780            },
781        ];
782
783        assert_eq!(
784            super::compare_metric_payloads_by_ranking(&schema, &better, &worse),
785            std::cmp::Ordering::Less
786        );
787    }
788
789    /// Verifies evaluator outputs cannot keep the removed rank_score field.
790    #[test]
791    fn evaluator_result_rejects_rank_score_field() {
792        let raw = serde_json::json!({
793            "status": "passed",
794            "mode": "validation",
795            "rank_score": 1.0,
796            "validation_summary": { "score": 1.0, "passed": 1, "total": 1 }
797        });
798
799        let error = serde_json::from_value::<EvaluatorRunResult>(raw)
800            .expect_err("rank_score should be rejected as an unknown field");
801        assert!(error.to_string().contains("rank_score"));
802    }
803
804    /// Verifies that unknown aggregate metric is rejected.
805    #[test]
806    fn unknown_aggregate_metric_is_rejected() {
807        let mut result = valid_validation_result();
808        result.aggregate_metrics = vec![MetricValue {
809            metric_name: metric_name("unknown"),
810            value: 1.0,
811        }];
812
813        assert!(
814            result
815                .complete_metric_result(&MetricSchemaSpec::default(), ScoringMode::Validation)
816                .is_err()
817        );
818    }
819
820    /// Verifies that non finite metric value is rejected.
821    #[test]
822    fn non_finite_metric_value_is_rejected() {
823        let mut result = valid_validation_result();
824        result.aggregate_metrics = vec![MetricValue {
825            metric_name: metric_name("score"),
826            value: f64::NAN,
827        }];
828
829        assert!(result.validate_for_mode(ScoringMode::Validation).is_err());
830    }
831
832    /// Verifies that per run metrics are validated.
833    #[test]
834    fn per_run_metrics_are_validated() {
835        let mut result = valid_validation_result();
836        result.aggregate_metrics = vec![MetricValue {
837            metric_name: metric_name("score"),
838            value: 1.0,
839        }];
840        result.run_metrics = vec![RunMetricResult {
841            run_name: run_name("case-1"),
842            metrics: vec![MetricValue {
843                metric_name: metric_name("score"),
844                value: 1.0,
845            }],
846        }];
847
848        assert!(
849            result
850                .complete_metric_result(&MetricSchemaSpec::default(), ScoringMode::Validation)
851                .is_ok()
852        );
853    }
854
855    /// Verifies that validation result rejects official only metrics.
856    #[test]
857    fn validation_result_rejects_official_only_metrics() {
858        let schema = MetricSchemaSpec {
859            metrics: vec![MetricDefinitionSpec {
860                name: metric_name("private_quality"),
861                label: "Private Quality".to_string(),
862                unit: None,
863                direction: MetricDirection::Maximize,
864                visibility: MetricVisibility::Official,
865                metric_description: None,
866            }],
867            ranking: RankingSpec {
868                primary_metric_name: metric_name("private_quality"),
869                tie_breaker_metric_names: vec![],
870            },
871        };
872        let mut result = valid_validation_result();
873        result.aggregate_metrics = vec![MetricValue {
874            metric_name: metric_name("private_quality"),
875            value: 0.9,
876        }];
877
878        assert!(
879            result
880                .complete_metric_result(&schema, ScoringMode::Validation)
881                .is_err()
882        );
883    }
884
885    /// Verifies completed official output must include the declared primary metric.
886    #[test]
887    fn official_result_requires_primary_metric() {
888        let mut result = valid_validation_result();
889        result.mode = Some(ScoringMode::Official);
890        result.validation_summary = None;
891        result.official_summary = Some(ScoreSummary {
892            score: 3.25,
893            passed: 1,
894            total: 1,
895        });
896
897        let error = result
898            .complete_metric_result(&MetricSchemaSpec::default(), ScoringMode::Official)
899            .expect_err("official result should require primary aggregate metric");
900        assert!(error.contains("aggregate_metrics missing primary metric"));
901    }
902
903    /// Verifies summary and public case scores accept arbitrary finite values.
904    #[test]
905    fn summary_and_public_case_scores_accept_arbitrary_finite_values() {
906        let summary = ScoreSummary {
907            score: 42.0,
908            passed: 1,
909            total: 1,
910        };
911        assert!(summary.validate("validation_summary").is_ok());
912
913        let public_case = super::PublicCaseResult {
914            case_name: "case-1".to_string(),
915            status: EvaluatorCaseStatus::Passed,
916            score: -7.5,
917            message: None,
918        };
919        assert!(public_case.validate().is_ok());
920
921        let invalid_summary = ScoreSummary {
922            score: f64::INFINITY,
923            passed: 1,
924            total: 1,
925        };
926        assert!(invalid_summary.validate("validation_summary").is_err());
927
928        let invalid_public_case = super::PublicCaseResult {
929            case_name: "case-2".to_string(),
930            status: EvaluatorCaseStatus::Passed,
931            score: f64::NAN,
932            message: None,
933        };
934        assert!(invalid_public_case.validate().is_err());
935    }
936
937    /// Verifies platform size limits reject result payload expansion.
938    #[test]
939    fn evaluator_result_size_limits_are_enforced() {
940        let mut result = valid_validation_result();
941        result.public_results = vec![
942            super::PublicCaseResult {
943                case_name: "case-1".to_string(),
944                status: EvaluatorCaseStatus::Passed,
945                score: 1.0,
946                message: None,
947            },
948            super::PublicCaseResult {
949                case_name: "case-2".to_string(),
950                status: EvaluatorCaseStatus::Passed,
951                score: 1.0,
952                message: None,
953            },
954        ];
955
956        let public_result_error = result
957            .validate_size_limits(1, 1024)
958            .expect_err("public result count should be capped");
959        assert!(public_result_error.contains("public_results"));
960
961        let mut result = valid_validation_result();
962        result.logs = vec!["abcd".to_string(), "efgh".to_string()];
963
964        let log_error = result
965            .validate_size_limits(1024, 7)
966            .expect_err("embedded result logs should be capped");
967        assert!(log_error.contains("result.logs"));
968    }
969}
970
971/// Minimal job DTO returned when a solution submission queues an evaluation.
972#[derive(Debug, Clone, Serialize, Deserialize, schemars::JsonSchema)]
973pub struct EvaluationJobDto {
974    pub id: EvaluationJobId,
975    pub target: TargetName,
976    pub status: EvaluationJobStatus,
977}
978
979/// Immutable metadata measured from the submitted solution artifact.
980#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, schemars::JsonSchema)]
981pub struct SolutionArtifactMetadata {
982    /// Compressed ZIP object size in bytes.
983    pub artifact_zip_bytes: u64,
984    /// Sum of expanded regular-file entry sizes in bytes.
985    pub artifact_uncompressed_bytes: u64,
986    /// Number of validated archive entries.
987    pub artifact_file_count: u64,
988    /// SHA-256 digest of the exact submitted ZIP bytes.
989    pub artifact_sha256: Sha256Digest,
990}
991
992/// Runner payload persisted on an evaluation job.
993#[derive(Debug, Clone, Serialize, Deserialize, schemars::JsonSchema)]
994pub struct EvaluationJobPayload {
995    pub artifact_key: StorageKey,
996    pub bundle_key: StorageKey,
997    pub public_bundle_key: StorageKey,
998    pub challenge_name: ChallengeName,
999    pub target: TargetName,
1000}
agentics_domain/models/evaluation.rs

agentics_domain/models/
evaluation.rs