1use std::cmp::Ordering;
2use std::collections::HashSet;
3use std::fmt;
4
5use serde::{Deserialize, Serialize};
6
7use super::challenge::{MetricDirection, MetricSchemaSpec, MetricVisibility};
8use super::hashes::Sha256Digest;
9use super::ids::{EvaluationId, EvaluationJobId};
10use super::names::{ChallengeName, MetricName, RunName, TargetName};
11use crate::storage::StorageKey;
12
// Scoring mode of an evaluation: either `Validation` or `Official`.
// Each variant has an explicit serde rename, so the serialized names are
// exactly the lowercase strings shown on the attributes below.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, schemars::JsonSchema)]
pub enum ScoringMode {
    // Serialized as "validation".
    #[serde(rename = "validation")]
    Validation,
    // Serialized as "official".
    #[serde(rename = "official")]
    Official,
}
23
24impl ScoringMode {
25 pub fn as_str(self) -> &'static str {
27 match self {
28 Self::Validation => "validation",
29 Self::Official => "official",
30 }
31 }
32
33 pub fn from_storage_value(value: &str) -> Option<Self> {
35 match value {
36 "validation" => Some(Self::Validation),
37 "official" => Some(Self::Official),
38 _ => None,
39 }
40 }
41
42 pub fn evaluator_mode_arg(self) -> &'static str {
44 match self {
45 Self::Validation => "validation",
46 Self::Official => "official",
47 }
48 }
49}
50
51impl fmt::Display for ScoringMode {
52 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
54 f.write_str(self.as_str())
55 }
56}
57
// Visibility level attached to a score. Variants serialize as snake_case
// strings ("full", "score_only").
// NOTE(review): what each level exposes is decided by the code that consumes
// this enum, which is not part of this file.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, schemars::JsonSchema)]
#[serde(rename_all = "snake_case")]
pub enum ScoreVisibility {
    Full,
    ScoreOnly,
}
65
// Status of a single evaluator case; this is the type of
// `PublicCaseResult::status`. Variants serialize as snake_case strings
// ("passed", "failed", "error").
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, schemars::JsonSchema)]
#[serde(rename_all = "snake_case")]
pub enum EvaluatorCaseStatus {
    Passed,
    Failed,
    Error,
}
74
// Overall status of an evaluator run; this is the type of
// `EvaluatorRunResult::status`. Variants serialize as snake_case strings
// ("passed", "failed", "error").
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, schemars::JsonSchema)]
#[serde(rename_all = "snake_case")]
pub enum EvaluatorRunStatus {
    Passed,
    Failed,
    Error,
}
83
// Status of an evaluation; this is the type of `EvaluationDto::status`.
// Variants serialize as snake_case strings, matching the strings used by
// `EvaluationStatus::as_str` / `from_storage_value`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, schemars::JsonSchema)]
#[serde(rename_all = "snake_case")]
pub enum EvaluationStatus {
    Queued,
    Running,
    Completed,
    Failed,
}
93
94impl EvaluationStatus {
95 pub fn as_str(self) -> &'static str {
97 match self {
98 Self::Queued => "queued",
99 Self::Running => "running",
100 Self::Completed => "completed",
101 Self::Failed => "failed",
102 }
103 }
104
105 pub fn from_storage_value(value: &str) -> Option<Self> {
107 match value {
108 "queued" => Some(Self::Queued),
109 "running" => Some(Self::Running),
110 "completed" => Some(Self::Completed),
111 "failed" => Some(Self::Failed),
112 _ => None,
113 }
114 }
115}
116
117impl fmt::Display for EvaluationStatus {
118 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
120 f.write_str(self.as_str())
121 }
122}
123
// Status of an evaluation job; this is the type of `EvaluationJobDto::status`.
// Compared with `EvaluationStatus` it has one additional variant, `Staged`.
// Variants serialize as snake_case strings, matching the strings used by
// `EvaluationJobStatus::as_str` / `from_storage_value`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, schemars::JsonSchema)]
#[serde(rename_all = "snake_case")]
pub enum EvaluationJobStatus {
    Staged,
    Queued,
    Running,
    Completed,
    Failed,
}
134
135impl EvaluationJobStatus {
136 pub fn as_str(self) -> &'static str {
138 match self {
139 Self::Staged => "staged",
140 Self::Queued => "queued",
141 Self::Running => "running",
142 Self::Completed => "completed",
143 Self::Failed => "failed",
144 }
145 }
146
147 pub fn from_storage_value(value: &str) -> Option<Self> {
149 match value {
150 "staged" => Some(Self::Staged),
151 "queued" => Some(Self::Queued),
152 "running" => Some(Self::Running),
153 "completed" => Some(Self::Completed),
154 "failed" => Some(Self::Failed),
155 _ => None,
156 }
157 }
158}
159
160impl fmt::Display for EvaluationJobStatus {
161 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
163 f.write_str(self.as_str())
164 }
165}
166
// Status of a solution submission. Variants serialize as snake_case strings,
// matching the strings used by `SolutionSubmissionStatus::as_str` /
// `from_storage_value`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, schemars::JsonSchema)]
#[serde(rename_all = "snake_case")]
pub enum SolutionSubmissionStatus {
    Pending,
    Queued,
    Running,
    Completed,
    Failed,
}
177
178impl SolutionSubmissionStatus {
179 pub fn as_str(self) -> &'static str {
181 match self {
182 Self::Pending => "pending",
183 Self::Queued => "queued",
184 Self::Running => "running",
185 Self::Completed => "completed",
186 Self::Failed => "failed",
187 }
188 }
189
190 pub fn from_storage_value(value: &str) -> Option<Self> {
192 match value {
193 "pending" => Some(Self::Pending),
194 "queued" => Some(Self::Queued),
195 "running" => Some(Self::Running),
196 "completed" => Some(Self::Completed),
197 "failed" => Some(Self::Failed),
198 _ => None,
199 }
200 }
201}
202
203impl fmt::Display for SolutionSubmissionStatus {
204 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
206 f.write_str(self.as_str())
207 }
208}
209
// Summary of a scoring pass: a score plus passed/total counts.
// `ScoreSummary::validate` requires `score` to be finite and
// `0 <= passed <= total`; the struct itself does not enforce this.
#[derive(Debug, Clone, Serialize, Deserialize, schemars::JsonSchema)]
pub struct ScoreSummary {
    // Any finite value is accepted by `validate` (no fixed range).
    pub score: f64,
    // Signed so that negative input can be deserialized and then rejected by `validate`.
    pub passed: i64,
    pub total: i64,
}
220
// Result of one case as listed in the `public_results` of an evaluation.
// `PublicCaseResult::validate` requires a non-blank `case_name` and a finite `score`.
#[derive(Debug, Clone, Serialize, Deserialize, schemars::JsonSchema)]
pub struct PublicCaseResult {
    pub case_name: String,
    pub status: EvaluatorCaseStatus,
    pub score: f64,
    // Left out of the serialized output entirely when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub message: Option<String>,
}
230
// A single named metric reading: the metric's name and its numeric value.
// `MetricValue::validate` requires `value` to be finite.
#[derive(Debug, Clone, Serialize, Deserialize, schemars::JsonSchema)]
pub struct MetricValue {
    pub metric_name: MetricName,
    pub value: f64,
}
237
// Metrics reported for one named run.
// `RunMetricResult::validate` rejects non-finite values and duplicate metric
// names within the same run.
#[derive(Debug, Clone, Serialize, Deserialize, schemars::JsonSchema)]
pub struct RunMetricResult {
    pub run_name: RunName,
    // `serde(default)`: deserializes to an empty list when the key is missing.
    // `schemars(required)`: the generated JSON schema still lists it as required.
    #[serde(default)]
    #[schemars(required)]
    pub metrics: Vec<MetricValue>,
}
246
// Data-transfer representation of one evaluation: identity, target, status,
// scoring mode, reported metrics and case results, plus optional summaries,
// log storage key and timestamps.
// Every `Option` field is omitted from the serialized output when it is `None`.
#[derive(Debug, Clone, Serialize, Deserialize, schemars::JsonSchema)]
pub struct EvaluationDto {
    pub id: EvaluationId,
    pub target: TargetName,
    pub status: EvaluationStatus,
    // The scoring mode; note the serialized field name is `eval_type`.
    pub eval_type: ScoringMode,
    pub aggregate_metrics: Vec<MetricValue>,
    pub run_metrics: Vec<RunMetricResult>,
    pub public_results: Vec<PublicCaseResult>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub validation_summary: Option<ScoreSummary>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub official_summary: Option<ScoreSummary>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub runner_log_storage_key: Option<StorageKey>,
    // Timestamps are carried as plain strings; the format is not fixed by this type.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub started_at: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub finished_at: Option<String>,
}
268
// Result payload for an evaluator run, as parsed from its output.
// `deny_unknown_fields` makes deserialization fail on any key not declared
// here (the `rank_score` test in this file relies on that).
// Structural parsing is only the first step: see `validate_size_limits`,
// `validate_for_mode` and `validate_for_metric_schema` for the semantic checks.
#[derive(Debug, Clone, Serialize, Deserialize, schemars::JsonSchema)]
#[serde(deny_unknown_fields)]
pub struct EvaluatorRunResult {
    pub status: EvaluatorRunStatus,
    // Optional; when present `validate_for_mode` requires it to equal the expected mode.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub mode: Option<ScoringMode>,
    // The list fields below deserialize to empty when their key is missing.
    #[serde(default)]
    pub aggregate_metrics: Vec<MetricValue>,
    #[serde(default)]
    pub run_metrics: Vec<RunMetricResult>,
    #[serde(default)]
    pub public_results: Vec<PublicCaseResult>,
    // Both summaries are optional here; `validate_for_mode` requires the one
    // that matches the evaluation's mode.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub validation_summary: Option<ScoreSummary>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub official_summary: Option<ScoreSummary>,
    // Log entries embedded in the result; their total byte size is capped by
    // `validate_size_limits`.
    #[serde(default)]
    pub logs: Vec<String>,
}
293
294impl ScoreSummary {
295 pub fn validate(&self, label: &str) -> Result<(), String> {
297 validate_finite_number(self.score, &format!("{label}.score"))?;
298 if self.passed < 0 {
299 return Err(format!("{label}.passed must be >= 0"));
300 }
301 if self.total < 0 {
302 return Err(format!("{label}.total must be >= 0"));
303 }
304 if self.passed > self.total {
305 return Err(format!("{label}.passed cannot be greater than total"));
306 }
307
308 Ok(())
309 }
310}
311
312impl PublicCaseResult {
313 pub fn validate(&self) -> Result<(), String> {
315 if self.case_name.trim().is_empty() {
316 return Err("public_results.case_name must not be empty".to_string());
317 }
318 validate_finite_number(self.score, "public_results.score")
319 }
320}
321
322impl MetricValue {
323 pub fn validate(&self, field: &str) -> Result<(), String> {
325 validate_finite_number(self.value, &format!("{field}.value"))
326 }
327
328 pub fn find_by_name(metrics: &[Self], metric_name: &MetricName) -> Option<Self> {
330 metrics
331 .iter()
332 .find(|metric| &metric.metric_name == metric_name)
333 .cloned()
334 }
335}
336
337impl RunMetricResult {
338 pub fn validate(&self) -> Result<(), String> {
340 let mut metric_names = HashSet::with_capacity(self.metrics.len());
341 for metric in &self.metrics {
342 metric.validate("run_metrics.metrics")?;
343 if !metric_names.insert(metric.metric_name.as_str()) {
344 return Err(format!(
345 "run_metrics.metrics contains duplicate metric_name `{}` for run `{}`",
346 metric.metric_name, self.run_name
347 ));
348 }
349 }
350
351 Ok(())
352 }
353}
354
355impl EvaluatorRunResult {
356 pub fn validate_size_limits(
358 &self,
359 max_public_results: u64,
360 max_result_log_bytes: u64,
361 ) -> Result<(), String> {
362 let public_result_count = u64::try_from(self.public_results.len())
363 .map_err(|_| "public_results count exceeds supported range".to_string())?;
364 if public_result_count > max_public_results {
365 return Err(format!(
366 "public_results contains too many entries: {public_result_count} > {max_public_results}"
367 ));
368 }
369
370 let mut log_bytes = 0u64;
371 for log in &self.logs {
372 let len = u64::try_from(log.len())
373 .map_err(|_| "result.logs byte length exceeds supported range".to_string())?;
374 log_bytes = log_bytes
375 .checked_add(len)
376 .ok_or_else(|| "result.logs byte length overflow".to_string())?;
377 if log_bytes > max_result_log_bytes {
378 return Err(format!(
379 "result.logs exceeds byte limit: {log_bytes} > {max_result_log_bytes} bytes"
380 ));
381 }
382 }
383
384 Ok(())
385 }
386
387 pub fn validate_for_mode(&self, mode: ScoringMode) -> Result<(), String> {
391 if let Some(result_mode) = self.mode
392 && result_mode != mode
393 {
394 return Err("result mode does not match evaluation job type".to_string());
395 }
396
397 validate_metric_values(&self.aggregate_metrics, "aggregate_metrics")?;
398
399 let mut run_names = HashSet::with_capacity(self.run_metrics.len());
400 for run in &self.run_metrics {
401 run.validate()?;
402 if !run_names.insert(run.run_name.as_str()) {
403 return Err(format!(
404 "run_metrics contains duplicate run_name `{}`",
405 run.run_name
406 ));
407 }
408 }
409
410 for public_result in &self.public_results {
411 public_result.validate()?;
412 }
413
414 if let Some(validation) = &self.validation_summary {
415 validation.validate("validation_summary")?;
416 }
417 if let Some(official) = &self.official_summary {
418 official.validate("official_summary")?;
419 }
420
421 if self.validation_summary.is_none() && self.official_summary.is_none() {
422 return Err(
423 "validation_summary and official_summary cannot both be absent".to_string(),
424 );
425 }
426 if mode == ScoringMode::Validation && self.validation_summary.is_none() {
427 return Err("validation evaluation requires validation_summary".to_string());
428 }
429 if mode == ScoringMode::Official && self.official_summary.is_none() {
430 return Err("official evaluation requires official_summary".to_string());
431 }
432
433 Ok(())
434 }
435
436 pub fn complete_metric_result(
438 &mut self,
439 schema: &MetricSchemaSpec,
440 mode: ScoringMode,
441 ) -> Result<(), String> {
442 self.validate_for_metric_schema(schema, mode)
443 }
444
445 pub fn validate_for_metric_schema(
447 &self,
448 schema: &MetricSchemaSpec,
449 mode: ScoringMode,
450 ) -> Result<(), String> {
451 let declared = schema
452 .metrics
453 .iter()
454 .map(|metric| (metric.name.as_str(), metric))
455 .collect::<std::collections::HashMap<_, _>>();
456 if declared.is_empty() {
457 return Err("metric schema must declare at least one metric".to_string());
458 }
459
460 for metric in &self.aggregate_metrics {
461 let Some(definition) = declared.get(metric.metric_name.as_str()) else {
462 return Err(format!(
463 "aggregate_metrics references unknown metric `{}`",
464 metric.metric_name
465 ));
466 };
467 validate_metric_visibility(mode, definition.visibility, &metric.metric_name)?;
468 }
469
470 for run in &self.run_metrics {
471 for metric in &run.metrics {
472 let Some(definition) = declared.get(metric.metric_name.as_str()) else {
473 return Err(format!(
474 "run_metrics references unknown metric `{}`",
475 metric.metric_name
476 ));
477 };
478 validate_metric_visibility(mode, definition.visibility, &metric.metric_name)?;
479 }
480 }
481
482 if mode == ScoringMode::Official
483 && !self
484 .aggregate_metrics
485 .iter()
486 .any(|metric| metric.metric_name == schema.ranking.primary_metric_name)
487 {
488 return Err(format!(
489 "aggregate_metrics missing primary metric `{}`",
490 schema.ranking.primary_metric_name
491 ));
492 }
493
494 Ok(())
495 }
496}
497
498pub fn compare_metric_payloads_by_ranking(
502 schema: &MetricSchemaSpec,
503 a_metrics: &[MetricValue],
504 b_metrics: &[MetricValue],
505) -> Ordering {
506 let Some(primary) = schema.primary_metric() else {
507 return Ordering::Equal;
508 };
509 let primary_order = compare_metric_by_direction(
510 primary.direction,
511 metric_value_by_name(a_metrics, &schema.ranking.primary_metric_name),
512 metric_value_by_name(b_metrics, &schema.ranking.primary_metric_name),
513 );
514 if primary_order != Ordering::Equal {
515 return primary_order;
516 }
517
518 for metric_name in &schema.ranking.tie_breaker_metric_names {
519 let Some(definition) = schema.metric(metric_name) else {
520 continue;
521 };
522 let ordering = compare_metric_by_direction(
523 definition.direction,
524 metric_value_by_name(a_metrics, metric_name),
525 metric_value_by_name(b_metrics, metric_name),
526 );
527 if ordering != Ordering::Equal {
528 return ordering;
529 }
530 }
531
532 Ordering::Equal
533}
534
535pub fn metric_value_by_name(metrics: &[MetricValue], metric_name: &MetricName) -> Option<f64> {
537 metrics
538 .iter()
539 .find(|metric| &metric.metric_name == metric_name)
540 .map(|metric| metric.value)
541}
542
543fn compare_metric_by_direction(
545 direction: MetricDirection,
546 a: Option<f64>,
547 b: Option<f64>,
548) -> Ordering {
549 match (a, b) {
550 (Some(a), Some(b)) => match direction {
551 MetricDirection::Maximize => compare_f64_desc(a, b),
552 MetricDirection::Minimize => compare_f64_asc(a, b),
553 },
554 (Some(_), None) => Ordering::Less,
555 (None, Some(_)) => Ordering::Greater,
556 (None, None) => Ordering::Equal,
557 }
558}
559
/// Orders two values so that the larger one comes first: `Ordering::Less`
/// means `a` ranks ahead of `b`.
///
/// `NaN` has no defined order under `partial_cmp`. Reporting it as equal to
/// every value would make the comparator non-transitive (`1 == NaN`,
/// `NaN == 2`, yet `1 != 2`), which is not a valid ordering for sorting. A
/// `NaN` is therefore ranked after every number, and two `NaN`s compare equal.
/// Results for non-`NaN` inputs are unchanged.
fn compare_f64_desc(a: f64, b: f64) -> Ordering {
    match (a.is_nan(), b.is_nan()) {
        (true, true) => Ordering::Equal,
        (true, false) => Ordering::Greater,
        (false, true) => Ordering::Less,
        // Neither side is NaN here, so `partial_cmp` always yields `Some`.
        (false, false) => b.partial_cmp(&a).unwrap_or(Ordering::Equal),
    }
}
564
/// Orders two values so that the smaller one comes first: `Ordering::Less`
/// means `a` ranks ahead of `b`.
///
/// `NaN` has no defined order under `partial_cmp`. Reporting it as equal to
/// every value would make the comparator non-transitive (`1 == NaN`,
/// `NaN == 2`, yet `1 != 2`), which is not a valid ordering for sorting. A
/// `NaN` is therefore ranked after every number, and two `NaN`s compare equal.
/// Results for non-`NaN` inputs are unchanged.
fn compare_f64_asc(a: f64, b: f64) -> Ordering {
    match (a.is_nan(), b.is_nan()) {
        (true, true) => Ordering::Equal,
        (true, false) => Ordering::Greater,
        (false, true) => Ordering::Less,
        // Neither side is NaN here, so `partial_cmp` always yields `Some`.
        (false, false) => a.partial_cmp(&b).unwrap_or(Ordering::Equal),
    }
}
569
/// Accepts `value` only when it is finite (neither `NaN` nor infinite).
///
/// `field` names the offending field in the error message.
fn validate_finite_number(value: f64, field: &str) -> Result<(), String> {
    if value.is_finite() {
        Ok(())
    } else {
        Err(format!("{field} must be finite"))
    }
}
578
579fn validate_metric_values(metrics: &[MetricValue], field: &str) -> Result<(), String> {
581 let mut metric_names = HashSet::with_capacity(metrics.len());
582 for metric in metrics {
583 metric.validate(field)?;
584 if !metric_names.insert(metric.metric_name.as_str()) {
585 return Err(format!(
586 "{field} contains duplicate metric_name `{}`",
587 metric.metric_name
588 ));
589 }
590 }
591
592 Ok(())
593}
594
595fn validate_metric_visibility(
597 mode: ScoringMode,
598 visibility: MetricVisibility,
599 metric_name: &MetricName,
600) -> Result<(), String> {
601 if mode == ScoringMode::Validation && visibility == MetricVisibility::Official {
602 return Err(format!(
603 "validation results cannot include official-only metric `{metric_name}`"
604 ));
605 }
606
607 Ok(())
608}
609
// Unit tests for evaluator result validation, metric-schema checks and
// ranking comparison.
#[cfg(test)]
mod tests {
    use crate::models::challenge::{
        MetricDefinitionSpec, MetricDirection, MetricSchemaSpec, MetricVisibility, RankingSpec,
    };
    use crate::models::names::{MetricName, RunName};

    use super::{
        EvaluatorCaseStatus, EvaluatorRunResult, EvaluatorRunStatus, MetricValue, RunMetricResult,
        ScoreSummary, ScoringMode,
    };

    // Builds a `MetricName` from a literal, panicking if the name is rejected.
    fn metric_name(value: &str) -> MetricName {
        MetricName::try_new(value.to_string()).expect("test metric name is valid")
    }

    // Builds a `RunName` from a literal, panicking if the name is rejected.
    fn run_name(value: &str) -> RunName {
        RunName::try_new(value.to_string()).expect("test run name is valid")
    }

    // Baseline fixture: a passing validation-mode result with no metrics, no
    // public results, no logs, and only a validation summary. Individual
    // tests mutate the fields they care about.
    fn valid_validation_result() -> EvaluatorRunResult {
        EvaluatorRunResult {
            status: EvaluatorRunStatus::Passed,
            mode: Some(ScoringMode::Validation),
            aggregate_metrics: vec![],
            run_metrics: vec![],
            public_results: vec![],
            validation_summary: Some(ScoreSummary {
                score: 1.0,
                passed: 1,
                total: 1,
            }),
            official_summary: None,
            logs: vec![],
        }
    }

    // A result that declares `official` must be rejected when validated as a
    // validation run, even though both summaries are present.
    #[test]
    fn evaluator_mode_mismatch_is_rejected() {
        let mut result = valid_validation_result();
        result.mode = Some(ScoringMode::Official);
        result.official_summary = Some(ScoreSummary {
            score: 1.0,
            passed: 1,
            total: 1,
        });

        assert!(result.validate_for_mode(ScoringMode::Validation).is_err());
    }

    // Omitting `mode` is allowed; only a conflicting value is an error.
    #[test]
    fn evaluator_mode_can_be_absent() {
        let mut result = valid_validation_result();
        result.mode = None;

        assert!(result.validate_for_mode(ScoringMode::Validation).is_ok());
    }

    // A result with no metrics passes the schema check in validation mode, and
    // `complete_metric_result` does not add any aggregate metrics.
    #[test]
    fn evaluator_output_with_declared_metrics_is_valid() {
        let mut result = valid_validation_result();
        result
            .complete_metric_result(&MetricSchemaSpec::default(), ScoringMode::Validation)
            .unwrap();

        assert!(result.aggregate_metrics.is_empty());
    }

    // With a `Minimize` primary metric the smaller value ranks first (`Less`).
    #[test]
    fn minimized_primary_metric_ranks_smaller_values_first() {
        let schema = MetricSchemaSpec {
            metrics: vec![MetricDefinitionSpec {
                name: metric_name("latency_ms"),
                label: "Latency".to_string(),
                unit: Some("ms".to_string()),
                direction: MetricDirection::Minimize,
                visibility: MetricVisibility::Public,
                metric_description: None,
            }],
            ranking: RankingSpec {
                primary_metric_name: metric_name("latency_ms"),
                tie_breaker_metric_names: vec![],
            },
        };
        let faster = vec![MetricValue {
            metric_name: metric_name("latency_ms"),
            value: 7.0,
        }];
        let slower = vec![MetricValue {
            metric_name: metric_name("latency_ms"),
            value: 42.0,
        }];

        assert_eq!(
            super::compare_metric_payloads_by_ranking(&schema, &faster, &slower),
            std::cmp::Ordering::Less
        );
    }

    // Uses the default schema, which this test expects to rank a `score`
    // metric with larger values first.
    #[test]
    fn maximized_primary_metric_ranks_larger_values_first() {
        let schema = MetricSchemaSpec::default();
        let better = vec![MetricValue {
            metric_name: metric_name("score"),
            value: 42.0,
        }];
        let worse = vec![MetricValue {
            metric_name: metric_name("score"),
            value: 7.0,
        }];

        assert_eq!(
            super::compare_metric_payloads_by_ranking(&schema, &better, &worse),
            std::cmp::Ordering::Less
        );
    }

    // When the primary metric ties, the declared tie-breaker decides.
    #[test]
    fn ranking_uses_declared_tie_breakers() {
        let schema = MetricSchemaSpec {
            metrics: vec![
                MetricDefinitionSpec {
                    name: metric_name("score"),
                    label: "Score".to_string(),
                    unit: None,
                    direction: MetricDirection::Maximize,
                    visibility: MetricVisibility::Public,
                    metric_description: None,
                },
                MetricDefinitionSpec {
                    name: metric_name("passed_cases"),
                    label: "Passed Cases".to_string(),
                    unit: Some("cases".to_string()),
                    direction: MetricDirection::Maximize,
                    visibility: MetricVisibility::Public,
                    metric_description: None,
                },
            ],
            ranking: RankingSpec {
                primary_metric_name: metric_name("score"),
                tie_breaker_metric_names: vec![metric_name("passed_cases")],
            },
        };
        let better = vec![
            MetricValue {
                metric_name: metric_name("score"),
                value: 1.0,
            },
            MetricValue {
                metric_name: metric_name("passed_cases"),
                value: 3.0,
            },
        ];
        let worse = vec![
            MetricValue {
                metric_name: metric_name("score"),
                value: 1.0,
            },
            MetricValue {
                metric_name: metric_name("passed_cases"),
                value: 1.0,
            },
        ];

        assert_eq!(
            super::compare_metric_payloads_by_ranking(&schema, &better, &worse),
            std::cmp::Ordering::Less
        );
    }

    // `deny_unknown_fields` on `EvaluatorRunResult` must reject a payload that
    // carries a `rank_score` key, and the error must name that key.
    #[test]
    fn evaluator_result_rejects_rank_score_field() {
        let raw = serde_json::json!({
            "status": "passed",
            "mode": "validation",
            "rank_score": 1.0,
            "validation_summary": { "score": 1.0, "passed": 1, "total": 1 }
        });

        let error = serde_json::from_value::<EvaluatorRunResult>(raw)
            .expect_err("rank_score should be rejected as an unknown field");
        assert!(error.to_string().contains("rank_score"));
    }

    // An aggregate metric whose name is not declared in the schema is an error.
    #[test]
    fn unknown_aggregate_metric_is_rejected() {
        let mut result = valid_validation_result();
        result.aggregate_metrics = vec![MetricValue {
            metric_name: metric_name("unknown"),
            value: 1.0,
        }];

        assert!(
            result
                .complete_metric_result(&MetricSchemaSpec::default(), ScoringMode::Validation)
                .is_err()
        );
    }

    // A NaN aggregate metric value fails mode validation.
    #[test]
    fn non_finite_metric_value_is_rejected() {
        let mut result = valid_validation_result();
        result.aggregate_metrics = vec![MetricValue {
            metric_name: metric_name("score"),
            value: f64::NAN,
        }];

        assert!(result.validate_for_mode(ScoringMode::Validation).is_err());
    }

    // Per-run metrics that reference a declared metric pass the schema check.
    #[test]
    fn per_run_metrics_are_validated() {
        let mut result = valid_validation_result();
        result.aggregate_metrics = vec![MetricValue {
            metric_name: metric_name("score"),
            value: 1.0,
        }];
        result.run_metrics = vec![RunMetricResult {
            run_name: run_name("case-1"),
            metrics: vec![MetricValue {
                metric_name: metric_name("score"),
                value: 1.0,
            }],
        }];

        assert!(
            result
                .complete_metric_result(&MetricSchemaSpec::default(), ScoringMode::Validation)
                .is_ok()
        );
    }

    // A metric declared with `Official` visibility must not appear in a
    // validation-mode result.
    #[test]
    fn validation_result_rejects_official_only_metrics() {
        let schema = MetricSchemaSpec {
            metrics: vec![MetricDefinitionSpec {
                name: metric_name("private_quality"),
                label: "Private Quality".to_string(),
                unit: None,
                direction: MetricDirection::Maximize,
                visibility: MetricVisibility::Official,
                metric_description: None,
            }],
            ranking: RankingSpec {
                primary_metric_name: metric_name("private_quality"),
                tie_breaker_metric_names: vec![],
            },
        };
        let mut result = valid_validation_result();
        result.aggregate_metrics = vec![MetricValue {
            metric_name: metric_name("private_quality"),
            value: 0.9,
        }];

        assert!(
            result
                .complete_metric_result(&schema, ScoringMode::Validation)
                .is_err()
        );
    }

    // An official result without the schema's primary metric among its
    // aggregate metrics is rejected with a specific message.
    #[test]
    fn official_result_requires_primary_metric() {
        let mut result = valid_validation_result();
        result.mode = Some(ScoringMode::Official);
        result.validation_summary = None;
        result.official_summary = Some(ScoreSummary {
            score: 3.25,
            passed: 1,
            total: 1,
        });

        let error = result
            .complete_metric_result(&MetricSchemaSpec::default(), ScoringMode::Official)
            .expect_err("official result should require primary aggregate metric");
        assert!(error.contains("aggregate_metrics missing primary metric"));
    }

    // Scores are not range-limited: any finite value (including values above 1
    // and negative values) is accepted, while infinity and NaN are rejected.
    #[test]
    fn summary_and_public_case_scores_accept_arbitrary_finite_values() {
        let summary = ScoreSummary {
            score: 42.0,
            passed: 1,
            total: 1,
        };
        assert!(summary.validate("validation_summary").is_ok());

        let public_case = super::PublicCaseResult {
            case_name: "case-1".to_string(),
            status: EvaluatorCaseStatus::Passed,
            score: -7.5,
            message: None,
        };
        assert!(public_case.validate().is_ok());

        let invalid_summary = ScoreSummary {
            score: f64::INFINITY,
            passed: 1,
            total: 1,
        };
        assert!(invalid_summary.validate("validation_summary").is_err());

        let invalid_public_case = super::PublicCaseResult {
            case_name: "case-2".to_string(),
            status: EvaluatorCaseStatus::Passed,
            score: f64::NAN,
            message: None,
        };
        assert!(invalid_public_case.validate().is_err());
    }

    // Both size caps are enforced: two public results against a cap of one,
    // and eight log bytes against a cap of seven.
    #[test]
    fn evaluator_result_size_limits_are_enforced() {
        let mut result = valid_validation_result();
        result.public_results = vec![
            super::PublicCaseResult {
                case_name: "case-1".to_string(),
                status: EvaluatorCaseStatus::Passed,
                score: 1.0,
                message: None,
            },
            super::PublicCaseResult {
                case_name: "case-2".to_string(),
                status: EvaluatorCaseStatus::Passed,
                score: 1.0,
                message: None,
            },
        ];

        let public_result_error = result
            .validate_size_limits(1, 1024)
            .expect_err("public result count should be capped");
        assert!(public_result_error.contains("public_results"));

        let mut result = valid_validation_result();
        result.logs = vec!["abcd".to_string(), "efgh".to_string()];

        let log_error = result
            .validate_size_limits(1024, 7)
            .expect_err("embedded result logs should be capped");
        assert!(log_error.contains("result.logs"));
    }
}
970
// Data-transfer representation of an evaluation job: its id, the target it
// applies to, and its current status.
#[derive(Debug, Clone, Serialize, Deserialize, schemars::JsonSchema)]
pub struct EvaluationJobDto {
    pub id: EvaluationJobId,
    pub target: TargetName,
    pub status: EvaluationJobStatus,
}
978
// Size, file-count and digest metadata recorded for a solution artifact.
// Unlike most types in this file it also derives `PartialEq`/`Eq`, so two
// metadata records can be compared directly.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, schemars::JsonSchema)]
pub struct SolutionArtifactMetadata {
    // Byte size of the zip archive (per the field name).
    pub artifact_zip_bytes: u64,
    // Byte size of the contents once uncompressed (per the field name).
    pub artifact_uncompressed_bytes: u64,
    // Number of files in the artifact (per the field name).
    pub artifact_file_count: u64,
    // SHA-256 digest of the artifact.
    // NOTE(review): confirm whether the digest covers the zip or the extracted contents.
    pub artifact_sha256: Sha256Digest,
}
991
// Payload describing an evaluation job: the storage keys of the artifact and
// the two bundles, plus the challenge and target names.
// NOTE(review): how `bundle_key` and `public_bundle_key` differ is defined by
// the code that produces and consumes this payload, not by this file.
#[derive(Debug, Clone, Serialize, Deserialize, schemars::JsonSchema)]
pub struct EvaluationJobPayload {
    pub artifact_key: StorageKey,
    pub bundle_key: StorageKey,
    pub public_bundle_key: StorageKey,
    pub challenge_name: ChallengeName,
    pub target: TargetName,
}