1use serde::{Deserialize, Serialize};
15
/// Highest average friction score at which an evaluation still passes.
///
/// [`EvalReport::from_results`] compares the mean score against this value with `<=`.
pub const TARGET_AVERAGE_SCORE: f64 = 1.5;

/// Number of built-in scenarios; this is the array length returned by [`all_scenarios`].
pub const SCENARIO_COUNT: usize = 5;
21
/// Identifier of a built-in evaluation scenario.
///
/// One variant exists per scenario constructor in this file.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum ScenarioId {
    /// Scenario built by [`scenario_basic_lifecycle`].
    BasicLifecycle,
    /// Scenario built by [`scenario_multi_file_edit`].
    MultiFileEdit,
    /// Scenario built by [`scenario_multi_agent`].
    MultiAgent,
    /// Scenario built by [`scenario_conflict_resolution`].
    ConflictResolution,
    /// Scenario built by [`scenario_read_only_inspection`].
    ReadOnlyInspection,
}
40
/// A statically defined evaluation scenario.
///
/// All data is `'static`, which lets the scenario constructors be `const fn`s.
/// Only `Serialize` is derived, so the type can be written out but not read back.
#[derive(Debug, Clone, Serialize)]
pub struct Scenario {
    /// Which scenario this is.
    pub id: ScenarioId,
    /// Short kebab-case name, e.g. `"basic-lifecycle"`.
    pub name: &'static str,
    /// One-line description of what the scenario exercises.
    pub tests: &'static str,
    /// Starting state the scenario assumes (seed files and pre-existing workspaces).
    pub preconditions: Preconditions,
    /// Instruction text addressed to the agent under evaluation.
    pub task_prompt: &'static str,
    /// Human-readable statements that should hold once the task is done.
    pub expected_outcomes: &'static [&'static str],
    /// Human-readable list of the steps an agent is expected to take, in order.
    pub expected_commands: &'static [&'static str],
    /// Step count of the ideal run; in the built-in scenarios this equals the
    /// number of entries in `expected_commands`.
    pub optimal_command_count: u32,
}
66
/// Starting state assumed by a [`Scenario`].
#[derive(Debug, Clone, Serialize)]
pub struct Preconditions {
    /// Prose description of the repository state.
    pub repo_state: &'static str,
    /// Files making up the seed project.
    pub seed_files: &'static [SeedFile],
    /// Workspaces that already exist when the task starts; empty when the
    /// scenario begins without any.
    pub existing_workspaces: &'static [WorkspaceSetup],
}
77
/// A file path paired with its full text content.
///
/// Used both for the seed project and for files inside a [`WorkspaceSetup`].
#[derive(Debug, Clone, Serialize)]
pub struct SeedFile {
    /// Relative path of the file, e.g. `"src/lib.rs"`.
    pub path: &'static str,
    /// Complete text of the file.
    pub content: &'static str,
}
86
/// A workspace that exists before the scenario's task begins.
#[derive(Debug, Clone, Serialize)]
pub struct WorkspaceSetup {
    /// Workspace name, e.g. `"agent-1"`.
    pub name: &'static str,
    /// Files as they appear in this workspace; an empty slice is used for a
    /// workspace described as having no modifications.
    pub files: &'static [SeedFile],
}
95
/// Friction rating of a single scenario run, from 1 (best) to 5 (worst).
///
/// The explicit discriminants are the numeric scores returned by
/// [`FrictionScore::value`]. The derived `Ord` follows discriminant order, so a
/// smaller variant means less friction. The per-variant notes below describe how
/// [`RunMetrics::score`] assigns each tier.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
#[repr(u8)]
pub enum FrictionScore {
    /// Goal achieved with zero errors, retries, and confusion markers.
    Perfect = 1,
    /// Goal achieved with at most one error, one retry, and one confusion marker.
    Minor = 2,
    /// Goal achieved with at most three retries and three confusion markers
    /// (the error count is not bounded for this tier).
    Moderate = 3,
    /// Goal achieved, but with more than three retries or confusion markers.
    Difficult = 4,
    /// Goal not achieved.
    Failed = 5,
}
117
/// Counters recorded for one scenario run.
///
/// `Default` yields all-zero counters with `goal_achieved == false`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct RunMetrics {
    /// Count of tool calls. Not read by [`RunMetrics::score`].
    pub tool_calls: u32,
    /// Count of maw commands. Not read by [`RunMetrics::score`].
    pub maw_commands: u32,
    /// Count of errors; feeds the Perfect and Minor tiers of the score.
    pub errors: u32,
    /// Count of retries; feeds every non-failed tier of the score.
    pub retries: u32,
    /// Count of confusion markers; feeds every non-failed tier of the score.
    pub confusion_markers: u32,
    /// Whether the run reached its goal; `false` always scores as `Failed`.
    pub goal_achieved: bool,
    /// Count of recovery steps. Not read by [`RunMetrics::score`].
    pub recovery_steps: u32,
}
136
/// Outcome of running one scenario.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScenarioResult {
    /// Scenario this result belongs to.
    pub scenario_id: ScenarioId,
    /// Raw counters recorded for the run.
    pub metrics: RunMetrics,
    /// Friction score for the run. It is stored as given; nothing in this file
    /// enforces that it equals `metrics.score()`.
    pub score: FrictionScore,
    /// Per-scenario pass flag, supplied by whoever constructs the result.
    // NOTE(review): the criterion for a per-scenario pass is not defined in this file.
    pub passed: bool,
}
149
/// Aggregate report over a set of scenario results.
///
/// Normally built with [`EvalReport::from_results`], which derives the two
/// summary fields from `results`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalReport {
    /// The individual scenario results the report summarizes.
    pub results: Vec<ScenarioResult>,
    /// Arithmetic mean of the numeric friction scores in `results`.
    pub average_score: f64,
    /// Overall verdict, derived from `average_score` and [`TARGET_AVERAGE_SCORE`].
    pub passed: bool,
}
160
161impl RunMetrics {
162 #[must_use]
164 pub const fn score(&self) -> FrictionScore {
165 if !self.goal_achieved {
166 return FrictionScore::Failed;
167 }
168 if self.errors == 0 && self.retries == 0 && self.confusion_markers == 0 {
169 return FrictionScore::Perfect;
170 }
171 if self.errors <= 1 && self.retries <= 1 && self.confusion_markers <= 1 {
172 return FrictionScore::Minor;
173 }
174 if self.retries <= 3 && self.confusion_markers <= 3 {
175 return FrictionScore::Moderate;
176 }
177 FrictionScore::Difficult
178 }
179}
180
181impl FrictionScore {
182 #[must_use]
184 pub const fn value(self) -> u8 {
185 self as u8
186 }
187}
188
189impl EvalReport {
190 #[must_use]
192 pub fn from_results(results: Vec<ScenarioResult>) -> Self {
193 let sum: u32 = results.iter().map(|r| u32::from(r.score.value())).sum();
194 let count = u32::try_from(results.len().max(1)).unwrap_or(u32::MAX);
195 let average_score = f64::from(sum) / f64::from(count);
196 let passed = average_score <= TARGET_AVERAGE_SCORE;
197 Self {
198 results,
199 average_score,
200 passed,
201 }
202 }
203}
204
205#[must_use]
211pub const fn all_scenarios() -> [Scenario; SCENARIO_COUNT] {
212 [
213 scenario_basic_lifecycle(),
214 scenario_multi_file_edit(),
215 scenario_multi_agent(),
216 scenario_conflict_resolution(),
217 scenario_read_only_inspection(),
218 ]
219}
220
221#[must_use]
226pub const fn scenario_basic_lifecycle() -> Scenario {
227 Scenario {
228 id: ScenarioId::BasicLifecycle,
229 name: "basic-lifecycle",
230 tests: "Minimal lifecycle: create workspace, add file, merge, verify",
231 preconditions: Preconditions {
232 repo_state: "Fresh maw repo with a seed Rust project (Cargo.toml, src/main.rs, src/lib.rs)",
233 seed_files: &[
234 SeedFile {
235 path: "Cargo.toml",
236 content: concat!(
237 "[package]\n",
238 "name = \"agent-eval\"\n",
239 "version = \"0.1.0\"\n",
240 "edition = \"2021\"\n",
241 "\n",
242 "[dependencies]\n",
243 ),
244 },
245 SeedFile {
246 path: "src/main.rs",
247 content: "fn main() {\n println!(\"hello from eval\");\n}\n",
248 },
249 SeedFile {
250 path: "src/lib.rs",
251 content: "pub fn add(a: i32, b: i32) -> i32 {\n a + b\n}\n",
252 },
253 ],
254 existing_workspaces: &[],
255 },
256 task_prompt: concat!(
257 "You are working on a Rust project managed by maw.\n",
258 "\n",
259 "Task:\n",
260 "1. Create a workspace named \"agent-1\".\n",
261 "2. Add a new file src/hello.rs containing:\n",
262 " pub fn hello() -> &'static str { \"hello\" }\n",
263 "3. Merge workspace agent-1 back (destroy the workspace after merge).\n",
264 "4. Confirm that src/hello.rs exists in the main workspace (ws/default/).\n",
265 "\n",
266 "Use only maw commands and file operations. Do not use git directly.\n",
267 "Use absolute paths for all file operations.\n",
268 ),
269 expected_outcomes: &[
270 "src/hello.rs exists in ws/default/ with correct content",
271 "workspace agent-1 no longer exists (destroyed)",
272 "no git commands were used by the agent",
273 ],
274 expected_commands: &[
275 "maw ws create agent-1",
276 "write src/hello.rs (file operation)",
277 "maw ws merge agent-1 --destroy",
278 ],
279 optimal_command_count: 3,
280 }
281}
282
283#[must_use]
288pub const fn scenario_multi_file_edit() -> Scenario {
289 Scenario {
290 id: ScenarioId::MultiFileEdit,
291 name: "multi-file-edit",
292 tests: "Multiple file edits in one workspace: modify existing, add new, merge",
293 preconditions: Preconditions {
294 repo_state: "Rust project with src/main.rs, src/lib.rs, src/utils.rs",
295 seed_files: &[
296 SeedFile {
297 path: "Cargo.toml",
298 content: concat!(
299 "[package]\n",
300 "name = \"agent-eval\"\n",
301 "version = \"0.1.0\"\n",
302 "edition = \"2021\"\n",
303 "\n",
304 "[dependencies]\n",
305 ),
306 },
307 SeedFile {
308 path: "src/main.rs",
309 content: concat!(
310 "mod utils;\n",
311 "\n",
312 "fn main() {\n",
313 " let result = utils::format_greeting(\"world\");\n",
314 " println!(\"{result}\");\n",
315 "}\n",
316 ),
317 },
318 SeedFile {
319 path: "src/lib.rs",
320 content: "pub fn add(a: i32, b: i32) -> i32 {\n a + b\n}\n",
321 },
322 SeedFile {
323 path: "src/utils.rs",
324 content: concat!(
325 "pub fn format_greeting(name: &str) -> String {\n",
326 " format!(\"Hello, {name}!\")\n",
327 "}\n",
328 ),
329 },
330 ],
331 existing_workspaces: &[],
332 },
333 task_prompt: concat!(
334 "You are working on a Rust project managed by maw.\n",
335 "\n",
336 "Task:\n",
337 "1. Create a workspace named \"feature-work\".\n",
338 "2. In the workspace, make these changes:\n",
339 " a. Modify src/lib.rs: add a new function `pub fn multiply(a: i32, b: i32) -> i32 { a * b }`\n",
340 " b. Modify src/utils.rs: add a new function `pub fn format_farewell(name: &str) -> String { format!(\"Goodbye, {name}!\") }`\n",
341 " c. Add a new file src/config.rs with: `pub const VERSION: &str = \"1.0.0\";`\n",
342 "3. Merge workspace feature-work back (destroy after merge).\n",
343 "4. Confirm all three changes are present in the main workspace.\n",
344 "\n",
345 "Use only maw commands and file operations. Do not use git directly.\n",
346 "Use absolute paths for all file operations.\n",
347 ),
348 expected_outcomes: &[
349 "src/lib.rs contains multiply function in ws/default/",
350 "src/utils.rs contains format_farewell function in ws/default/",
351 "src/config.rs exists with VERSION constant in ws/default/",
352 "workspace feature-work no longer exists",
353 ],
354 expected_commands: &[
355 "maw ws create feature-work",
356 "edit src/lib.rs (add multiply)",
357 "edit src/utils.rs (add format_farewell)",
358 "write src/config.rs",
359 "maw ws merge feature-work --destroy",
360 ],
361 optimal_command_count: 5,
362 }
363}
364
365#[must_use]
371pub const fn scenario_multi_agent() -> Scenario {
372 Scenario {
373 id: ScenarioId::MultiAgent,
374 name: "multi-agent",
375 tests: "Two workspaces with non-overlapping edits, sequential merge",
376 preconditions: Preconditions {
377 repo_state: concat!(
378 "Rust project with src/auth.rs, src/api.rs. ",
379 "Workspace 'agent-1' already exists with modifications to src/auth.rs.",
380 ),
381 seed_files: &[
382 SeedFile {
383 path: "Cargo.toml",
384 content: concat!(
385 "[package]\n",
386 "name = \"agent-eval\"\n",
387 "version = \"0.1.0\"\n",
388 "edition = \"2021\"\n",
389 "\n",
390 "[dependencies]\n",
391 ),
392 },
393 SeedFile {
394 path: "src/main.rs",
395 content: "fn main() {}\n",
396 },
397 SeedFile {
398 path: "src/auth.rs",
399 content: concat!(
400 "pub fn authenticate(user: &str) -> bool {\n",
401 " user == \"admin\"\n",
402 "}\n",
403 ),
404 },
405 SeedFile {
406 path: "src/api.rs",
407 content: concat!(
408 "pub fn handle_request(path: &str) -> String {\n",
409 " format!(\"OK: {path}\")\n",
410 "}\n",
411 ),
412 },
413 ],
414 existing_workspaces: &[WorkspaceSetup {
415 name: "agent-1",
416 files: &[SeedFile {
417 path: "src/auth.rs",
418 content: concat!(
419 "pub fn authenticate(user: &str) -> bool {\n",
420 " user == \"admin\" || user == \"root\"\n",
421 "}\n",
422 "\n",
423 "pub fn is_admin(user: &str) -> bool {\n",
424 " user == \"admin\"\n",
425 "}\n",
426 ),
427 }],
428 }],
429 },
430 task_prompt: concat!(
431 "You are agent-2 working on a Rust project managed by maw.\n",
432 "Another agent (agent-1) has already made changes in a workspace named \"agent-1\".\n",
433 "Agent-1 modified src/auth.rs (you don't need to know the details).\n",
434 "\n",
435 "Task:\n",
436 "1. Create a workspace named \"agent-2\".\n",
437 "2. In your workspace, modify src/api.rs: add a new function\n",
438 " `pub fn handle_error(code: u16) -> String { format!(\"Error: {code}\") }`\n",
439 "3. Merge BOTH workspaces (agent-1 and agent-2) back, destroying them.\n",
440 "4. Confirm that both sets of changes are present in the main workspace:\n",
441 " - src/auth.rs should contain an is_admin function\n",
442 " - src/api.rs should contain a handle_error function\n",
443 "\n",
444 "Use only maw commands and file operations. Do not use git directly.\n",
445 "Use absolute paths for all file operations.\n",
446 ),
447 expected_outcomes: &[
448 "src/auth.rs contains is_admin function in ws/default/",
449 "src/api.rs contains handle_error function in ws/default/",
450 "workspace agent-1 no longer exists",
451 "workspace agent-2 no longer exists",
452 ],
453 expected_commands: &[
454 "maw ws create agent-2",
455 "edit src/api.rs (add handle_error)",
456 "maw ws merge agent-1 --destroy",
457 "maw ws merge agent-2 --destroy",
458 ],
459 optimal_command_count: 4,
460 }
461}
462
463#[must_use]
469pub const fn scenario_conflict_resolution() -> Scenario {
470 Scenario {
471 id: ScenarioId::ConflictResolution,
472 name: "conflict-resolution",
473 tests: "Same-file conflict: detect, inspect, resolve, merge",
474 preconditions: Preconditions {
475 repo_state: concat!(
476 "Rust project with src/lib.rs. Two workspaces (left, right) both modify src/lib.rs. ",
477 "Workspace 'left' changes the add function body. ",
478 "Workspace 'right' also changes the add function body differently.",
479 ),
480 seed_files: &[
481 SeedFile {
482 path: "Cargo.toml",
483 content: concat!(
484 "[package]\n",
485 "name = \"agent-eval\"\n",
486 "version = \"0.1.0\"\n",
487 "edition = \"2021\"\n",
488 "\n",
489 "[dependencies]\n",
490 ),
491 },
492 SeedFile {
493 path: "src/main.rs",
494 content: "fn main() {}\n",
495 },
496 SeedFile {
497 path: "src/lib.rs",
498 content: concat!(
499 "pub fn add(a: i32, b: i32) -> i32 {\n",
500 " a + b\n",
501 "}\n",
502 ),
503 },
504 ],
505 existing_workspaces: &[
506 WorkspaceSetup {
507 name: "left",
508 files: &[SeedFile {
509 path: "src/lib.rs",
510 content: concat!(
511 "pub fn add(a: i32, b: i32) -> i32 {\n",
512 " let result = a + b;\n",
513 " assert!(result >= a.min(b), \"overflow\");\n",
514 " result\n",
515 "}\n",
516 ),
517 }],
518 },
519 WorkspaceSetup {
520 name: "right",
521 files: &[SeedFile {
522 path: "src/lib.rs",
523 content: concat!(
524 "pub fn add(a: i32, b: i32) -> i32 {\n",
525 " a.checked_add(b).expect(\"overflow\")\n",
526 "}\n",
527 ),
528 }],
529 },
530 ],
531 },
532 task_prompt: concat!(
533 "You are working on a Rust project managed by maw.\n",
534 "Two workspaces ('left' and 'right') both modified src/lib.rs.\n",
535 "\n",
536 "Task:\n",
537 "1. Try merging workspace 'left' — this should succeed.\n",
538 "2. Try merging workspace 'right' — this may produce a conflict.\n",
539 "3. If there is a conflict:\n",
540 " a. Inspect the conflicted file to understand both sides.\n",
541 " b. Resolve the conflict by keeping the checked_add approach from 'right'\n",
542 " (it's the safer implementation).\n",
543 " c. Complete the merge.\n",
544 "4. Confirm src/lib.rs in the main workspace uses checked_add.\n",
545 "5. Destroy both workspaces if not already destroyed.\n",
546 "\n",
547 "Use only maw commands and file operations. Do not use git directly.\n",
548 "Use absolute paths for all file operations.\n",
549 ),
550 expected_outcomes: &[
551 "src/lib.rs in ws/default/ uses checked_add",
552 "workspace left no longer exists",
553 "workspace right no longer exists",
554 "conflict was detected and resolved (not silently dropped)",
555 ],
556 expected_commands: &[
557 "maw ws merge left --destroy",
558 "maw ws merge right (conflict detected)",
559 "read/inspect conflicted file",
560 "edit src/lib.rs (resolve conflict)",
561 "maw ws merge right --destroy (or complete merge)",
562 ],
563 optimal_command_count: 5,
564 }
565}
566
567#[must_use]
573pub const fn scenario_read_only_inspection() -> Scenario {
574 Scenario {
575 id: ScenarioId::ReadOnlyInspection,
576 name: "read-only-inspection",
577 tests: "Inspect workspace state: list, status, files — without modifying anything",
578 preconditions: Preconditions {
579 repo_state: concat!(
580 "Rust project with two active workspaces. ",
581 "Workspace 'alice' has modified src/lib.rs (dirty). ",
582 "Workspace 'bob' has no modifications (clean).",
583 ),
584 seed_files: &[
585 SeedFile {
586 path: "Cargo.toml",
587 content: concat!(
588 "[package]\n",
589 "name = \"agent-eval\"\n",
590 "version = \"0.1.0\"\n",
591 "edition = \"2021\"\n",
592 "\n",
593 "[dependencies]\n",
594 ),
595 },
596 SeedFile {
597 path: "src/main.rs",
598 content: "fn main() {}\n",
599 },
600 SeedFile {
601 path: "src/lib.rs",
602 content: "pub fn add(a: i32, b: i32) -> i32 {\n a + b\n}\n",
603 },
604 ],
605 existing_workspaces: &[
606 WorkspaceSetup {
607 name: "alice",
608 files: &[SeedFile {
609 path: "src/lib.rs",
610 content: concat!(
611 "pub fn add(a: i32, b: i32) -> i32 {\n",
612 " a + b\n",
613 "}\n",
614 "\n",
615 "pub fn subtract(a: i32, b: i32) -> i32 {\n",
616 " a - b\n",
617 "}\n",
618 ),
619 }],
620 },
621 WorkspaceSetup {
622 name: "bob",
623 files: &[],
624 },
625 ],
626 },
627 task_prompt: concat!(
628 "You are a lead agent inspecting the state of a Rust project managed by maw.\n",
629 "DO NOT modify any files or merge anything — this is read-only inspection.\n",
630 "\n",
631 "Task:\n",
632 "1. List all workspaces. Report how many exist and their names.\n",
633 "2. Check the status of workspace 'alice'. Report whether it has dirty files.\n",
634 "3. Check the status of workspace 'bob'. Report whether it has dirty files.\n",
635 "4. Read the file src/lib.rs in alice's workspace. Report what function(s) it contains.\n",
636 "5. Read the file src/lib.rs in bob's workspace. Report what function(s) it contains.\n",
637 "\n",
638 "Output your findings as a structured summary.\n",
639 "\n",
640 "Use only maw commands and file read operations. Do not use git directly.\n",
641 "Do NOT merge, create, or destroy any workspaces.\n",
642 ),
643 expected_outcomes: &[
644 "agent lists 3 workspaces: default, alice, bob",
645 "agent reports alice has dirty files (src/lib.rs modified)",
646 "agent reports bob has no dirty files (clean)",
647 "agent reports alice's lib.rs has add and subtract functions",
648 "agent reports bob's lib.rs has only the add function",
649 "no workspaces were created, destroyed, or merged",
650 ],
651 expected_commands: &[
652 "maw ws list",
653 "maw ws status alice",
654 "maw ws status bob",
655 "read ws/alice/src/lib.rs",
656 "read ws/bob/src/lib.rs",
657 ],
658 optimal_command_count: 5,
659 }
660}
661
// Unit tests for the scenario catalogue and the scoring rubric. All clippy lint
// groups are switched off for this module by the attribute below.
#[cfg(test)]
#[allow(clippy::all, clippy::pedantic, clippy::nursery)]
mod tests {
    use super::*;

    /// `all_scenarios` yields exactly `SCENARIO_COUNT` entries.
    #[test]
    fn all_scenarios_returns_five() {
        assert_eq!(all_scenarios().len(), SCENARIO_COUNT);
    }

    /// No two scenarios share an ID.
    #[test]
    fn scenario_ids_are_unique() {
        let scenarios = all_scenarios();
        // `Vec::dedup` only collapses adjacent duplicates, and `ScenarioId` is
        // not `Ord`, so it cannot be sorted first. A hash set catches
        // duplicates regardless of position.
        let unique: std::collections::HashSet<_> = scenarios.iter().map(|s| s.id).collect();
        assert_eq!(
            unique.len(),
            scenarios.len(),
            "duplicate scenario IDs detected"
        );
    }

    /// No two scenarios share a name.
    #[test]
    fn scenario_names_are_unique() {
        let scenarios = all_scenarios();
        let mut names: Vec<_> = scenarios.iter().map(|s| s.name).collect();
        let original_len = names.len();
        // Sort first so that `dedup` sees equal names next to each other.
        names.sort_unstable();
        names.dedup();
        assert_eq!(
            names.len(),
            original_len,
            "duplicate scenario names detected"
        );
    }

    /// Every scenario fills in all of its descriptive fields.
    #[test]
    fn each_scenario_has_nonempty_fields() {
        for s in &all_scenarios() {
            assert!(!s.name.is_empty(), "{:?} has empty name", s.id);
            assert!(
                !s.tests.is_empty(),
                "{:?} has empty tests description",
                s.id
            );
            assert!(
                !s.task_prompt.is_empty(),
                "{:?} has empty task_prompt",
                s.id
            );
            assert!(
                !s.expected_outcomes.is_empty(),
                "{:?} has no expected outcomes",
                s.id
            );
            assert!(
                !s.expected_commands.is_empty(),
                "{:?} has no expected commands",
                s.id
            );
            assert!(
                s.optimal_command_count > 0,
                "{:?} has zero optimal command count",
                s.id
            );
        }
    }

    /// Every scenario ships at least one seed file.
    #[test]
    fn each_scenario_has_seed_files() {
        for s in &all_scenarios() {
            assert!(
                !s.preconditions.seed_files.is_empty(),
                "{:?} has no seed files",
                s.id
            );
        }
    }

    /// The conflict scenario needs two competing workspaces.
    #[test]
    fn conflict_scenario_has_two_workspaces() {
        let conflict = scenario_conflict_resolution();
        assert_eq!(
            conflict.preconditions.existing_workspaces.len(),
            2,
            "conflict scenario must have exactly 2 pre-existing workspaces"
        );
    }

    /// The read-only scenario inspects two pre-existing workspaces.
    #[test]
    fn read_only_scenario_has_two_workspaces() {
        let readonly = scenario_read_only_inspection();
        assert_eq!(
            readonly.preconditions.existing_workspaces.len(),
            2,
            "read-only scenario must have exactly 2 pre-existing workspaces"
        );
    }

    /// Both the conflict and the read-only scenario are part of the catalogue.
    #[test]
    fn conflict_and_readonly_scenarios_present() {
        let scenarios = all_scenarios();
        let has_conflict = scenarios
            .iter()
            .any(|s| s.id == ScenarioId::ConflictResolution);
        let has_readonly = scenarios
            .iter()
            .any(|s| s.id == ScenarioId::ReadOnlyInspection);
        assert!(has_conflict, "must have a conflict handling scenario");
        assert!(has_readonly, "must have a read-only inspection scenario");
    }

    /// Task prompts may mention VCS terms only as part of a prohibition
    /// ("do not use ..." / "don't use ...").
    #[test]
    fn task_prompts_do_not_mention_vcs() {
        let forbidden = ["git ", "branch", "commit", "checkout", "rebase"];
        for s in &all_scenarios() {
            // Lowercase once per scenario rather than once per forbidden word.
            let prompt_lower = s.task_prompt.to_lowercase();
            for word in &forbidden {
                for (idx, _) in prompt_lower.match_indices(word) {
                    // Context window: about 20 bytes before and 30 bytes after
                    // the start of the match. The prompts contain multi-byte
                    // characters, so widen the window to the nearest char
                    // boundaries; slicing at a raw byte offset could panic.
                    // Offsets 0 and `len()` are always boundaries, so both
                    // loops terminate.
                    let mut start = idx.saturating_sub(20);
                    while !prompt_lower.is_char_boundary(start) {
                        start -= 1;
                    }
                    let mut end = prompt_lower.len().min(idx + 30);
                    while !prompt_lower.is_char_boundary(end) {
                        end += 1;
                    }
                    let context = &prompt_lower[start..end];
                    assert!(
                        context.contains("do not use") || context.contains("don't use"),
                        "{:?} task prompt mentions '{}' outside of prohibition context: ...{}...",
                        s.id,
                        word.trim(),
                        context,
                    );
                }
            }
        }
    }

    /// The pass threshold is within the score scale and pinned to 1.5.
    #[test]
    fn target_threshold_is_encoded() {
        assert!(
            TARGET_AVERAGE_SCORE > 0.0 && TARGET_AVERAGE_SCORE <= 5.0,
            "target must be between 0 and 5"
        );
        assert!(
            (TARGET_AVERAGE_SCORE - 1.5).abs() < f64::EPSILON,
            "target must be 1.5 per bead spec"
        );
    }

    /// A clean, successful run scores 1.
    #[test]
    fn perfect_run_scores_1() {
        let metrics = RunMetrics {
            tool_calls: 5,
            maw_commands: 3,
            errors: 0,
            retries: 0,
            confusion_markers: 0,
            goal_achieved: true,
            recovery_steps: 0,
        };
        assert_eq!(metrics.score(), FrictionScore::Perfect);
        assert_eq!(metrics.score().value(), 1);
    }

    /// A single error on an otherwise clean run scores 2.
    #[test]
    fn minor_error_scores_2() {
        let metrics = RunMetrics {
            tool_calls: 7,
            maw_commands: 4,
            errors: 1,
            retries: 0,
            confusion_markers: 0,
            goal_achieved: true,
            recovery_steps: 1,
        };
        assert_eq!(metrics.score(), FrictionScore::Minor);
        assert_eq!(metrics.score().value(), 2);
    }

    /// Up to three retries and confusion markers scores 3.
    #[test]
    fn moderate_difficulty_scores_3() {
        let metrics = RunMetrics {
            tool_calls: 12,
            maw_commands: 6,
            errors: 2,
            retries: 3,
            confusion_markers: 2,
            goal_achieved: true,
            recovery_steps: 4,
        };
        assert_eq!(metrics.score(), FrictionScore::Moderate);
        assert_eq!(metrics.score().value(), 3);
    }

    /// A successful run with heavy retrying and confusion scores 4.
    #[test]
    fn difficult_scores_4() {
        let metrics = RunMetrics {
            tool_calls: 20,
            maw_commands: 10,
            errors: 5,
            retries: 6,
            confusion_markers: 4,
            goal_achieved: true,
            recovery_steps: 8,
        };
        assert_eq!(metrics.score(), FrictionScore::Difficult);
        assert_eq!(metrics.score().value(), 4);
    }

    /// A run that misses its goal scores 5 regardless of the other counters.
    #[test]
    fn failed_run_scores_5() {
        let metrics = RunMetrics {
            tool_calls: 15,
            maw_commands: 8,
            errors: 3,
            retries: 2,
            confusion_markers: 1,
            goal_achieved: false,
            recovery_steps: 5,
        };
        assert_eq!(metrics.score(), FrictionScore::Failed);
        assert_eq!(metrics.score().value(), 5);
    }

    /// An average exactly at the threshold still passes.
    #[test]
    fn eval_report_passes_below_threshold() {
        let results = vec![
            ScenarioResult {
                scenario_id: ScenarioId::BasicLifecycle,
                metrics: RunMetrics {
                    goal_achieved: true,
                    ..Default::default()
                },
                score: FrictionScore::Perfect,
                passed: true,
            },
            ScenarioResult {
                scenario_id: ScenarioId::MultiFileEdit,
                metrics: RunMetrics {
                    goal_achieved: true,
                    errors: 1,
                    ..Default::default()
                },
                score: FrictionScore::Minor,
                passed: true,
            },
        ];
        let report = EvalReport::from_results(results);
        // (1 + 2) / 2 is exactly representable, so exact comparison is safe.
        assert_eq!(report.average_score, 1.5);
        assert!(report.passed);
    }

    /// An average above the threshold fails.
    #[test]
    fn eval_report_fails_above_threshold() {
        let results = vec![
            ScenarioResult {
                scenario_id: ScenarioId::BasicLifecycle,
                metrics: RunMetrics {
                    goal_achieved: true,
                    ..Default::default()
                },
                score: FrictionScore::Perfect,
                passed: true,
            },
            ScenarioResult {
                scenario_id: ScenarioId::ConflictResolution,
                metrics: RunMetrics {
                    goal_achieved: true,
                    errors: 3,
                    retries: 4,
                    confusion_markers: 5,
                    ..Default::default()
                },
                score: FrictionScore::Difficult,
                passed: false,
            },
        ];
        let report = EvalReport::from_results(results);
        // (1 + 4) / 2 is exactly representable, so exact comparison is safe.
        assert_eq!(report.average_score, 2.5);
        assert!(!report.passed);
    }

    /// Every scenario serializes to a JSON object carrying the key fields.
    #[test]
    fn scenarios_serialize_to_json() {
        let scenarios = all_scenarios();
        for s in &scenarios {
            let json = serde_json::to_string(s).expect("scenario should serialize");
            assert!(!json.is_empty());
            // Parse the output back as a generic value to check its shape.
            let value: serde_json::Value =
                serde_json::from_str(&json).expect("should be valid JSON");
            assert!(value.is_object(), "scenario JSON should be an object");
            assert!(
                value.get("id").is_some(),
                "{:?} JSON missing 'id' field",
                s.id
            );
            assert!(
                value.get("task_prompt").is_some(),
                "{:?} JSON missing 'task_prompt' field",
                s.id
            );
        }
    }

    /// The derived ordering on `FrictionScore` runs from best to worst.
    #[test]
    fn scoring_rubric_is_monotonic() {
        assert!(FrictionScore::Perfect < FrictionScore::Minor);
        assert!(FrictionScore::Minor < FrictionScore::Moderate);
        assert!(FrictionScore::Moderate < FrictionScore::Difficult);
        assert!(FrictionScore::Difficult < FrictionScore::Failed);
    }
}