Skip to main content

maw/eval/
scenarios.rs

1//! Agent task scenarios and scoring rubric for UX friction evaluation.
2//!
3//! Each scenario tests a specific aspect of the agent experience with maw.
4//! Scenarios are designed to be run by real Claude agents against /tmp repos
5//! with no prior git knowledge — only directories, files, and JSON output.
6//!
7//! # Design Principles
8//!
9//! - **Non-leading prompts**: task descriptions tell agents *what* to do, not *how*.
10//! - **Objective scoring**: rubric uses observable metrics (errors, retries, confusion).
11//! - **Reproducible**: each scenario creates a deterministic initial state.
12//! - **Zero VCS knowledge**: agents never need to understand git or VCS concepts.
13
14use serde::{Deserialize, Serialize};
15
/// Target: average friction score across all scenarios must be ≤ this threshold.
///
/// `EvalReport::from_results` compares the computed average against this value
/// (inclusive) to decide whether the overall eval passed. Lower scores are better.
pub const TARGET_AVERAGE_SCORE: f64 = 1.5;
18
/// Total number of defined scenarios.
///
/// Also the fixed length of the array returned by `all_scenarios`, so adding a
/// scenario requires bumping this constant.
pub const SCENARIO_COUNT: usize = 5;
21
22// ---------------------------------------------------------------------------
23// Core types
24// ---------------------------------------------------------------------------
25
/// Unique identifier for a scenario.
///
/// There is one variant per scenario constructor in this file. The `S1`–`S5`
/// labels match the order in which `all_scenarios` returns the scenarios.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum ScenarioId {
    /// S1: basic single-agent lifecycle (create workspace, add a file, merge).
    BasicLifecycle,
    /// S2: multi-file edit in a single workspace.
    MultiFileEdit,
    /// S3: two-agent coordination (parallel workspaces, sequential merge).
    MultiAgent,
    /// S4: conflict detection and resolution.
    ConflictResolution,
    /// S5: read-only inspection (list workspaces, check status, read files).
    ReadOnlyInspection,
}
40
/// A complete scenario definition.
///
/// Uses `&'static` references for zero-allocation static definitions.
/// Only `Serialize` is derived — scenarios are defined in code, not loaded from files.
#[derive(Debug, Clone, Serialize)]
pub struct Scenario {
    /// Machine-readable id.
    pub id: ScenarioId,
    /// Human-readable short name (kebab-case in the scenarios defined here,
    /// e.g. `"basic-lifecycle"`).
    pub name: &'static str,
    /// What aspect of agent UX this scenario tests.
    pub tests: &'static str,
    /// Repository state that must exist before the agent starts.
    pub preconditions: Preconditions,
    /// The plain-English task prompt given to the agent.
    /// Must NOT mention git, branches, commits, or VCS concepts
    /// (the only tolerated mention is the "Do not use git" instruction).
    pub task_prompt: &'static str,
    /// Observable outcomes that determine success.
    pub expected_outcomes: &'static [&'static str],
    /// Steps the agent is expected to take (for scoring reference,
    /// not shown to the agent). Entries mix maw commands with plain file
    /// operations such as "write src/hello.rs (file operation)".
    pub expected_commands: &'static [&'static str],
    /// Number of steps an expert would need. In every scenario defined in
    /// this file it equals `expected_commands.len()`, so it counts file
    /// operations as well as maw commands.
    pub optimal_command_count: u32,
}
66
/// Preconditions describe the repo state before the scenario starts.
///
/// This is pure data: nothing in this type creates the state. A harness is
/// presumably expected to materialize it before handing the prompt to the agent.
#[derive(Debug, Clone, Serialize)]
pub struct Preconditions {
    /// Human-readable description of the initial repository state.
    pub repo_state: &'static str,
    /// Files that must exist in ws/default/ before the agent starts.
    pub seed_files: &'static [SeedFile],
    /// Workspaces that must exist (beyond default). Empty when the agent is
    /// expected to create every workspace itself.
    pub existing_workspaces: &'static [WorkspaceSetup],
}
77
/// A file to seed into the repo before the scenario starts.
///
/// Also used inside [`WorkspaceSetup`] to describe a file created or
/// overwritten in a pre-existing workspace.
#[derive(Debug, Clone, Serialize)]
pub struct SeedFile {
    /// Path relative to workspace root (e.g., "src/main.rs").
    pub path: &'static str,
    /// Full file contents, including the trailing newline where one is wanted.
    pub content: &'static str,
}
86
/// A workspace to pre-create with optional file modifications.
#[derive(Debug, Clone, Serialize)]
pub struct WorkspaceSetup {
    /// Workspace name.
    pub name: &'static str,
    /// Files to create or modify in this workspace. An empty slice describes a
    /// workspace with no modifications (e.g. the clean `bob` workspace in the
    /// read-only inspection scenario).
    pub files: &'static [SeedFile],
}
95
96// ---------------------------------------------------------------------------
97// Scoring
98// ---------------------------------------------------------------------------
99
/// Friction score on a 1–5 scale (lower is better).
///
/// Scoring is based on *observable agent behavior*, not subjective judgment.
/// The explicit `#[repr(u8)]` discriminants are the numeric values returned by
/// `FrictionScore::value` and used when averaging, so they must not be changed
/// casually. The derived `Ord` follows the same order (`Perfect` < … < `Failed`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
#[repr(u8)]
pub enum FrictionScore {
    /// Completed on first try with zero errors or retries.
    Perfect = 1,
    /// One recoverable error, self-corrected within 1 retry.
    Minor = 2,
    /// 2–3 retries or workarounds needed.
    Moderate = 3,
    /// Extensive trial-and-error (4+ retries or confusion markers).
    Difficult = 4,
    /// Could not complete the task.
    Failed = 5,
}
117
/// Raw metrics collected from an agent transcript.
///
/// `RunMetrics::score` reads only `goal_achieved`, `errors`, `retries`, and
/// `confusion_markers`; the remaining counters are recorded for reporting.
/// Note that the derived `Default` leaves `goal_achieved` as `false`, so a
/// default-constructed value scores as `FrictionScore::Failed`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct RunMetrics {
    /// Total tool invocations (Bash, Read, Write, Edit, etc.).
    pub tool_calls: u32,
    /// Maw commands specifically (subset of `tool_calls`).
    pub maw_commands: u32,
    /// Commands that returned non-zero exit codes.
    pub errors: u32,
    /// Repeated identical commands (sign of confusion).
    pub retries: u32,
    /// Confusion markers: "not sure", "let me try again", backtracking, etc.
    pub confusion_markers: u32,
    /// Whether the agent achieved all expected outcomes.
    pub goal_achieved: bool,
    /// Commands spent on recovery (after first error, not forward progress).
    pub recovery_steps: u32,
}
136
/// Result of scoring a single scenario run.
///
/// NOTE(review): this is a plain data record. Nothing visible here derives
/// `score` or `passed` from `metrics`, so whoever constructs it is presumably
/// responsible for keeping the three fields consistent.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScenarioResult {
    /// Which scenario was run.
    pub scenario_id: ScenarioId,
    /// Raw metrics from the run.
    pub metrics: RunMetrics,
    /// Computed friction score.
    pub score: FrictionScore,
    /// Whether this scenario passed (score ≤ 2).
    pub passed: bool,
}
149
/// Aggregate result across all scenarios.
///
/// Normally built with `EvalReport::from_results`, which fills in
/// `average_score` and `passed` from the individual results.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalReport {
    /// Individual scenario results.
    pub results: Vec<ScenarioResult>,
    /// Average friction score across all scenarios (arithmetic mean of the
    /// 1–5 score values).
    pub average_score: f64,
    /// Whether the overall eval passed (average ≤ `TARGET_AVERAGE_SCORE`).
    pub passed: bool,
}
160
161impl RunMetrics {
162    /// Compute the friction score from raw metrics.
163    #[must_use]
164    pub const fn score(&self) -> FrictionScore {
165        if !self.goal_achieved {
166            return FrictionScore::Failed;
167        }
168        if self.errors == 0 && self.retries == 0 && self.confusion_markers == 0 {
169            return FrictionScore::Perfect;
170        }
171        if self.errors <= 1 && self.retries <= 1 && self.confusion_markers <= 1 {
172            return FrictionScore::Minor;
173        }
174        if self.retries <= 3 && self.confusion_markers <= 3 {
175            return FrictionScore::Moderate;
176        }
177        FrictionScore::Difficult
178    }
179}
180
181impl FrictionScore {
182    /// Numeric value (1–5) for averaging.
183    #[must_use]
184    pub const fn value(self) -> u8 {
185        self as u8
186    }
187}
188
189impl EvalReport {
190    /// Build an eval report from individual scenario results.
191    #[must_use]
192    pub fn from_results(results: Vec<ScenarioResult>) -> Self {
193        let sum: u32 = results.iter().map(|r| u32::from(r.score.value())).sum();
194        let count = u32::try_from(results.len().max(1)).unwrap_or(u32::MAX);
195        let average_score = f64::from(sum) / f64::from(count);
196        let passed = average_score <= TARGET_AVERAGE_SCORE;
197        Self {
198            results,
199            average_score,
200            passed,
201        }
202    }
203}
204
205// ---------------------------------------------------------------------------
206// Scenario definitions
207// ---------------------------------------------------------------------------
208
209/// Return all 5 defined scenarios.
210#[must_use]
211pub const fn all_scenarios() -> [Scenario; SCENARIO_COUNT] {
212    [
213        scenario_basic_lifecycle(),
214        scenario_multi_file_edit(),
215        scenario_multi_agent(),
216        scenario_conflict_resolution(),
217        scenario_read_only_inspection(),
218    ]
219}
220
221/// S1: Basic single-agent lifecycle.
222///
223/// Tests the minimal happy path: create workspace, add a file, merge.
224/// This is the simplest possible agent interaction with maw.
225#[must_use]
226pub const fn scenario_basic_lifecycle() -> Scenario {
227    Scenario {
228        id: ScenarioId::BasicLifecycle,
229        name: "basic-lifecycle",
230        tests: "Minimal lifecycle: create workspace, add file, merge, verify",
231        preconditions: Preconditions {
232            repo_state: "Fresh maw repo with a seed Rust project (Cargo.toml, src/main.rs, src/lib.rs)",
233            seed_files: &[
234                SeedFile {
235                    path: "Cargo.toml",
236                    content: concat!(
237                        "[package]\n",
238                        "name = \"agent-eval\"\n",
239                        "version = \"0.1.0\"\n",
240                        "edition = \"2021\"\n",
241                        "\n",
242                        "[dependencies]\n",
243                    ),
244                },
245                SeedFile {
246                    path: "src/main.rs",
247                    content: "fn main() {\n    println!(\"hello from eval\");\n}\n",
248                },
249                SeedFile {
250                    path: "src/lib.rs",
251                    content: "pub fn add(a: i32, b: i32) -> i32 {\n    a + b\n}\n",
252                },
253            ],
254            existing_workspaces: &[],
255        },
256        task_prompt: concat!(
257            "You are working on a Rust project managed by maw.\n",
258            "\n",
259            "Task:\n",
260            "1. Create a workspace named \"agent-1\".\n",
261            "2. Add a new file src/hello.rs containing:\n",
262            "     pub fn hello() -> &'static str { \"hello\" }\n",
263            "3. Merge workspace agent-1 back (destroy the workspace after merge).\n",
264            "4. Confirm that src/hello.rs exists in the main workspace (ws/default/).\n",
265            "\n",
266            "Use only maw commands and file operations. Do not use git directly.\n",
267            "Use absolute paths for all file operations.\n",
268        ),
269        expected_outcomes: &[
270            "src/hello.rs exists in ws/default/ with correct content",
271            "workspace agent-1 no longer exists (destroyed)",
272            "no git commands were used by the agent",
273        ],
274        expected_commands: &[
275            "maw ws create agent-1",
276            "write src/hello.rs (file operation)",
277            "maw ws merge agent-1 --destroy",
278        ],
279        optimal_command_count: 3,
280    }
281}
282
283/// S2: Multi-file edit in a single workspace.
284///
285/// Tests editing multiple existing files and adding new ones, then merging.
286/// More realistic than S1 — agents commonly modify several files per task.
287#[must_use]
288pub const fn scenario_multi_file_edit() -> Scenario {
289    Scenario {
290        id: ScenarioId::MultiFileEdit,
291        name: "multi-file-edit",
292        tests: "Multiple file edits in one workspace: modify existing, add new, merge",
293        preconditions: Preconditions {
294            repo_state: "Rust project with src/main.rs, src/lib.rs, src/utils.rs",
295            seed_files: &[
296                SeedFile {
297                    path: "Cargo.toml",
298                    content: concat!(
299                        "[package]\n",
300                        "name = \"agent-eval\"\n",
301                        "version = \"0.1.0\"\n",
302                        "edition = \"2021\"\n",
303                        "\n",
304                        "[dependencies]\n",
305                    ),
306                },
307                SeedFile {
308                    path: "src/main.rs",
309                    content: concat!(
310                        "mod utils;\n",
311                        "\n",
312                        "fn main() {\n",
313                        "    let result = utils::format_greeting(\"world\");\n",
314                        "    println!(\"{result}\");\n",
315                        "}\n",
316                    ),
317                },
318                SeedFile {
319                    path: "src/lib.rs",
320                    content: "pub fn add(a: i32, b: i32) -> i32 {\n    a + b\n}\n",
321                },
322                SeedFile {
323                    path: "src/utils.rs",
324                    content: concat!(
325                        "pub fn format_greeting(name: &str) -> String {\n",
326                        "    format!(\"Hello, {name}!\")\n",
327                        "}\n",
328                    ),
329                },
330            ],
331            existing_workspaces: &[],
332        },
333        task_prompt: concat!(
334            "You are working on a Rust project managed by maw.\n",
335            "\n",
336            "Task:\n",
337            "1. Create a workspace named \"feature-work\".\n",
338            "2. In the workspace, make these changes:\n",
339            "   a. Modify src/lib.rs: add a new function `pub fn multiply(a: i32, b: i32) -> i32 { a * b }`\n",
340            "   b. Modify src/utils.rs: add a new function `pub fn format_farewell(name: &str) -> String { format!(\"Goodbye, {name}!\") }`\n",
341            "   c. Add a new file src/config.rs with: `pub const VERSION: &str = \"1.0.0\";`\n",
342            "3. Merge workspace feature-work back (destroy after merge).\n",
343            "4. Confirm all three changes are present in the main workspace.\n",
344            "\n",
345            "Use only maw commands and file operations. Do not use git directly.\n",
346            "Use absolute paths for all file operations.\n",
347        ),
348        expected_outcomes: &[
349            "src/lib.rs contains multiply function in ws/default/",
350            "src/utils.rs contains format_farewell function in ws/default/",
351            "src/config.rs exists with VERSION constant in ws/default/",
352            "workspace feature-work no longer exists",
353        ],
354        expected_commands: &[
355            "maw ws create feature-work",
356            "edit src/lib.rs (add multiply)",
357            "edit src/utils.rs (add format_farewell)",
358            "write src/config.rs",
359            "maw ws merge feature-work --destroy",
360        ],
361        optimal_command_count: 5,
362    }
363}
364
365/// S3: Two-agent coordination.
366///
367/// Tests the multi-workspace workflow: one workspace already has changes,
368/// agent creates a second workspace, edits different files, and merges both.
369/// This validates that agents can reason about parallel workspaces.
370#[must_use]
371pub const fn scenario_multi_agent() -> Scenario {
372    Scenario {
373        id: ScenarioId::MultiAgent,
374        name: "multi-agent",
375        tests: "Two workspaces with non-overlapping edits, sequential merge",
376        preconditions: Preconditions {
377            repo_state: concat!(
378                "Rust project with src/auth.rs, src/api.rs. ",
379                "Workspace 'agent-1' already exists with modifications to src/auth.rs.",
380            ),
381            seed_files: &[
382                SeedFile {
383                    path: "Cargo.toml",
384                    content: concat!(
385                        "[package]\n",
386                        "name = \"agent-eval\"\n",
387                        "version = \"0.1.0\"\n",
388                        "edition = \"2021\"\n",
389                        "\n",
390                        "[dependencies]\n",
391                    ),
392                },
393                SeedFile {
394                    path: "src/main.rs",
395                    content: "fn main() {}\n",
396                },
397                SeedFile {
398                    path: "src/auth.rs",
399                    content: concat!(
400                        "pub fn authenticate(user: &str) -> bool {\n",
401                        "    user == \"admin\"\n",
402                        "}\n",
403                    ),
404                },
405                SeedFile {
406                    path: "src/api.rs",
407                    content: concat!(
408                        "pub fn handle_request(path: &str) -> String {\n",
409                        "    format!(\"OK: {path}\")\n",
410                        "}\n",
411                    ),
412                },
413            ],
414            existing_workspaces: &[WorkspaceSetup {
415                name: "agent-1",
416                files: &[SeedFile {
417                    path: "src/auth.rs",
418                    content: concat!(
419                        "pub fn authenticate(user: &str) -> bool {\n",
420                        "    user == \"admin\" || user == \"root\"\n",
421                        "}\n",
422                        "\n",
423                        "pub fn is_admin(user: &str) -> bool {\n",
424                        "    user == \"admin\"\n",
425                        "}\n",
426                    ),
427                }],
428            }],
429        },
430        task_prompt: concat!(
431            "You are agent-2 working on a Rust project managed by maw.\n",
432            "Another agent (agent-1) has already made changes in a workspace named \"agent-1\".\n",
433            "Agent-1 modified src/auth.rs (you don't need to know the details).\n",
434            "\n",
435            "Task:\n",
436            "1. Create a workspace named \"agent-2\".\n",
437            "2. In your workspace, modify src/api.rs: add a new function\n",
438            "   `pub fn handle_error(code: u16) -> String { format!(\"Error: {code}\") }`\n",
439            "3. Merge BOTH workspaces (agent-1 and agent-2) back, destroying them.\n",
440            "4. Confirm that both sets of changes are present in the main workspace:\n",
441            "   - src/auth.rs should contain an is_admin function\n",
442            "   - src/api.rs should contain a handle_error function\n",
443            "\n",
444            "Use only maw commands and file operations. Do not use git directly.\n",
445            "Use absolute paths for all file operations.\n",
446        ),
447        expected_outcomes: &[
448            "src/auth.rs contains is_admin function in ws/default/",
449            "src/api.rs contains handle_error function in ws/default/",
450            "workspace agent-1 no longer exists",
451            "workspace agent-2 no longer exists",
452        ],
453        expected_commands: &[
454            "maw ws create agent-2",
455            "edit src/api.rs (add handle_error)",
456            "maw ws merge agent-1 --destroy",
457            "maw ws merge agent-2 --destroy",
458        ],
459        optimal_command_count: 4,
460    }
461}
462
463/// S4: Conflict detection and resolution.
464///
465/// Tests the conflict handling workflow: two workspaces modify the same file,
466/// merge produces a conflict, agent must resolve it. This is the hardest
467/// scenario and validates error message quality.
468#[must_use]
469pub const fn scenario_conflict_resolution() -> Scenario {
470    Scenario {
471        id: ScenarioId::ConflictResolution,
472        name: "conflict-resolution",
473        tests: "Same-file conflict: detect, inspect, resolve, merge",
474        preconditions: Preconditions {
475            repo_state: concat!(
476                "Rust project with src/lib.rs. Two workspaces (left, right) both modify src/lib.rs. ",
477                "Workspace 'left' changes the add function body. ",
478                "Workspace 'right' also changes the add function body differently.",
479            ),
480            seed_files: &[
481                SeedFile {
482                    path: "Cargo.toml",
483                    content: concat!(
484                        "[package]\n",
485                        "name = \"agent-eval\"\n",
486                        "version = \"0.1.0\"\n",
487                        "edition = \"2021\"\n",
488                        "\n",
489                        "[dependencies]\n",
490                    ),
491                },
492                SeedFile {
493                    path: "src/main.rs",
494                    content: "fn main() {}\n",
495                },
496                SeedFile {
497                    path: "src/lib.rs",
498                    content: concat!(
499                        "pub fn add(a: i32, b: i32) -> i32 {\n",
500                        "    a + b\n",
501                        "}\n",
502                    ),
503                },
504            ],
505            existing_workspaces: &[
506                WorkspaceSetup {
507                    name: "left",
508                    files: &[SeedFile {
509                        path: "src/lib.rs",
510                        content: concat!(
511                            "pub fn add(a: i32, b: i32) -> i32 {\n",
512                            "    let result = a + b;\n",
513                            "    assert!(result >= a.min(b), \"overflow\");\n",
514                            "    result\n",
515                            "}\n",
516                        ),
517                    }],
518                },
519                WorkspaceSetup {
520                    name: "right",
521                    files: &[SeedFile {
522                        path: "src/lib.rs",
523                        content: concat!(
524                            "pub fn add(a: i32, b: i32) -> i32 {\n",
525                            "    a.checked_add(b).expect(\"overflow\")\n",
526                            "}\n",
527                        ),
528                    }],
529                },
530            ],
531        },
532        task_prompt: concat!(
533            "You are working on a Rust project managed by maw.\n",
534            "Two workspaces ('left' and 'right') both modified src/lib.rs.\n",
535            "\n",
536            "Task:\n",
537            "1. Try merging workspace 'left' — this should succeed.\n",
538            "2. Try merging workspace 'right' — this may produce a conflict.\n",
539            "3. If there is a conflict:\n",
540            "   a. Inspect the conflicted file to understand both sides.\n",
541            "   b. Resolve the conflict by keeping the checked_add approach from 'right'\n",
542            "      (it's the safer implementation).\n",
543            "   c. Complete the merge.\n",
544            "4. Confirm src/lib.rs in the main workspace uses checked_add.\n",
545            "5. Destroy both workspaces if not already destroyed.\n",
546            "\n",
547            "Use only maw commands and file operations. Do not use git directly.\n",
548            "Use absolute paths for all file operations.\n",
549        ),
550        expected_outcomes: &[
551            "src/lib.rs in ws/default/ uses checked_add",
552            "workspace left no longer exists",
553            "workspace right no longer exists",
554            "conflict was detected and resolved (not silently dropped)",
555        ],
556        expected_commands: &[
557            "maw ws merge left --destroy",
558            "maw ws merge right (conflict detected)",
559            "read/inspect conflicted file",
560            "edit src/lib.rs (resolve conflict)",
561            "maw ws merge right --destroy (or complete merge)",
562        ],
563        optimal_command_count: 5,
564    }
565}
566
567/// S5: Read-only inspection.
568///
569/// Tests the observation/inspection workflow: list workspaces, check status,
570/// get history. This validates that agents can gather information without
571/// modifying state — essential for debugging and coordination.
572#[must_use]
573pub const fn scenario_read_only_inspection() -> Scenario {
574    Scenario {
575        id: ScenarioId::ReadOnlyInspection,
576        name: "read-only-inspection",
577        tests: "Inspect workspace state: list, status, files — without modifying anything",
578        preconditions: Preconditions {
579            repo_state: concat!(
580                "Rust project with two active workspaces. ",
581                "Workspace 'alice' has modified src/lib.rs (dirty). ",
582                "Workspace 'bob' has no modifications (clean).",
583            ),
584            seed_files: &[
585                SeedFile {
586                    path: "Cargo.toml",
587                    content: concat!(
588                        "[package]\n",
589                        "name = \"agent-eval\"\n",
590                        "version = \"0.1.0\"\n",
591                        "edition = \"2021\"\n",
592                        "\n",
593                        "[dependencies]\n",
594                    ),
595                },
596                SeedFile {
597                    path: "src/main.rs",
598                    content: "fn main() {}\n",
599                },
600                SeedFile {
601                    path: "src/lib.rs",
602                    content: "pub fn add(a: i32, b: i32) -> i32 {\n    a + b\n}\n",
603                },
604            ],
605            existing_workspaces: &[
606                WorkspaceSetup {
607                    name: "alice",
608                    files: &[SeedFile {
609                        path: "src/lib.rs",
610                        content: concat!(
611                            "pub fn add(a: i32, b: i32) -> i32 {\n",
612                            "    a + b\n",
613                            "}\n",
614                            "\n",
615                            "pub fn subtract(a: i32, b: i32) -> i32 {\n",
616                            "    a - b\n",
617                            "}\n",
618                        ),
619                    }],
620                },
621                WorkspaceSetup {
622                    name: "bob",
623                    files: &[],
624                },
625            ],
626        },
627        task_prompt: concat!(
628            "You are a lead agent inspecting the state of a Rust project managed by maw.\n",
629            "DO NOT modify any files or merge anything — this is read-only inspection.\n",
630            "\n",
631            "Task:\n",
632            "1. List all workspaces. Report how many exist and their names.\n",
633            "2. Check the status of workspace 'alice'. Report whether it has dirty files.\n",
634            "3. Check the status of workspace 'bob'. Report whether it has dirty files.\n",
635            "4. Read the file src/lib.rs in alice's workspace. Report what function(s) it contains.\n",
636            "5. Read the file src/lib.rs in bob's workspace. Report what function(s) it contains.\n",
637            "\n",
638            "Output your findings as a structured summary.\n",
639            "\n",
640            "Use only maw commands and file read operations. Do not use git directly.\n",
641            "Do NOT merge, create, or destroy any workspaces.\n",
642        ),
643        expected_outcomes: &[
644            "agent lists 3 workspaces: default, alice, bob",
645            "agent reports alice has dirty files (src/lib.rs modified)",
646            "agent reports bob has no dirty files (clean)",
647            "agent reports alice's lib.rs has add and subtract functions",
648            "agent reports bob's lib.rs has only the add function",
649            "no workspaces were created, destroyed, or merged",
650        ],
651        expected_commands: &[
652            "maw ws list",
653            "maw ws status alice",
654            "maw ws status bob",
655            "read ws/alice/src/lib.rs",
656            "read ws/bob/src/lib.rs",
657        ],
658        optimal_command_count: 5,
659    }
660}
661
662// ---------------------------------------------------------------------------
663// Tests
664// ---------------------------------------------------------------------------
665
666#[cfg(test)]
667#[allow(clippy::all, clippy::pedantic, clippy::nursery)]
668mod tests {
669    use super::*;
670
671    #[test]
672    fn all_scenarios_returns_five() {
673        assert_eq!(all_scenarios().len(), SCENARIO_COUNT);
674    }
675
676    #[test]
677    fn scenario_ids_are_unique() {
678        let scenarios = all_scenarios();
679        let mut ids: Vec<_> = scenarios.iter().map(|s| s.id).collect();
680        let original_len = ids.len();
681        ids.dedup();
682        assert_eq!(ids.len(), original_len, "duplicate scenario IDs detected");
683    }
684
685    #[test]
686    fn scenario_names_are_unique() {
687        let scenarios = all_scenarios();
688        let mut names: Vec<_> = scenarios.iter().map(|s| s.name).collect();
689        let original_len = names.len();
690        names.sort_unstable();
691        names.dedup();
692        assert_eq!(
693            names.len(),
694            original_len,
695            "duplicate scenario names detected"
696        );
697    }
698
699    #[test]
700    fn each_scenario_has_nonempty_fields() {
701        for s in &all_scenarios() {
702            assert!(!s.name.is_empty(), "{:?} has empty name", s.id);
703            assert!(
704                !s.tests.is_empty(),
705                "{:?} has empty tests description",
706                s.id
707            );
708            assert!(
709                !s.task_prompt.is_empty(),
710                "{:?} has empty task_prompt",
711                s.id
712            );
713            assert!(
714                !s.expected_outcomes.is_empty(),
715                "{:?} has no expected outcomes",
716                s.id
717            );
718            assert!(
719                !s.expected_commands.is_empty(),
720                "{:?} has no expected commands",
721                s.id
722            );
723            assert!(
724                s.optimal_command_count > 0,
725                "{:?} has zero optimal command count",
726                s.id
727            );
728        }
729    }
730
731    #[test]
732    fn each_scenario_has_seed_files() {
733        for s in &all_scenarios() {
734            assert!(
735                !s.preconditions.seed_files.is_empty(),
736                "{:?} has no seed files",
737                s.id
738            );
739        }
740    }
741
742    #[test]
743    fn conflict_scenario_has_two_workspaces() {
744        let conflict = scenario_conflict_resolution();
745        assert_eq!(
746            conflict.preconditions.existing_workspaces.len(),
747            2,
748            "conflict scenario must have exactly 2 pre-existing workspaces"
749        );
750    }
751
752    #[test]
753    fn read_only_scenario_has_two_workspaces() {
754        let readonly = scenario_read_only_inspection();
755        assert_eq!(
756            readonly.preconditions.existing_workspaces.len(),
757            2,
758            "read-only scenario must have exactly 2 pre-existing workspaces"
759        );
760    }
761
762    #[test]
763    fn conflict_and_readonly_scenarios_present() {
764        let scenarios = all_scenarios();
765        let has_conflict = scenarios
766            .iter()
767            .any(|s| s.id == ScenarioId::ConflictResolution);
768        let has_readonly = scenarios
769            .iter()
770            .any(|s| s.id == ScenarioId::ReadOnlyInspection);
771        assert!(has_conflict, "must have a conflict handling scenario");
772        assert!(has_readonly, "must have a read-only inspection scenario");
773    }
774
775    #[test]
776    fn task_prompts_do_not_mention_vcs() {
777        let forbidden = ["git ", "branch", "commit", "checkout", "rebase"];
778        for s in &all_scenarios() {
779            for word in &forbidden {
780                // Allow "git" only in the "Do not use git" instruction
781                let prompt_lower = s.task_prompt.to_lowercase();
782                let occurrences: Vec<_> = prompt_lower.match_indices(word).collect();
783                for (idx, _) in &occurrences {
784                    // Check context: it's OK if it's in "do not use git"
785                    let start = idx.saturating_sub(20);
786                    let context = &prompt_lower[start..prompt_lower.len().min(idx + 30)];
787                    assert!(
788                        context.contains("do not use") || context.contains("don't use"),
789                        "{:?} task prompt mentions '{}' outside of prohibition context: ...{}...",
790                        s.id,
791                        word.trim(),
792                        context,
793                    );
794                }
795            }
796        }
797    }
798
799    #[test]
800    fn target_threshold_is_encoded() {
801        assert!(
802            TARGET_AVERAGE_SCORE > 0.0 && TARGET_AVERAGE_SCORE <= 5.0,
803            "target must be between 0 and 5"
804        );
805        // The bead specifies ≤ 1.5
806        assert!(
807            (TARGET_AVERAGE_SCORE - 1.5).abs() < f64::EPSILON,
808            "target must be 1.5 per bead spec"
809        );
810    }
811
812    // --- Scoring tests ---
813
814    #[test]
815    fn perfect_run_scores_1() {
816        let metrics = RunMetrics {
817            tool_calls: 5,
818            maw_commands: 3,
819            errors: 0,
820            retries: 0,
821            confusion_markers: 0,
822            goal_achieved: true,
823            recovery_steps: 0,
824        };
825        assert_eq!(metrics.score(), FrictionScore::Perfect);
826        assert_eq!(metrics.score().value(), 1);
827    }
828
829    #[test]
830    fn minor_error_scores_2() {
831        let metrics = RunMetrics {
832            tool_calls: 7,
833            maw_commands: 4,
834            errors: 1,
835            retries: 0,
836            confusion_markers: 0,
837            goal_achieved: true,
838            recovery_steps: 1,
839        };
840        assert_eq!(metrics.score(), FrictionScore::Minor);
841        assert_eq!(metrics.score().value(), 2);
842    }
843
844    #[test]
845    fn moderate_difficulty_scores_3() {
846        let metrics = RunMetrics {
847            tool_calls: 12,
848            maw_commands: 6,
849            errors: 2,
850            retries: 3,
851            confusion_markers: 2,
852            goal_achieved: true,
853            recovery_steps: 4,
854        };
855        assert_eq!(metrics.score(), FrictionScore::Moderate);
856        assert_eq!(metrics.score().value(), 3);
857    }
858
859    #[test]
860    fn difficult_scores_4() {
861        let metrics = RunMetrics {
862            tool_calls: 20,
863            maw_commands: 10,
864            errors: 5,
865            retries: 6,
866            confusion_markers: 4,
867            goal_achieved: true,
868            recovery_steps: 8,
869        };
870        assert_eq!(metrics.score(), FrictionScore::Difficult);
871        assert_eq!(metrics.score().value(), 4);
872    }
873
874    #[test]
875    fn failed_run_scores_5() {
876        let metrics = RunMetrics {
877            tool_calls: 15,
878            maw_commands: 8,
879            errors: 3,
880            retries: 2,
881            confusion_markers: 1,
882            goal_achieved: false,
883            recovery_steps: 5,
884        };
885        assert_eq!(metrics.score(), FrictionScore::Failed);
886        assert_eq!(metrics.score().value(), 5);
887    }
888
889    #[test]
890    fn eval_report_passes_below_threshold() {
891        let results = vec![
892            ScenarioResult {
893                scenario_id: ScenarioId::BasicLifecycle,
894                metrics: RunMetrics {
895                    goal_achieved: true,
896                    ..Default::default()
897                },
898                score: FrictionScore::Perfect,
899                passed: true,
900            },
901            ScenarioResult {
902                scenario_id: ScenarioId::MultiFileEdit,
903                metrics: RunMetrics {
904                    goal_achieved: true,
905                    errors: 1,
906                    ..Default::default()
907                },
908                score: FrictionScore::Minor,
909                passed: true,
910            },
911        ];
912        let report = EvalReport::from_results(results);
913        assert_eq!(report.average_score, 1.5);
914        assert!(report.passed);
915    }
916
917    #[test]
918    fn eval_report_fails_above_threshold() {
919        let results = vec![
920            ScenarioResult {
921                scenario_id: ScenarioId::BasicLifecycle,
922                metrics: RunMetrics {
923                    goal_achieved: true,
924                    ..Default::default()
925                },
926                score: FrictionScore::Perfect,
927                passed: true,
928            },
929            ScenarioResult {
930                scenario_id: ScenarioId::ConflictResolution,
931                metrics: RunMetrics {
932                    goal_achieved: true,
933                    errors: 3,
934                    retries: 4,
935                    confusion_markers: 5,
936                    ..Default::default()
937                },
938                score: FrictionScore::Difficult,
939                passed: false,
940            },
941        ];
942        let report = EvalReport::from_results(results);
943        assert_eq!(report.average_score, 2.5);
944        assert!(!report.passed);
945    }
946
947    #[test]
948    fn scenarios_serialize_to_json() {
949        let scenarios = all_scenarios();
950        for s in &scenarios {
951            let json = serde_json::to_string(s).expect("scenario should serialize");
952            assert!(!json.is_empty());
953            // Verify it's valid JSON by parsing to Value
954            let value: serde_json::Value =
955                serde_json::from_str(&json).expect("should be valid JSON");
956            assert!(value.is_object(), "scenario JSON should be an object");
957            assert!(
958                value.get("id").is_some(),
959                "{:?} JSON missing 'id' field",
960                s.id
961            );
962            assert!(
963                value.get("task_prompt").is_some(),
964                "{:?} JSON missing 'task_prompt' field",
965                s.id
966            );
967        }
968    }
969
970    #[test]
971    fn scoring_rubric_is_monotonic() {
972        // Scores 1..5 must be strictly ordered
973        assert!(FrictionScore::Perfect < FrictionScore::Minor);
974        assert!(FrictionScore::Minor < FrictionScore::Moderate);
975        assert!(FrictionScore::Moderate < FrictionScore::Difficult);
976        assert!(FrictionScore::Difficult < FrictionScore::Failed);
977    }
978}