// Source: ralph_core/task_definition.rs

//! Task definition types for the benchmark harness.
//!
//! Defines the JSON schema for benchmark tasks, including setup, verification,
//! and metrics collection. Tasks run in isolated workspaces with their own
//! `.git` directories to avoid polluting the main repository.
//!
//! # Example
//!
//! ```
//! use ralph_core::task_definition::{TaskDefinition, TaskSuite, Verification};
//!
//! let task = TaskDefinition::builder("hello-world", "tasks/hello-world/PROMPT.md", "TASK_COMPLETE")
//!     .verification_command("python hello.py | grep -q 'Hello, World!'")
//!     .max_iterations(5)
//!     .expected_iterations(1)
//!     .complexity("simple")
//!     .build();
//!
//! assert_eq!(task.name, "hello-world");
//! assert!(task.verification.command.contains("Hello, World!"));
//! ```

use serde::{Deserialize, Serialize};
use std::collections::HashSet;
use std::path::Path;

26/// A suite of benchmark tasks loaded from a JSON file.
27///
28/// The suite contains multiple tasks that can be run sequentially during
29/// batch benchmarking.
30#[derive(Debug, Clone, Serialize, Deserialize)]
31pub struct TaskSuite {
32    /// List of task definitions.
33    pub tasks: Vec<TaskDefinition>,
34
35    /// Optional suite-level metadata.
36    #[serde(default)]
37    pub metadata: SuiteMetadata,
38}
39
40impl TaskSuite {
41    /// Loads a task suite from a JSON file.
42    pub fn from_file(path: impl AsRef<Path>) -> Result<Self, TaskDefinitionError> {
43        let path_ref = path.as_ref();
44        let content = std::fs::read_to_string(path_ref)?;
45        let suite: Self = serde_json::from_str(&content)?;
46        suite.validate()?;
47        Ok(suite)
48    }
49
50    /// Validates all tasks in the suite.
51    pub fn validate(&self) -> Result<(), TaskDefinitionError> {
52        if self.tasks.is_empty() {
53            return Err(TaskDefinitionError::Validation(
54                "Task suite must contain at least one task".to_string(),
55            ));
56        }
57
58        for task in &self.tasks {
59            task.validate()?;
60        }
61
62        // Check for duplicate names
63        let mut names = std::collections::HashSet::new();
64        for task in &self.tasks {
65            if !names.insert(&task.name) {
66                return Err(TaskDefinitionError::Validation(format!(
67                    "Duplicate task name: '{}'",
68                    task.name
69                )));
70            }
71        }
72
73        Ok(())
74    }
75
76    /// Returns tasks filtered by complexity level.
77    pub fn filter_by_complexity(&self, complexity: &str) -> Vec<&TaskDefinition> {
78        self.tasks
79            .iter()
80            .filter(|t| t.complexity == complexity)
81            .collect()
82    }
83
84    /// Returns tasks filtered by tag.
85    pub fn filter_by_tag(&self, tag: &str) -> Vec<&TaskDefinition> {
86        self.tasks
87            .iter()
88            .filter(|t| t.tags.iter().any(|t| t == tag))
89            .collect()
90    }
91}
92
93/// Suite-level metadata.
94#[derive(Debug, Clone, Default, Serialize, Deserialize)]
95pub struct SuiteMetadata {
96    /// Optional suite name.
97    pub name: Option<String>,
98
99    /// Optional description.
100    pub description: Option<String>,
101
102    /// Suite version.
103    pub version: Option<String>,
104}
105
106/// A single benchmark task definition.
107///
108/// Tasks define what the agent should accomplish, how to verify success,
109/// and optional setup requirements. Each task runs in an isolated workspace.
110#[derive(Debug, Clone, Serialize, Deserialize)]
111pub struct TaskDefinition {
112    // ─────────────────────────────────────────────────────────────────────────
113    // REQUIRED FIELDS
114    // ─────────────────────────────────────────────────────────────────────────
115
116    /// Unique task identifier (alphanumeric + hyphens).
117    ///
118    /// Used for recording filenames and result reporting.
119    pub name: String,
120
121    /// Path to the prompt markdown file.
122    ///
123    /// Relative to the task suite file or absolute path.
124    pub prompt_file: String,
125
126    /// String the agent outputs when task is complete.
127    ///
128    /// This is detected by the orchestration loop to terminate the task.
129    pub completion_promise: String,
130
131    /// Verification configuration for confirming task success.
132    pub verification: Verification,
133
134    // ─────────────────────────────────────────────────────────────────────────
135    // OPTIONAL FIELDS
136    // ─────────────────────────────────────────────────────────────────────────
137
138    /// Human-readable description of the task.
139    #[serde(default)]
140    pub description: Option<String>,
141
142    /// Task complexity level: "simple", "medium", or "complex".
143    ///
144    /// Used for filtering and baseline comparisons.
145    #[serde(default = "default_complexity")]
146    pub complexity: String,
147
148    /// Maximum iterations before the task is considered failed.
149    ///
150    /// Safety limit to prevent runaway loops.
151    #[serde(default = "default_max_iterations")]
152    pub max_iterations: u32,
153
154    /// Expected number of iterations for baseline comparison.
155    ///
156    /// Used to calculate `iteration_delta` in results.
157    #[serde(default)]
158    pub expected_iterations: Option<u32>,
159
160    /// Timeout in seconds for the entire task.
161    #[serde(default = "default_timeout_seconds")]
162    pub timeout_seconds: u64,
163
164    /// Setup configuration for the task workspace.
165    #[serde(default)]
166    pub setup: TaskSetup,
167
168    /// Tags for filtering and categorization.
169    #[serde(default)]
170    pub tags: Vec<String>,
171}
172
/// Complexity used when a task does not specify one: `"medium"`.
///
/// Referenced by the serde `default` attribute on the `complexity` field and
/// by the builder's initial state.
fn default_complexity() -> String {
    "medium".to_string()
}

/// Iteration cap used when a task does not specify one: 100.
///
/// Referenced by the serde `default` attribute on the `max_iterations` field
/// and by the builder's initial state.
fn default_max_iterations() -> u32 {
    100
}

/// Task timeout used when a task does not specify one: 300 seconds.
///
/// Referenced by the serde `default` attribute on the `timeout_seconds` field
/// and by the builder's initial state.
fn default_timeout_seconds() -> u64 {
    300 // 5 minutes
}

185impl TaskDefinition {
186    /// Creates a builder for constructing task definitions.
187    pub fn builder(
188        name: impl Into<String>,
189        prompt_file: impl Into<String>,
190        completion_promise: impl Into<String>,
191    ) -> TaskDefinitionBuilder {
192        TaskDefinitionBuilder::new(name, prompt_file, completion_promise)
193    }
194
195    /// Validates the task definition.
196    pub fn validate(&self) -> Result<(), TaskDefinitionError> {
197        // Validate name format (alphanumeric + hyphens)
198        if self.name.is_empty() {
199            return Err(TaskDefinitionError::MissingField("name".to_string()));
200        }
201
202        if !self
203            .name
204            .chars()
205            .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
206        {
207            return Err(TaskDefinitionError::Validation(format!(
208                "Task name '{}' contains invalid characters. Use alphanumeric, hyphens, or underscores only.",
209                self.name
210            )));
211        }
212
213        // Validate prompt_file is not empty
214        if self.prompt_file.is_empty() {
215            return Err(TaskDefinitionError::MissingField("prompt_file".to_string()));
216        }
217
218        // Validate completion_promise is not empty
219        if self.completion_promise.is_empty() {
220            return Err(TaskDefinitionError::MissingField(
221                "completion_promise".to_string(),
222            ));
223        }
224
225        // Validate verification command is not empty
226        if self.verification.command.is_empty() {
227            return Err(TaskDefinitionError::MissingField(
228                "verification.command".to_string(),
229            ));
230        }
231
232        // Validate complexity is valid
233        if !["simple", "medium", "complex"].contains(&self.complexity.as_str()) {
234            return Err(TaskDefinitionError::Validation(format!(
235                "Invalid complexity '{}'. Must be one of: simple, medium, complex",
236                self.complexity
237            )));
238        }
239
240        Ok(())
241    }
242
243    /// Returns the iteration delta if expected_iterations is set.
244    ///
245    /// `delta = actual - expected` (positive means took more iterations)
246    pub fn iteration_delta(&self, actual: u32) -> Option<i32> {
247        self.expected_iterations
248            .map(|expected| actual as i32 - expected as i32)
249    }
250}
251
252/// Builder for constructing task definitions.
253pub struct TaskDefinitionBuilder {
254    name: String,
255    prompt_file: String,
256    completion_promise: String,
257    verification: Verification,
258    description: Option<String>,
259    complexity: String,
260    max_iterations: u32,
261    expected_iterations: Option<u32>,
262    timeout_seconds: u64,
263    setup: TaskSetup,
264    tags: Vec<String>,
265}
266
267impl TaskDefinitionBuilder {
268    /// Creates a new builder with required fields.
269    pub fn new(
270        name: impl Into<String>,
271        prompt_file: impl Into<String>,
272        completion_promise: impl Into<String>,
273    ) -> Self {
274        Self {
275            name: name.into(),
276            prompt_file: prompt_file.into(),
277            completion_promise: completion_promise.into(),
278            verification: Verification::default(),
279            description: None,
280            complexity: default_complexity(),
281            max_iterations: default_max_iterations(),
282            expected_iterations: None,
283            timeout_seconds: default_timeout_seconds(),
284            setup: TaskSetup::default(),
285            tags: Vec::new(),
286        }
287    }
288
289    /// Sets the verification command.
290    pub fn verification_command(mut self, command: impl Into<String>) -> Self {
291        self.verification.command = command.into();
292        self
293    }
294
295    /// Sets the verification success exit code.
296    pub fn verification_exit_code(mut self, code: i32) -> Self {
297        self.verification.success_exit_code = code;
298        self
299    }
300
301    /// Sets the full verification configuration.
302    pub fn verification(mut self, verification: Verification) -> Self {
303        self.verification = verification;
304        self
305    }
306
307    /// Sets the task description.
308    pub fn description(mut self, description: impl Into<String>) -> Self {
309        self.description = Some(description.into());
310        self
311    }
312
313    /// Sets the complexity level.
314    pub fn complexity(mut self, complexity: impl Into<String>) -> Self {
315        self.complexity = complexity.into();
316        self
317    }
318
319    /// Sets the maximum iterations.
320    pub fn max_iterations(mut self, max: u32) -> Self {
321        self.max_iterations = max;
322        self
323    }
324
325    /// Sets the expected iterations for baseline comparison.
326    pub fn expected_iterations(mut self, expected: u32) -> Self {
327        self.expected_iterations = Some(expected);
328        self
329    }
330
331    /// Sets the timeout in seconds.
332    pub fn timeout_seconds(mut self, seconds: u64) -> Self {
333        self.timeout_seconds = seconds;
334        self
335    }
336
337    /// Sets the setup configuration.
338    pub fn setup(mut self, setup: TaskSetup) -> Self {
339        self.setup = setup;
340        self
341    }
342
343    /// Sets the setup script.
344    pub fn setup_script(mut self, script: impl Into<String>) -> Self {
345        self.setup.script = Some(script.into());
346        self
347    }
348
349    /// Sets the setup files.
350    pub fn setup_files(mut self, files: Vec<String>) -> Self {
351        self.setup.files = files;
352        self
353    }
354
355    /// Adds tags.
356    pub fn tags(mut self, tags: Vec<String>) -> Self {
357        self.tags = tags;
358        self
359    }
360
361    /// Adds a single tag.
362    pub fn tag(mut self, tag: impl Into<String>) -> Self {
363        self.tags.push(tag.into());
364        self
365    }
366
367    /// Builds the task definition.
368    pub fn build(self) -> TaskDefinition {
369        TaskDefinition {
370            name: self.name,
371            prompt_file: self.prompt_file,
372            completion_promise: self.completion_promise,
373            verification: self.verification,
374            description: self.description,
375            complexity: self.complexity,
376            max_iterations: self.max_iterations,
377            expected_iterations: self.expected_iterations,
378            timeout_seconds: self.timeout_seconds,
379            setup: self.setup,
380            tags: self.tags,
381        }
382    }
383}
384
385/// Verification configuration for a task.
386#[derive(Debug, Clone, Default, Serialize, Deserialize)]
387pub struct Verification {
388    /// Bash command to verify task success.
389    ///
390    /// Runs in the task workspace after completion promise is detected.
391    #[serde(default)]
392    pub command: String,
393
394    /// Exit code that indicates success (default: 0).
395    #[serde(default)]
396    pub success_exit_code: i32,
397}
398
399impl Verification {
400    /// Creates a new verification with the given command.
401    pub fn new(command: impl Into<String>) -> Self {
402        Self {
403            command: command.into(),
404            success_exit_code: 0,
405        }
406    }
407
408    /// Creates a verification that expects a non-zero exit code.
409    pub fn expect_failure(command: impl Into<String>, exit_code: i32) -> Self {
410        Self {
411            command: command.into(),
412            success_exit_code: exit_code,
413        }
414    }
415}
416
417/// Setup configuration for task workspace.
418#[derive(Debug, Clone, Default, Serialize, Deserialize)]
419pub struct TaskSetup {
420    /// Script to run before the task starts.
421    ///
422    /// Executed in the task workspace directory.
423    #[serde(default)]
424    pub script: Option<String>,
425
426    /// Files to copy to the task workspace.
427    ///
428    /// Paths relative to the task suite file.
429    #[serde(default)]
430    pub files: Vec<String>,
431}
432
433impl TaskSetup {
434    /// Returns true if there is any setup to perform.
435    pub fn has_setup(&self) -> bool {
436        self.script.is_some() || !self.files.is_empty()
437    }
438}
439
440/// Errors that can occur when working with task definitions.
441#[derive(Debug, thiserror::Error)]
442pub enum TaskDefinitionError {
443    /// IO error reading task file.
444    #[error("IO error: {0}")]
445    Io(#[from] std::io::Error),
446
447    /// JSON parse error.
448    #[error("JSON parse error: {0}")]
449    Json(#[from] serde_json::Error),
450
451    /// Missing required field.
452    #[error("Missing required field: {0}")]
453    MissingField(String),
454
455    /// Validation error.
456    #[error("Validation error: {0}")]
457    Validation(String),
458}
459
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_task_definition_builder() {
        let task = TaskDefinition::builder("hello-world", "tasks/hello.md", "TASK_COMPLETE")
            .verification_command("python hello.py | grep -q 'Hello, World!'")
            .description("Create a hello world script")
            .complexity("simple")
            .max_iterations(5)
            .expected_iterations(1)
            .tag("python")
            .build();

        assert_eq!(task.name, "hello-world");
        assert_eq!(task.prompt_file, "tasks/hello.md");
        assert_eq!(task.completion_promise, "TASK_COMPLETE");
        assert!(task.verification.command.contains("Hello, World!"));
        assert_eq!(task.complexity, "simple");
        assert_eq!(task.max_iterations, 5);
        assert_eq!(task.expected_iterations, Some(1));
        assert!(task.tags.contains(&"python".to_string()));
    }

    #[test]
    fn test_task_definition_defaults() {
        let task = TaskDefinition::builder("test", "prompt.md", "DONE")
            .verification_command("echo ok")
            .build();

        assert_eq!(task.complexity, "medium");
        assert_eq!(task.max_iterations, 100);
        assert_eq!(task.timeout_seconds, 300);
        assert!(task.expected_iterations.is_none());
        assert!(task.tags.is_empty());
    }

    #[test]
    fn test_task_validation_valid() {
        let task = TaskDefinition::builder("valid-task", "prompt.md", "DONE")
            .verification_command("echo ok")
            .build();

        assert!(task.validate().is_ok());
    }

    // Underscores are accepted alongside hyphens in task names.
    #[test]
    fn test_task_validation_underscore_name() {
        let task = TaskDefinition::builder("valid_task-01", "prompt.md", "DONE")
            .verification_command("echo ok")
            .build();

        assert!(task.validate().is_ok());
    }

    #[test]
    fn test_task_validation_invalid_name() {
        let task = TaskDefinition::builder("invalid task name!", "prompt.md", "DONE")
            .verification_command("echo ok")
            .build();

        let err = task.validate().unwrap_err();
        assert!(matches!(err, TaskDefinitionError::Validation(_)));
    }

    #[test]
    fn test_task_validation_empty_prompt() {
        let task = TaskDefinition::builder("test", "", "DONE")
            .verification_command("echo ok")
            .build();

        let err = task.validate().unwrap_err();
        assert!(matches!(err, TaskDefinitionError::MissingField(f) if f == "prompt_file"));
    }

    #[test]
    fn test_task_validation_empty_verification() {
        let task = TaskDefinition::builder("test", "prompt.md", "DONE").build();

        let err = task.validate().unwrap_err();
        assert!(matches!(err, TaskDefinitionError::MissingField(f) if f == "verification.command"));
    }

    #[test]
    fn test_task_validation_invalid_complexity() {
        let task = TaskDefinition::builder("test", "prompt.md", "DONE")
            .verification_command("echo ok")
            .complexity("invalid")
            .build();

        let err = task.validate().unwrap_err();
        assert!(matches!(err, TaskDefinitionError::Validation(_)));
    }

    #[test]
    fn test_iteration_delta() {
        let task = TaskDefinition::builder("test", "prompt.md", "DONE")
            .verification_command("echo ok")
            .expected_iterations(5)
            .build();

        // Took fewer iterations than expected
        assert_eq!(task.iteration_delta(3), Some(-2));

        // Took more iterations than expected
        assert_eq!(task.iteration_delta(7), Some(2));

        // Took exactly expected
        assert_eq!(task.iteration_delta(5), Some(0));
    }

    #[test]
    fn test_iteration_delta_no_expected() {
        let task = TaskDefinition::builder("test", "prompt.md", "DONE")
            .verification_command("echo ok")
            .build();

        assert!(task.iteration_delta(5).is_none());
    }

    #[test]
    fn test_task_suite_parse() {
        let json = r#"{
            "tasks": [
                {
                    "name": "hello-world",
                    "prompt_file": "tasks/hello/PROMPT.md",
                    "completion_promise": "TASK_COMPLETE",
                    "verification": {
                        "command": "python hello.py | grep -q 'Hello, World!'"
                    },
                    "complexity": "simple",
                    "max_iterations": 5,
                    "expected_iterations": 1
                },
                {
                    "name": "fizzbuzz-tdd",
                    "description": "Implement FizzBuzz with TDD",
                    "prompt_file": "tasks/fizzbuzz/PROMPT.md",
                    "completion_promise": "TESTS_PASSING",
                    "verification": {
                        "command": "pytest test_fizzbuzz.py -v"
                    },
                    "complexity": "medium",
                    "max_iterations": 15,
                    "expected_iterations": 5,
                    "setup": {
                        "files": ["test_fizzbuzz.py"]
                    },
                    "tags": ["python", "tdd"]
                }
            ],
            "metadata": {
                "name": "Ralph Benchmark Suite",
                "version": "1.0.0"
            }
        }"#;

        let suite: TaskSuite = serde_json::from_str(json).unwrap();
        assert_eq!(suite.tasks.len(), 2);

        let hello = &suite.tasks[0];
        assert_eq!(hello.name, "hello-world");
        assert_eq!(hello.complexity, "simple");
        assert_eq!(hello.max_iterations, 5);
        assert_eq!(hello.expected_iterations, Some(1));

        let fizzbuzz = &suite.tasks[1];
        assert_eq!(fizzbuzz.name, "fizzbuzz-tdd");
        assert!(fizzbuzz.description.is_some());
        assert_eq!(fizzbuzz.setup.files.len(), 1);
        assert!(fizzbuzz.tags.contains(&"tdd".to_string()));

        assert_eq!(suite.metadata.name, Some("Ralph Benchmark Suite".to_string()));
    }

    #[test]
    fn test_task_suite_validation_empty() {
        let suite = TaskSuite {
            tasks: vec![],
            metadata: SuiteMetadata::default(),
        };

        let err = suite.validate().unwrap_err();
        assert!(matches!(err, TaskDefinitionError::Validation(_)));
    }

    #[test]
    fn test_task_suite_validation_duplicates() {
        let task = TaskDefinition::builder("duplicate", "prompt.md", "DONE")
            .verification_command("echo ok")
            .build();

        let suite = TaskSuite {
            tasks: vec![task.clone(), task],
            metadata: SuiteMetadata::default(),
        };

        let err = suite.validate().unwrap_err();
        assert!(err.to_string().contains("Duplicate task name"));
    }

    // A path that cannot be read must surface as an I/O error, not a panic.
    #[test]
    fn test_task_suite_from_file_missing() {
        let err = TaskSuite::from_file("this/path/should/not/exist/suite.json").unwrap_err();
        assert!(matches!(err, TaskDefinitionError::Io(_)));
    }

    #[test]
    fn test_filter_by_complexity() {
        let json = r#"{
            "tasks": [
                {"name": "t1", "prompt_file": "p.md", "completion_promise": "DONE", "verification": {"command": "echo ok"}, "complexity": "simple"},
                {"name": "t2", "prompt_file": "p.md", "completion_promise": "DONE", "verification": {"command": "echo ok"}, "complexity": "medium"},
                {"name": "t3", "prompt_file": "p.md", "completion_promise": "DONE", "verification": {"command": "echo ok"}, "complexity": "simple"}
            ]
        }"#;

        let suite: TaskSuite = serde_json::from_str(json).unwrap();
        let simple = suite.filter_by_complexity("simple");
        assert_eq!(simple.len(), 2);
        assert!(simple.iter().all(|t| t.complexity == "simple"));
    }

    #[test]
    fn test_filter_by_tag() {
        let json = r#"{
            "tasks": [
                {"name": "t1", "prompt_file": "p.md", "completion_promise": "DONE", "verification": {"command": "echo ok"}, "tags": ["python", "testing"]},
                {"name": "t2", "prompt_file": "p.md", "completion_promise": "DONE", "verification": {"command": "echo ok"}, "tags": ["rust"]},
                {"name": "t3", "prompt_file": "p.md", "completion_promise": "DONE", "verification": {"command": "echo ok"}, "tags": ["python"]}
            ]
        }"#;

        let suite: TaskSuite = serde_json::from_str(json).unwrap();
        let python = suite.filter_by_tag("python");
        assert_eq!(python.len(), 2);
    }

    #[test]
    fn test_setup_has_setup() {
        let empty = TaskSetup::default();
        assert!(!empty.has_setup());

        let with_script = TaskSetup {
            script: Some("setup.sh".to_string()),
            files: vec![],
        };
        assert!(with_script.has_setup());

        let with_files = TaskSetup {
            script: None,
            files: vec!["file.py".to_string()],
        };
        assert!(with_files.has_setup());
    }

    #[test]
    fn test_verification_new() {
        let v = Verification::new("pytest tests/");
        assert_eq!(v.command, "pytest tests/");
        assert_eq!(v.success_exit_code, 0);
    }

    #[test]
    fn test_verification_expect_failure() {
        let v = Verification::expect_failure("false", 1);
        assert_eq!(v.command, "false");
        assert_eq!(v.success_exit_code, 1);
    }
}