Skip to main content

ralph_core/
task_definition.rs

1//! Task definition types for benchmark harness.
2//!
3//! Defines the JSON schema for benchmark tasks, including setup, verification,
4//! and metrics collection. Tasks run in isolated workspaces with their own
5//! `.git` directories to avoid polluting the main repository.
6//!
7//! # Example
8//!
9//! ```
10//! use ralph_core::task_definition::{TaskDefinition, TaskSuite, Verification};
11//!
12//! let task = TaskDefinition::builder("hello-world", "tasks/hello-world/PROMPT.md", "TASK_COMPLETE")
13//!     .verification_command("python hello.py | grep -q 'Hello, World!'")
14//!     .max_iterations(5)
15//!     .expected_iterations(1)
16//!     .complexity("simple")
17//!     .build();
18//!
19//! assert_eq!(task.name, "hello-world");
20//! assert!(task.verification.command.contains("Hello, World!"));
21//! ```
22
23use serde::{Deserialize, Serialize};
24use std::path::Path;
25
26/// A suite of benchmark tasks loaded from a JSON file.
27///
28/// The suite contains multiple tasks that can be run sequentially during
29/// batch benchmarking.
30#[derive(Debug, Clone, Serialize, Deserialize)]
31pub struct TaskSuite {
32    /// List of task definitions.
33    pub tasks: Vec<TaskDefinition>,
34
35    /// Optional suite-level metadata.
36    #[serde(default)]
37    pub metadata: SuiteMetadata,
38}
39
40impl TaskSuite {
41    /// Loads a task suite from a JSON file.
42    pub fn from_file(path: impl AsRef<Path>) -> Result<Self, TaskDefinitionError> {
43        let path_ref = path.as_ref();
44        let content = std::fs::read_to_string(path_ref)?;
45        let suite: Self = serde_json::from_str(&content)?;
46        suite.validate()?;
47        Ok(suite)
48    }
49
50    /// Validates all tasks in the suite.
51    pub fn validate(&self) -> Result<(), TaskDefinitionError> {
52        if self.tasks.is_empty() {
53            return Err(TaskDefinitionError::Validation(
54                "Task suite must contain at least one task".to_string(),
55            ));
56        }
57
58        for task in &self.tasks {
59            task.validate()?;
60        }
61
62        // Check for duplicate names
63        let mut names = std::collections::HashSet::new();
64        for task in &self.tasks {
65            if !names.insert(&task.name) {
66                return Err(TaskDefinitionError::Validation(format!(
67                    "Duplicate task name: '{}'",
68                    task.name
69                )));
70            }
71        }
72
73        Ok(())
74    }
75
76    /// Returns tasks filtered by complexity level.
77    pub fn filter_by_complexity(&self, complexity: &str) -> Vec<&TaskDefinition> {
78        self.tasks
79            .iter()
80            .filter(|t| t.complexity == complexity)
81            .collect()
82    }
83
84    /// Returns tasks filtered by tag.
85    pub fn filter_by_tag(&self, tag: &str) -> Vec<&TaskDefinition> {
86        self.tasks
87            .iter()
88            .filter(|t| t.tags.iter().any(|t| t == tag))
89            .collect()
90    }
91}
92
93/// Suite-level metadata.
94#[derive(Debug, Clone, Default, Serialize, Deserialize)]
95pub struct SuiteMetadata {
96    /// Optional suite name.
97    pub name: Option<String>,
98
99    /// Optional description.
100    pub description: Option<String>,
101
102    /// Suite version.
103    pub version: Option<String>,
104}
105
106/// A single benchmark task definition.
107///
108/// Tasks define what the agent should accomplish, how to verify success,
109/// and optional setup requirements. Each task runs in an isolated workspace.
110#[derive(Debug, Clone, Serialize, Deserialize)]
111pub struct TaskDefinition {
112    // ─────────────────────────────────────────────────────────────────────────
113    // REQUIRED FIELDS
114    // ─────────────────────────────────────────────────────────────────────────
115    /// Unique task identifier (alphanumeric + hyphens).
116    ///
117    /// Used for recording filenames and result reporting.
118    pub name: String,
119
120    /// Path to the prompt markdown file.
121    ///
122    /// Relative to the task suite file or absolute path.
123    pub prompt_file: String,
124
125    /// String the agent outputs when task is complete.
126    ///
127    /// This is detected by the orchestration loop to terminate the task.
128    pub completion_promise: String,
129
130    /// Verification configuration for confirming task success.
131    pub verification: Verification,
132
133    // ─────────────────────────────────────────────────────────────────────────
134    // OPTIONAL FIELDS
135    // ─────────────────────────────────────────────────────────────────────────
136    /// Human-readable description of the task.
137    #[serde(default)]
138    pub description: Option<String>,
139
140    /// Task complexity level: "simple", "medium", or "complex".
141    ///
142    /// Used for filtering and baseline comparisons.
143    #[serde(default = "default_complexity")]
144    pub complexity: String,
145
146    /// Maximum iterations before the task is considered failed.
147    ///
148    /// Safety limit to prevent runaway loops.
149    #[serde(default = "default_max_iterations")]
150    pub max_iterations: u32,
151
152    /// Expected number of iterations for baseline comparison.
153    ///
154    /// Used to calculate `iteration_delta` in results.
155    #[serde(default)]
156    pub expected_iterations: Option<u32>,
157
158    /// Timeout in seconds for the entire task.
159    #[serde(default = "default_timeout_seconds")]
160    pub timeout_seconds: u64,
161
162    /// Setup configuration for the task workspace.
163    #[serde(default)]
164    pub setup: TaskSetup,
165
166    /// Tags for filtering and categorization.
167    #[serde(default)]
168    pub tags: Vec<String>,
169}
170
/// Complexity level assumed when a task does not specify one: `"medium"`.
fn default_complexity() -> String {
    String::from("medium")
}
174
/// Iteration cap assumed when a task does not specify one: 100.
fn default_max_iterations() -> u32 {
    const DEFAULT_MAX_ITERATIONS: u32 = 100;
    DEFAULT_MAX_ITERATIONS
}
178
/// Task timeout assumed when a task does not specify one: five minutes,
/// expressed in seconds.
fn default_timeout_seconds() -> u64 {
    const SECONDS_PER_MINUTE: u64 = 60;
    5 * SECONDS_PER_MINUTE
}
182
183impl TaskDefinition {
184    /// Creates a builder for constructing task definitions.
185    pub fn builder(
186        name: impl Into<String>,
187        prompt_file: impl Into<String>,
188        completion_promise: impl Into<String>,
189    ) -> TaskDefinitionBuilder {
190        TaskDefinitionBuilder::new(name, prompt_file, completion_promise)
191    }
192
193    /// Validates the task definition.
194    pub fn validate(&self) -> Result<(), TaskDefinitionError> {
195        // Validate name format (alphanumeric + hyphens)
196        if self.name.is_empty() {
197            return Err(TaskDefinitionError::MissingField("name".to_string()));
198        }
199
200        if !self
201            .name
202            .chars()
203            .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
204        {
205            return Err(TaskDefinitionError::Validation(format!(
206                "Task name '{}' contains invalid characters. Use alphanumeric, hyphens, or underscores only.",
207                self.name
208            )));
209        }
210
211        // Validate prompt_file is not empty
212        if self.prompt_file.is_empty() {
213            return Err(TaskDefinitionError::MissingField("prompt_file".to_string()));
214        }
215
216        // Validate completion_promise is not empty
217        if self.completion_promise.is_empty() {
218            return Err(TaskDefinitionError::MissingField(
219                "completion_promise".to_string(),
220            ));
221        }
222
223        // Validate verification command is not empty
224        if self.verification.command.is_empty() {
225            return Err(TaskDefinitionError::MissingField(
226                "verification.command".to_string(),
227            ));
228        }
229
230        // Validate complexity is valid
231        if !["simple", "medium", "complex"].contains(&self.complexity.as_str()) {
232            return Err(TaskDefinitionError::Validation(format!(
233                "Invalid complexity '{}'. Must be one of: simple, medium, complex",
234                self.complexity
235            )));
236        }
237
238        Ok(())
239    }
240
241    /// Returns the iteration delta if expected_iterations is set.
242    ///
243    /// `delta = actual - expected` (positive means took more iterations)
244    pub fn iteration_delta(&self, actual: u32) -> Option<i32> {
245        self.expected_iterations
246            .map(|expected| actual as i32 - expected as i32)
247    }
248}
249
250/// Builder for constructing task definitions.
251pub struct TaskDefinitionBuilder {
252    name: String,
253    prompt_file: String,
254    completion_promise: String,
255    verification: Verification,
256    description: Option<String>,
257    complexity: String,
258    max_iterations: u32,
259    expected_iterations: Option<u32>,
260    timeout_seconds: u64,
261    setup: TaskSetup,
262    tags: Vec<String>,
263}
264
265impl TaskDefinitionBuilder {
266    /// Creates a new builder with required fields.
267    pub fn new(
268        name: impl Into<String>,
269        prompt_file: impl Into<String>,
270        completion_promise: impl Into<String>,
271    ) -> Self {
272        Self {
273            name: name.into(),
274            prompt_file: prompt_file.into(),
275            completion_promise: completion_promise.into(),
276            verification: Verification::default(),
277            description: None,
278            complexity: default_complexity(),
279            max_iterations: default_max_iterations(),
280            expected_iterations: None,
281            timeout_seconds: default_timeout_seconds(),
282            setup: TaskSetup::default(),
283            tags: Vec::new(),
284        }
285    }
286
287    /// Sets the verification command.
288    pub fn verification_command(mut self, command: impl Into<String>) -> Self {
289        self.verification.command = command.into();
290        self
291    }
292
293    /// Sets the verification success exit code.
294    pub fn verification_exit_code(mut self, code: i32) -> Self {
295        self.verification.success_exit_code = code;
296        self
297    }
298
299    /// Sets the full verification configuration.
300    pub fn verification(mut self, verification: Verification) -> Self {
301        self.verification = verification;
302        self
303    }
304
305    /// Sets the task description.
306    pub fn description(mut self, description: impl Into<String>) -> Self {
307        self.description = Some(description.into());
308        self
309    }
310
311    /// Sets the complexity level.
312    pub fn complexity(mut self, complexity: impl Into<String>) -> Self {
313        self.complexity = complexity.into();
314        self
315    }
316
317    /// Sets the maximum iterations.
318    pub fn max_iterations(mut self, max: u32) -> Self {
319        self.max_iterations = max;
320        self
321    }
322
323    /// Sets the expected iterations for baseline comparison.
324    pub fn expected_iterations(mut self, expected: u32) -> Self {
325        self.expected_iterations = Some(expected);
326        self
327    }
328
329    /// Sets the timeout in seconds.
330    pub fn timeout_seconds(mut self, seconds: u64) -> Self {
331        self.timeout_seconds = seconds;
332        self
333    }
334
335    /// Sets the setup configuration.
336    pub fn setup(mut self, setup: TaskSetup) -> Self {
337        self.setup = setup;
338        self
339    }
340
341    /// Sets the setup script.
342    pub fn setup_script(mut self, script: impl Into<String>) -> Self {
343        self.setup.script = Some(script.into());
344        self
345    }
346
347    /// Sets the setup files.
348    pub fn setup_files(mut self, files: Vec<String>) -> Self {
349        self.setup.files = files;
350        self
351    }
352
353    /// Adds tags.
354    pub fn tags(mut self, tags: Vec<String>) -> Self {
355        self.tags = tags;
356        self
357    }
358
359    /// Adds a single tag.
360    pub fn tag(mut self, tag: impl Into<String>) -> Self {
361        self.tags.push(tag.into());
362        self
363    }
364
365    /// Builds the task definition.
366    pub fn build(self) -> TaskDefinition {
367        TaskDefinition {
368            name: self.name,
369            prompt_file: self.prompt_file,
370            completion_promise: self.completion_promise,
371            verification: self.verification,
372            description: self.description,
373            complexity: self.complexity,
374            max_iterations: self.max_iterations,
375            expected_iterations: self.expected_iterations,
376            timeout_seconds: self.timeout_seconds,
377            setup: self.setup,
378            tags: self.tags,
379        }
380    }
381}
382
383/// Verification configuration for a task.
384#[derive(Debug, Clone, Default, Serialize, Deserialize)]
385pub struct Verification {
386    /// Bash command to verify task success.
387    ///
388    /// Runs in the task workspace after completion promise is detected.
389    #[serde(default)]
390    pub command: String,
391
392    /// Exit code that indicates success (default: 0).
393    #[serde(default)]
394    pub success_exit_code: i32,
395}
396
397impl Verification {
398    /// Creates a new verification with the given command.
399    pub fn new(command: impl Into<String>) -> Self {
400        Self {
401            command: command.into(),
402            success_exit_code: 0,
403        }
404    }
405
406    /// Creates a verification that expects a non-zero exit code.
407    pub fn expect_failure(command: impl Into<String>, exit_code: i32) -> Self {
408        Self {
409            command: command.into(),
410            success_exit_code: exit_code,
411        }
412    }
413}
414
415/// Setup configuration for task workspace.
416#[derive(Debug, Clone, Default, Serialize, Deserialize)]
417pub struct TaskSetup {
418    /// Script to run before the task starts.
419    ///
420    /// Executed in the task workspace directory.
421    #[serde(default)]
422    pub script: Option<String>,
423
424    /// Files to copy to the task workspace.
425    ///
426    /// Paths relative to the task suite file.
427    #[serde(default)]
428    pub files: Vec<String>,
429}
430
431impl TaskSetup {
432    /// Returns true if there is any setup to perform.
433    pub fn has_setup(&self) -> bool {
434        self.script.is_some() || !self.files.is_empty()
435    }
436}
437
438/// Errors that can occur when working with task definitions.
439#[derive(Debug, thiserror::Error)]
440pub enum TaskDefinitionError {
441    /// IO error reading task file.
442    #[error("IO error: {0}")]
443    Io(#[from] std::io::Error),
444
445    /// JSON parse error.
446    #[error("JSON parse error: {0}")]
447    Json(#[from] serde_json::Error),
448
449    /// Missing required field.
450    #[error("Missing required field: {0}")]
451    MissingField(String),
452
453    /// Validation error.
454    #[error("Validation error: {0}")]
455    Validation(String),
456}
457
#[cfg(test)]
mod tests {
    use super::*;

    /// Starts a builder with the `prompt.md` / `DONE` / `echo ok` values that
    /// most tests share.
    fn passing_builder(name: &str) -> TaskDefinitionBuilder {
        TaskDefinition::builder(name, "prompt.md", "DONE").verification_command("echo ok")
    }

    /// Deserializes a suite from inline JSON, panicking on malformed input.
    fn parse_suite(json: &str) -> TaskSuite {
        serde_json::from_str(json).unwrap()
    }

    #[test]
    fn test_task_definition_builder() {
        let built = TaskDefinition::builder("hello-world", "tasks/hello.md", "TASK_COMPLETE")
            .verification_command("python hello.py | grep -q 'Hello, World!'")
            .description("Create a hello world script")
            .complexity("simple")
            .max_iterations(5)
            .expected_iterations(1)
            .tag("python")
            .build();

        assert_eq!(built.name, "hello-world");
        assert_eq!(built.prompt_file, "tasks/hello.md");
        assert_eq!(built.completion_promise, "TASK_COMPLETE");
        assert!(built.verification.command.contains("Hello, World!"));
        assert_eq!(built.complexity, "simple");
        assert_eq!(built.max_iterations, 5);
        assert_eq!(built.expected_iterations, Some(1));
        assert!(built.tags.iter().any(|tag| tag == "python"));
    }

    #[test]
    fn test_task_definition_defaults() {
        let built = passing_builder("test").build();

        assert_eq!(built.complexity, "medium");
        assert_eq!(built.max_iterations, 100);
        assert_eq!(built.timeout_seconds, 300);
        assert_eq!(built.expected_iterations, None);
        assert_eq!(built.tags.len(), 0);
    }

    #[test]
    fn test_task_validation_valid() {
        let outcome = passing_builder("valid-task").build().validate();

        assert!(outcome.is_ok());
    }

    #[test]
    fn test_task_validation_invalid_name() {
        let outcome = passing_builder("invalid task name!").build().validate();

        assert!(matches!(outcome, Err(TaskDefinitionError::Validation(_))));
    }

    #[test]
    fn test_task_validation_empty_prompt() {
        let outcome = TaskDefinition::builder("test", "", "DONE")
            .verification_command("echo ok")
            .build()
            .validate();

        match outcome {
            Err(TaskDefinitionError::MissingField(field)) => assert_eq!(field, "prompt_file"),
            other => panic!("expected MissingField, got {:?}", other),
        }
    }

    #[test]
    fn test_task_validation_empty_verification() {
        let outcome = TaskDefinition::builder("test", "prompt.md", "DONE")
            .build()
            .validate();

        match outcome {
            Err(TaskDefinitionError::MissingField(field)) => {
                assert_eq!(field, "verification.command")
            }
            other => panic!("expected MissingField, got {:?}", other),
        }
    }

    #[test]
    fn test_task_validation_invalid_complexity() {
        let outcome = passing_builder("test")
            .complexity("invalid")
            .build()
            .validate();

        assert!(matches!(outcome, Err(TaskDefinitionError::Validation(_))));
    }

    #[test]
    fn test_iteration_delta() {
        let task = passing_builder("test").expected_iterations(5).build();

        // (actual iterations, expected delta): fewer than, more than, and
        // exactly the expected count.
        for (actual, delta) in [(3u32, -2i32), (7, 2), (5, 0)] {
            assert_eq!(task.iteration_delta(actual), Some(delta));
        }
    }

    #[test]
    fn test_iteration_delta_no_expected() {
        let task = passing_builder("test").build();

        assert_eq!(task.iteration_delta(5), None);
    }

    #[test]
    fn test_task_suite_parse() {
        let suite = parse_suite(
            r#"{
            "tasks": [
                {
                    "name": "hello-world",
                    "prompt_file": "tasks/hello/PROMPT.md",
                    "completion_promise": "TASK_COMPLETE",
                    "verification": {
                        "command": "python hello.py | grep -q 'Hello, World!'"
                    },
                    "complexity": "simple",
                    "max_iterations": 5,
                    "expected_iterations": 1
                },
                {
                    "name": "fizzbuzz-tdd",
                    "description": "Implement FizzBuzz with TDD",
                    "prompt_file": "tasks/fizzbuzz/PROMPT.md",
                    "completion_promise": "TESTS_PASSING",
                    "verification": {
                        "command": "pytest test_fizzbuzz.py -v"
                    },
                    "complexity": "medium",
                    "max_iterations": 15,
                    "expected_iterations": 5,
                    "setup": {
                        "files": ["test_fizzbuzz.py"]
                    },
                    "tags": ["python", "tdd"]
                }
            ],
            "metadata": {
                "name": "Ralph Benchmark Suite",
                "version": "1.0.0"
            }
        }"#,
        );
        assert_eq!(suite.tasks.len(), 2);

        let first = &suite.tasks[0];
        assert_eq!(first.name, "hello-world");
        assert_eq!(first.complexity, "simple");
        assert_eq!(first.max_iterations, 5);
        assert_eq!(first.expected_iterations, Some(1));

        let second = &suite.tasks[1];
        assert_eq!(second.name, "fizzbuzz-tdd");
        assert!(second.description.is_some());
        assert_eq!(second.setup.files.len(), 1);
        assert!(second.tags.iter().any(|tag| tag == "tdd"));

        assert_eq!(
            suite.metadata.name.as_deref(),
            Some("Ralph Benchmark Suite")
        );
    }

    #[test]
    fn test_task_suite_validation_empty() {
        let empty_suite = TaskSuite {
            tasks: Vec::new(),
            metadata: SuiteMetadata::default(),
        };

        assert!(matches!(
            empty_suite.validate(),
            Err(TaskDefinitionError::Validation(_))
        ));
    }

    #[test]
    fn test_task_suite_validation_duplicates() {
        let first = passing_builder("duplicate").build();
        let second = first.clone();

        let suite = TaskSuite {
            tasks: vec![first, second],
            metadata: SuiteMetadata::default(),
        };

        let message = suite.validate().unwrap_err().to_string();
        assert!(message.contains("Duplicate task name"));
    }

    #[test]
    fn test_filter_by_complexity() {
        let suite = parse_suite(
            r#"{
            "tasks": [
                {"name": "t1", "prompt_file": "p.md", "completion_promise": "DONE", "verification": {"command": "echo ok"}, "complexity": "simple"},
                {"name": "t2", "prompt_file": "p.md", "completion_promise": "DONE", "verification": {"command": "echo ok"}, "complexity": "medium"},
                {"name": "t3", "prompt_file": "p.md", "completion_promise": "DONE", "verification": {"command": "echo ok"}, "complexity": "simple"}
            ]
        }"#,
        );

        let simple_tasks = suite.filter_by_complexity("simple");
        assert_eq!(simple_tasks.len(), 2);
        for task in simple_tasks {
            assert_eq!(task.complexity, "simple");
        }
    }

    #[test]
    fn test_filter_by_tag() {
        let suite = parse_suite(
            r#"{
            "tasks": [
                {"name": "t1", "prompt_file": "p.md", "completion_promise": "DONE", "verification": {"command": "echo ok"}, "tags": ["python", "testing"]},
                {"name": "t2", "prompt_file": "p.md", "completion_promise": "DONE", "verification": {"command": "echo ok"}, "tags": ["rust"]},
                {"name": "t3", "prompt_file": "p.md", "completion_promise": "DONE", "verification": {"command": "echo ok"}, "tags": ["python"]}
            ]
        }"#,
        );

        assert_eq!(suite.filter_by_tag("python").len(), 2);
    }

    #[test]
    fn test_setup_has_setup() {
        assert!(!TaskSetup::default().has_setup());

        let script_only = TaskSetup {
            script: Some("setup.sh".to_string()),
            files: Vec::new(),
        };
        assert!(script_only.has_setup());

        let files_only = TaskSetup {
            script: None,
            files: vec!["file.py".to_string()],
        };
        assert!(files_only.has_setup());
    }

    #[test]
    fn test_verification_new() {
        let Verification {
            command,
            success_exit_code,
        } = Verification::new("pytest tests/");

        assert_eq!(command, "pytest tests/");
        assert_eq!(success_exit_code, 0);
    }

    #[test]
    fn test_verification_expect_failure() {
        let Verification {
            command,
            success_exit_code,
        } = Verification::expect_failure("false", 1);

        assert_eq!(command, "false");
        assert_eq!(success_exit_code, 1);
    }
}