deepstrike_core/harness/
eval.rs

1//! Evaluation primitives — the agent's "quality gate" compute.
2//!
3//! Pure computation in kernel, I/O in SDK. This module provides the **stateless** building blocks
4//! for the generate → evaluate → retry quality gate:
5//!
6//! - [`build_eval_messages`] assembles the impartial-evaluator prompt from a goal + criteria + the
7//!   agent's output (the SDK then calls the eval LLM with it).
8//! - [`parse_verdict`] parses the LLM's JSON response into a structured [`EvalResult`].
9//! - [`verdict_output_schema`] is the JSON Schema for that verdict, used as the `output_schema` of
10//!   the eval node in the [`crate::orchestration::workflow::gen_eval`] workflow template.
11//!
12//! **History (0.5.0 fold, OS-axis #6).** This replaces the former `EvalPipeline` state machine +
13//! its public SDK class. The quality gate is now expressed on the workflow substrate: the iterative
14//! retry-with-feedback loop is driven by the SDK `HarnessLoop` (the kernel `NodeKind::Loop` re-arms
15//! a single node, so per-iteration eval cannot be a static DAG), and the declarative
16//! "loop-the-worker-then-verify-with-a-structured-verdict" shape is the `gen_eval` template. Both
17//! reuse these primitives, so the verdict shape stays consistent across the two paths.
18use crate::types::message::{Content, Message, Role};
19
20// ---------------------------------------------------------------------------
21// Input types
22// ---------------------------------------------------------------------------
23
/// One criterion the evaluator scores the agent's output against.
///
/// A criterion is either blocking (`required == true`) or advisory, and carries a relative
/// weight. Plain strings convert into blocking criteria with the default weight.
#[derive(Debug, Clone)]
pub struct Criterion {
    /// The criterion statement as it is shown to the evaluator.
    pub text: String,
    /// If true, failing this criterion fails the entire evaluation.
    pub required: bool,
    /// Relative weight for scoring (default 1.0).
    pub weight: f32,
}

impl Criterion {
    /// Creates a blocking criterion with the default weight of 1.0.
    pub fn required(text: impl Into<String>) -> Self {
        let text = text.into();
        Self {
            text,
            required: true,
            weight: 1.0,
        }
    }

    /// Creates a non-blocking criterion with the default weight of 1.0.
    pub fn optional(text: impl Into<String>) -> Self {
        // Identical to a required criterion except for the flag.
        Self {
            required: false,
            ..Self::required(text)
        }
    }

    /// Builder-style setter: consumes the criterion and returns it with `weight` set to `w`.
    pub fn with_weight(self, w: f32) -> Self {
        Self { weight: w, ..self }
    }
}

/// An owned string becomes a required criterion with weight 1.0.
impl From<String> for Criterion {
    fn from(s: String) -> Self {
        Criterion::required(s)
    }
}

/// A string slice becomes a required criterion with weight 1.0.
impl From<&str> for Criterion {
    fn from(s: &str) -> Self {
        Criterion::required(s.to_owned())
    }
}
68
69// ---------------------------------------------------------------------------
70// Output types
71// ---------------------------------------------------------------------------
72
/// Per-criterion evaluation result.
///
/// Corresponds to one entry of the `details` array in the evaluator's JSON verdict, as read by
/// [`parse_verdict`].
#[derive(Debug, Clone)]
pub struct CriterionResult {
    /// Criterion text taken from the entry's `criterion` key. [`parse_verdict`] drops entries
    /// where that key is missing or not a string.
    pub criterion: String,
    /// Whether the evaluator judged this criterion satisfied; `false` when the `passed` key is
    /// missing or not a boolean.
    pub passed: bool,
    /// 0.0–1.0 partial credit score.
    pub score: f32,
    /// Evaluator's comment on this criterion; empty when the `feedback` key is missing.
    pub feedback: String,
}
82
/// A skill distilled from a successful run — SDK writes this to `skill_dir`.
///
/// Built by [`parse_verdict`] from the optional `skill` object of the evaluator's verdict.
#[derive(Debug, Clone)]
pub struct SkillCandidate {
    /// Skill identifier; the evaluator is asked for `snake_case`. [`parse_verdict`] rejects a
    /// candidate whose name is empty.
    pub name: String,
    /// Short description of the skill (the evaluator is asked for one sentence).
    pub description: String,
    /// Optional usage hint; `None` when the evaluator omitted it or sent an empty string.
    pub when_to_use: Option<String>,
    /// Markdown body only (no frontmatter) — SDK assembles the full file.
    pub content: String,
}
92
/// The structured verdict produced by parsing the eval LLM's JSON response.
///
/// Returned by [`parse_verdict`], which fills in defaults for any field the evaluator left out.
#[derive(Debug, Clone)]
pub struct EvalResult {
    /// Overall pass/fail decision; `false` when the response carries no boolean `passed` key
    /// (including when the response is not parseable JSON).
    pub passed: bool,
    /// Weighted aggregate score across all criteria (0.0–1.0).
    pub overall_score: f32,
    /// Human-readable summary injected into the next attempt's goal.
    pub feedback: String,
    /// Per-criterion breakdown.
    pub details: Vec<CriterionResult>,
    /// Reusable skill proposed by the evaluator, if the verdict contained a usable `skill`
    /// object.
    pub skill_candidate: Option<SkillCandidate>,
}
105
106// ---------------------------------------------------------------------------
107// Prompt builder
108// ---------------------------------------------------------------------------
109
110/// Build the impartial-evaluator messages for one attempt: a system instruction describing the
111/// scoring contract + a user message carrying the goal, criteria, and the agent's output. The SDK
112/// calls the eval LLM with these, then feeds the response to [`parse_verdict`].
113pub fn build_eval_messages(
114    goal: &str,
115    criteria: &[Criterion],
116    result: &str,
117    attempt: u32,
118    extract_skill_on_pass: bool,
119) -> Vec<Message> {
120    let criteria_text = if criteria.is_empty() {
121        "No explicit criteria — use general quality judgement.".to_string()
122    } else {
123        criteria
124            .iter()
125            .enumerate()
126            .map(|(i, c)| {
127                let tag = if c.required {
128                    "[required]"
129                } else {
130                    "[optional]"
131                };
132                let weight = if (c.weight - 1.0).abs() > 0.01 {
133                    format!(" weight={:.1}", c.weight)
134                } else {
135                    String::new()
136                };
137                format!("{}. {}{}{}", i + 1, tag, weight, c.text)
138            })
139            .collect::<Vec<_>>()
140            .join("\n")
141    };
142
143    let details_schema = r#"[{"criterion":"...","passed":bool,"score":0.0-1.0,"feedback":"..."}]"#;
144
145    let skill_instruction = if extract_skill_on_pass {
146        "\nIf passed=true and the approach is reusable, add a \"skill\" field:\
147\n{\"name\":\"snake_case\",\"description\":\"one sentence\",\"when_to_use\":\"optional hint\",\"content\":\"markdown body (no frontmatter)\"}"
148    } else {
149        ""
150    };
151
152    let system = Message {
153        role: Role::System,
154        content: Content::Text(format!(
155            "You are an impartial evaluator. Assess whether the agent's output meets the goal and criteria.\n\
156             [required] criteria must ALL pass for overall passed=true.\n\
157             [optional] criteria contribute to overall_score but do not block passing.\n\
158             Respond with JSON only:\n\
159             {{\"passed\":bool,\"overall_score\":0.0-1.0,\"feedback\":\"concise summary\",\
160             \"details\":{details_schema}{skill_instruction}}}"
161        )),
162        tool_calls: vec![],
163        token_count: None,
164    };
165
166    let user = Message {
167        role: Role::User,
168        content: Content::Text(format!(
169            "## Goal\n{goal}\n\n## Criteria\n{criteria_text}\n\n## Agent Output (attempt {attempt})\n{result}"
170        )),
171        tool_calls: vec![],
172        token_count: None,
173    };
174
175    vec![system, user]
176}
177
178// ---------------------------------------------------------------------------
179// Verdict output schema (for the gen_eval workflow template's eval node)
180// ---------------------------------------------------------------------------
181
182/// JSON Schema for the verdict an eval node must produce. Used as the `output_schema` of the eval
183/// node in the [`crate::orchestration::workflow::gen_eval`] template so the SDK can instruct +
184/// validate the verdict. Matches what [`parse_verdict`] reads.
185pub fn verdict_output_schema(extract_skill_on_pass: bool) -> serde_json::Value {
186    let mut properties = serde_json::json!({
187        "passed": { "type": "boolean", "description": "true iff all [required] criteria pass" },
188        "overall_score": { "type": "number", "minimum": 0.0, "maximum": 1.0 },
189        "feedback": { "type": "string", "description": "concise summary; on fail, what to fix next attempt" },
190        "details": {
191            "type": "array",
192            "items": {
193                "type": "object",
194                "required": ["criterion", "passed", "score", "feedback"],
195                "properties": {
196                    "criterion": { "type": "string" },
197                    "passed": { "type": "boolean" },
198                    "score": { "type": "number", "minimum": 0.0, "maximum": 1.0 },
199                    "feedback": { "type": "string" }
200                }
201            }
202        }
203    });
204    if extract_skill_on_pass {
205        properties["skill"] = serde_json::json!({
206            "type": "object",
207            "description": "optional reusable skill distilled from a passing run",
208            "required": ["name", "description", "content"],
209            "properties": {
210                "name": { "type": "string", "description": "snake_case" },
211                "description": { "type": "string" },
212                "when_to_use": { "type": "string" },
213                "content": { "type": "string", "description": "markdown body, no frontmatter" }
214            }
215        });
216    }
217    serde_json::json!({
218        "type": "object",
219        "required": ["passed", "overall_score", "feedback"],
220        "properties": properties
221    })
222}
223
224// ---------------------------------------------------------------------------
225// Response parser
226// ---------------------------------------------------------------------------
227
228/// Parse an eval LLM's JSON response into a structured [`EvalResult`]. Tolerant of markdown fences
229/// and missing fields (defaults: `passed=false`, score derived from `passed`).
230pub fn parse_verdict(content: &str) -> EvalResult {
231    let json_str = extract_json(content);
232    let v: serde_json::Value = serde_json::from_str(json_str).unwrap_or(serde_json::Value::Null);
233
234    let passed = v.get("passed").and_then(|x| x.as_bool()).unwrap_or(false);
235    let overall_score = v
236        .get("overall_score")
237        .and_then(|x| x.as_f64())
238        .map(|f| f as f32)
239        .unwrap_or(if passed { 1.0 } else { 0.0 });
240    let feedback = v
241        .get("feedback")
242        .and_then(|x| x.as_str())
243        .unwrap_or("No feedback provided.")
244        .to_string();
245
246    let details = v
247        .get("details")
248        .and_then(|d| d.as_array())
249        .map(|arr| {
250            arr.iter()
251                .filter_map(|item| {
252                    let criterion = item.get("criterion")?.as_str()?.to_string();
253                    let item_passed = item
254                        .get("passed")
255                        .and_then(|x| x.as_bool())
256                        .unwrap_or(false);
257                    let score = item
258                        .get("score")
259                        .and_then(|x| x.as_f64())
260                        .map(|f| f as f32)
261                        .unwrap_or(if item_passed { 1.0 } else { 0.0 });
262                    let item_feedback = item
263                        .get("feedback")
264                        .and_then(|x| x.as_str())
265                        .unwrap_or("")
266                        .to_string();
267                    Some(CriterionResult {
268                        criterion,
269                        passed: item_passed,
270                        score,
271                        feedback: item_feedback,
272                    })
273                })
274                .collect()
275        })
276        .unwrap_or_default();
277
278    let skill_candidate = v.get("skill").and_then(|s| {
279        let name = s.get("name")?.as_str()?.to_string();
280        let description = s.get("description")?.as_str()?.to_string();
281        let content = s.get("content")?.as_str()?.to_string();
282        if name.is_empty() {
283            return None;
284        }
285        let when_to_use = s
286            .get("when_to_use")
287            .and_then(|x| x.as_str())
288            .filter(|x| !x.is_empty())
289            .map(|x| x.to_string());
290        Some(SkillCandidate {
291            name,
292            description,
293            when_to_use,
294            content,
295        })
296    });
297
298    EvalResult {
299        passed,
300        overall_score,
301        feedback,
302        details,
303        skill_candidate,
304    }
305}
306
/// Returns the slice of `s` spanning the first `{` through the last `}` (inclusive).
///
/// This discards anything wrapped around the JSON object, such as ```` ```json ```` fences or
/// surrounding prose. When `s` has no `{`, no `}`, or the last `}` comes before the first `{`,
/// there is no object to isolate and `s` is returned unchanged (the caller's JSON parse then
/// fails and falls back to defaults).
fn extract_json(s: &str) -> &str {
    match (s.find('{'), s.rfind('}')) {
        // `start < end` guards the slice: with the braces in the wrong order (e.g. "} ... {")
        // an unguarded `s[start..=end]` would panic. Both delimiters are single-byte ASCII, so
        // the bounds always fall on char boundaries.
        (Some(start), Some(end)) if start < end => &s[start..=end],
        _ => s,
    }
}
316
317// ---------------------------------------------------------------------------
318// Tests
319// ---------------------------------------------------------------------------
320
// Unit tests for the prompt builder, the verdict parser, the criterion constructors, and the
// verdict schema.
#[cfg(test)]
mod tests {
    use super::*;

    // The user message carries the goal, the tagged criterion and the attempt number; the system
    // message mentions the "skill" field when skill extraction is requested.
    #[test]
    fn build_eval_messages_carries_goal_and_criteria() {
        let msgs = build_eval_messages(
            "Write a function",
            &[Criterion::required("Must handle errors")],
            "fn foo() {}",
            1,
            true,
        );
        assert_eq!(msgs.len(), 2);
        assert!(matches!(msgs[0].role, Role::System));
        let Content::Text(user) = &msgs[1].content else {
            panic!("expected text")
        };
        assert!(user.contains("Write a function"));
        assert!(user.contains("[required]Must handle errors"));
        assert!(user.contains("attempt 1"));
        // skill instruction present when extract_skill_on_pass=true
        let Content::Text(system) = &msgs[0].content else {
            panic!("expected text")
        };
        assert!(system.contains("\"skill\""));
    }

    // With skill extraction disabled the system prompt must not contain the skill template.
    #[test]
    fn build_eval_messages_omits_skill_instruction_when_disabled() {
        let msgs = build_eval_messages("g", &[], "r", 1, false);
        let Content::Text(system) = &msgs[0].content else {
            panic!("expected text")
        };
        assert!(!system.contains("\"name\":\"snake_case\""));
    }

    // A failing verdict without a "skill" key parses into a failed result with one detail entry
    // and no skill candidate.
    #[test]
    fn parse_verdict_failed_no_skill() {
        let result = parse_verdict(
            r#"{"passed":false,"overall_score":0.2,"feedback":"Missing error handling","details":[{"criterion":"Must handle errors","passed":false,"score":0.2,"feedback":"No error handling found"}]}"#,
        );
        assert!(!result.passed);
        assert_eq!(result.feedback, "Missing error handling");
        assert_eq!(result.details.len(), 1);
        assert!(!result.details[0].passed);
        assert!(result.skill_candidate.is_none());
    }

    // A passing verdict with details and a complete "skill" object yields both the breakdown and
    // the skill candidate.
    #[test]
    fn parse_verdict_passed_with_skill_and_details() {
        let json = r#"{"passed":true,"overall_score":0.95,"feedback":"All criteria met","details":[{"criterion":"Must handle errors","passed":true,"score":1.0,"feedback":"Good error handling"}],"skill":{"name":"robust_api_call","description":"How to call APIs with retries","content":"Robust API Call - Always retry on 5xx."}}"#;
        let result = parse_verdict(json);
        assert!(result.passed);
        assert!(result.overall_score > 0.9);
        assert_eq!(result.details.len(), 1);
        assert!(result.details[0].passed);
        let skill = result.skill_candidate.unwrap();
        assert_eq!(skill.name, "robust_api_call");
        assert!(skill.content.contains("retry"));
    }

    // A response wrapped in a ```json fence is still parsed.
    #[test]
    fn parse_verdict_strips_markdown_fences() {
        let result = parse_verdict("```json\n{\"passed\":true,\"feedback\":\"good\"}\n```");
        assert!(result.passed);
    }

    // Converting a string slice produces a required criterion with the default weight.
    #[test]
    fn criterion_from_string_is_required() {
        let c = Criterion::from("some check");
        assert!(c.required);
        assert!((c.weight - 1.0).abs() < 0.001);
    }

    // `optional` clears the required flag and `with_weight` overrides the default weight.
    #[test]
    fn optional_criterion_with_weight() {
        let c = Criterion::optional("bonus check").with_weight(0.5);
        assert!(!c.required);
        assert!((c.weight - 0.5).abs() < 0.001);
    }

    // The schema is an object schema exposing the core verdict properties; "skill" appears only
    // when skill extraction is enabled.
    #[test]
    fn verdict_output_schema_shape() {
        let schema = verdict_output_schema(true);
        assert_eq!(schema["type"], "object");
        assert!(schema["properties"]["passed"].is_object());
        assert!(schema["properties"]["overall_score"].is_object());
        assert!(schema["properties"]["details"].is_object());
        assert!(schema["properties"]["skill"].is_object());
        // skill property dropped when extraction is disabled
        let no_skill = verdict_output_schema(false);
        assert!(no_skill["properties"]["skill"].is_null());
    }
}
deepstrike_core/harness/eval.rs

deepstrike_core/harness/
eval.rs