deepstrike-core 0.2.35

Cross-language agent runtime kernel — pure computation, zero I/O
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
//! Evaluation primitives — the agent's "quality gate" compute.
//!
//! Pure computation in kernel, I/O in SDK. This module provides the **stateless** building blocks
//! for the generate → evaluate → retry quality gate:
//!
//! - [`build_eval_messages`] assembles the impartial-evaluator prompt from a goal + criteria + the
//!   agent's output (the SDK then calls the eval LLM with it).
//! - [`parse_verdict`] parses the LLM's JSON response into a structured [`EvalResult`].
//! - [`verdict_output_schema`] is the JSON Schema for that verdict, used as the `output_schema` of
//!   the eval node in the [`crate::orchestration::workflow::gen_eval`] workflow template.
//!
//! **History (0.5.0 fold, OS-axis #6).** This replaces the former `EvalPipeline` state machine +
//! its public SDK class. The quality gate is now expressed on the workflow substrate: the iterative
//! retry-with-feedback loop is driven by the SDK `HarnessLoop` (the kernel `NodeKind::Loop` re-arms
//! a single node, so per-iteration eval cannot be a static DAG), and the declarative
//! "loop-the-worker-then-verify-with-a-structured-verdict" shape is the `gen_eval` template. Both
//! reuse these primitives, so the verdict shape stays consistent across the two paths.
use crate::types::message::{Content, Message, Role};

// ---------------------------------------------------------------------------
// Input types
// ---------------------------------------------------------------------------

/// One evaluation criterion: the text shown to the evaluator, whether it is mandatory, and its
/// relative scoring weight.
#[derive(Debug, Clone)]
pub struct Criterion {
    /// The criterion as it is rendered into the evaluator prompt.
    pub text: String,
    /// If true, failing this criterion fails the entire evaluation.
    pub required: bool,
    /// Relative weight for scoring (default 1.0).
    pub weight: f32,
}

impl Criterion {
    /// Creates a mandatory criterion with the default weight of 1.0.
    pub fn required(text: impl Into<String>) -> Self {
        let text = text.into();
        Criterion {
            text,
            required: true,
            weight: 1.0,
        }
    }

    /// Creates a non-blocking criterion with the default weight of 1.0.
    pub fn optional(text: impl Into<String>) -> Self {
        let text = text.into();
        Criterion {
            text,
            required: false,
            weight: 1.0,
        }
    }

    /// Builder-style setter: returns the criterion with its weight replaced by `w`.
    pub fn with_weight(self, w: f32) -> Self {
        Self { weight: w, ..self }
    }
}

/// An owned string converts into a *required* criterion.
impl From<String> for Criterion {
    fn from(s: String) -> Self {
        Criterion::required(s)
    }
}

/// A string slice converts into a *required* criterion.
impl From<&str> for Criterion {
    fn from(s: &str) -> Self {
        Criterion::required(s)
    }
}

// ---------------------------------------------------------------------------
// Output types
// ---------------------------------------------------------------------------

/// Per-criterion evaluation result.
///
/// One entry of the `details` array in the evaluator's verdict, as read by [`parse_verdict`].
#[derive(Debug, Clone)]
pub struct CriterionResult {
    /// Criterion text as reported by the evaluator in the entry's `criterion` field.
    pub criterion: String,
    /// Whether the evaluator marked this criterion as satisfied.
    pub passed: bool,
    /// 0.0–1.0 partial credit score. [`parse_verdict`] copies the evaluator's number as-is
    /// (it does not clamp) and falls back to 1.0 / 0.0 from `passed` when no number is given.
    pub score: f32,
    /// Evaluator's comment on this criterion; empty when the verdict entry carries none.
    pub feedback: String,
}

/// A skill distilled from a successful run — SDK writes this to `skill_dir`.
///
/// [`parse_verdict`] produces one from the verdict's `skill` object only when `name`,
/// `description` and `content` are all strings and `name` is non-empty.
#[derive(Debug, Clone)]
pub struct SkillCandidate {
    /// Skill identifier; the evaluator prompt asks for `snake_case`.
    pub name: String,
    /// Short description; the evaluator prompt asks for one sentence.
    pub description: String,
    /// Optional usage hint; `None` when the verdict omits it or supplies an empty string.
    pub when_to_use: Option<String>,
    /// Markdown body only (no frontmatter) — SDK assembles the full file.
    pub content: String,
}

/// The structured verdict produced by parsing the eval LLM's JSON response.
///
/// Built by [`parse_verdict`], which substitutes defaults for any field the response omits.
#[derive(Debug, Clone)]
pub struct EvalResult {
    /// Overall pass/fail as reported by the evaluator; `false` when the field is missing or
    /// the response is not valid JSON.
    pub passed: bool,
    /// Weighted aggregate score across all criteria (0.0–1.0). When the response omits it,
    /// the parser derives 1.0 or 0.0 from `passed`.
    pub overall_score: f32,
    /// Human-readable summary injected into the next attempt's goal. Defaults to
    /// `"No feedback provided."` when the response has no `feedback` string.
    pub feedback: String,
    /// Per-criterion breakdown. Empty when the response has no `details` array.
    pub details: Vec<CriterionResult>,
    /// Reusable skill proposed by the evaluator, if the response carried a usable `skill` object.
    pub skill_candidate: Option<SkillCandidate>,
}

// ---------------------------------------------------------------------------
// Prompt builder
// ---------------------------------------------------------------------------

/// Build the impartial-evaluator messages for one attempt.
///
/// Returns exactly two messages: a system instruction describing the scoring contract and the
/// JSON reply shape, followed by a user message carrying the goal, the criteria, and the agent's
/// output. The SDK calls the eval LLM with these, then feeds the response to [`parse_verdict`].
///
/// # Parameters
/// - `goal`: the task statement, placed under the `## Goal` heading.
/// - `criteria`: rendered one per line as `N. [required|optional]<text>`; a ` weight=W `
///   annotation is inserted before the text only when the weight differs from 1.0 by more than
///   0.01. An empty slice yields a "use general quality judgement" placeholder.
/// - `result`: the agent's output, embedded verbatim under the `## Agent Output` heading.
/// - `attempt`: attempt number shown in that heading.
/// - `extract_skill_on_pass`: when true, the system prompt additionally asks the evaluator to
///   add an optional `"skill"` object on a passing, reusable run.
pub fn build_eval_messages(
    goal: &str,
    criteria: &[Criterion],
    result: &str,
    attempt: u32,
    extract_skill_on_pass: bool,
) -> Vec<Message> {
    let criteria_text = if criteria.is_empty() {
        "No explicit criteria — use general quality judgement.".to_string()
    } else {
        criteria
            .iter()
            .enumerate()
            .map(|(i, c)| {
                let tag = if c.required {
                    "[required]"
                } else {
                    "[optional]"
                };
                // Only non-default weights are annotated. The trailing space separates the
                // annotation from the criterion text; without it a criterion starting with a
                // digit would read as part of the weight (e.g. `weight=2.05 retries`).
                let weight = if (c.weight - 1.0).abs() > 0.01 {
                    format!(" weight={:.1} ", c.weight)
                } else {
                    String::new()
                };
                format!("{}. {}{}{}", i + 1, tag, weight, c.text)
            })
            .collect::<Vec<_>>()
            .join("\n")
    };

    let details_schema = r#"[{"criterion":"...","passed":bool,"score":0.0-1.0,"feedback":"..."}]"#;

    let skill_instruction = if extract_skill_on_pass {
        "\nIf passed=true and the approach is reusable, add a \"skill\" field:\
         \n{\"name\":\"snake_case\",\"description\":\"one sentence\",\"when_to_use\":\"optional hint\",\"content\":\"markdown body (no frontmatter)\"}"
    } else {
        ""
    };

    // The verdict template is closed (`}}`) *before* the optional skill instruction so the
    // example JSON object is well-formed and the skill hint follows it as separate text.
    let system = Message {
        role: Role::System,
        content: Content::Text(format!(
            "You are an impartial evaluator. Assess whether the agent's output meets the goal and criteria.\n\
             [required] criteria must ALL pass for overall passed=true.\n\
             [optional] criteria contribute to overall_score but do not block passing.\n\
             Respond with JSON only:\n\
             {{\"passed\":bool,\"overall_score\":0.0-1.0,\"feedback\":\"concise summary\",\
             \"details\":{details_schema}}}{skill_instruction}"
        )),
        tool_calls: vec![],
        token_count: None,
    };

    let user = Message {
        role: Role::User,
        content: Content::Text(format!(
            "## Goal\n{goal}\n\n## Criteria\n{criteria_text}\n\n## Agent Output (attempt {attempt})\n{result}"
        )),
        tool_calls: vec![],
        token_count: None,
    };

    vec![system, user]
}

// ---------------------------------------------------------------------------
// Verdict output schema (for the gen_eval workflow template's eval node)
// ---------------------------------------------------------------------------

/// JSON Schema describing the verdict object an eval node has to emit.
///
/// It serves as the `output_schema` of the eval node in the
/// [`crate::orchestration::workflow::gen_eval`] template, letting the SDK both instruct the
/// model and validate its reply. The fields mirror what [`parse_verdict`] reads. The optional
/// `skill` property is present only when `extract_skill_on_pass` is true.
pub fn verdict_output_schema(extract_skill_on_pass: bool) -> serde_json::Value {
    // Start from the complete skill-less schema, then graft the `skill` property on demand.
    let mut schema = serde_json::json!({
        "type": "object",
        "required": ["passed", "overall_score", "feedback"],
        "properties": {
            "passed": { "type": "boolean", "description": "true iff all [required] criteria pass" },
            "overall_score": { "type": "number", "minimum": 0.0, "maximum": 1.0 },
            "feedback": { "type": "string", "description": "concise summary; on fail, what to fix next attempt" },
            "details": {
                "type": "array",
                "items": {
                    "type": "object",
                    "required": ["criterion", "passed", "score", "feedback"],
                    "properties": {
                        "criterion": { "type": "string" },
                        "passed": { "type": "boolean" },
                        "score": { "type": "number", "minimum": 0.0, "maximum": 1.0 },
                        "feedback": { "type": "string" }
                    }
                }
            }
        }
    });

    if extract_skill_on_pass {
        let skill_schema = serde_json::json!({
            "type": "object",
            "description": "optional reusable skill distilled from a passing run",
            "required": ["name", "description", "content"],
            "properties": {
                "name": { "type": "string", "description": "snake_case" },
                "description": { "type": "string" },
                "when_to_use": { "type": "string" },
                "content": { "type": "string", "description": "markdown body, no frontmatter" }
            }
        });
        // Index assignment on an object value inserts the key when it is absent.
        schema["properties"]["skill"] = skill_schema;
    }

    schema
}

// ---------------------------------------------------------------------------
// Response parser
// ---------------------------------------------------------------------------

/// Parse an eval LLM's JSON response into a structured [`EvalResult`]. Tolerant of markdown fences
/// and missing fields (defaults: `passed=false`, score derived from `passed`).
pub fn parse_verdict(content: &str) -> EvalResult {
    let json_str = extract_json(content);
    let v: serde_json::Value = serde_json::from_str(json_str).unwrap_or(serde_json::Value::Null);

    let passed = v.get("passed").and_then(|x| x.as_bool()).unwrap_or(false);
    let overall_score = v
        .get("overall_score")
        .and_then(|x| x.as_f64())
        .map(|f| f as f32)
        .unwrap_or(if passed { 1.0 } else { 0.0 });
    let feedback = v
        .get("feedback")
        .and_then(|x| x.as_str())
        .unwrap_or("No feedback provided.")
        .to_string();

    let details = v
        .get("details")
        .and_then(|d| d.as_array())
        .map(|arr| {
            arr.iter()
                .filter_map(|item| {
                    let criterion = item.get("criterion")?.as_str()?.to_string();
                    let item_passed = item
                        .get("passed")
                        .and_then(|x| x.as_bool())
                        .unwrap_or(false);
                    let score = item
                        .get("score")
                        .and_then(|x| x.as_f64())
                        .map(|f| f as f32)
                        .unwrap_or(if item_passed { 1.0 } else { 0.0 });
                    let item_feedback = item
                        .get("feedback")
                        .and_then(|x| x.as_str())
                        .unwrap_or("")
                        .to_string();
                    Some(CriterionResult {
                        criterion,
                        passed: item_passed,
                        score,
                        feedback: item_feedback,
                    })
                })
                .collect()
        })
        .unwrap_or_default();

    let skill_candidate = v.get("skill").and_then(|s| {
        let name = s.get("name")?.as_str()?.to_string();
        let description = s.get("description")?.as_str()?.to_string();
        let content = s.get("content")?.as_str()?.to_string();
        if name.is_empty() {
            return None;
        }
        let when_to_use = s
            .get("when_to_use")
            .and_then(|x| x.as_str())
            .filter(|x| !x.is_empty())
            .map(|x| x.to_string());
        Some(SkillCandidate {
            name,
            description,
            when_to_use,
            content,
        })
    });

    EvalResult {
        passed,
        overall_score,
        feedback,
        details,
        skill_candidate,
    }
}

/// Returns the slice of `s` spanning the first `{` through the last `}` (inclusive).
///
/// This drops markdown fences and any prose surrounding a JSON object. When `s` has no `{`,
/// no `}`, or the last `}` comes before the first `{`, the input is returned unchanged so the
/// caller's JSON decoding fails gracefully instead of this function panicking on a reversed
/// slice range.
fn extract_json(s: &str) -> &str {
    match (s.find('{'), s.rfind('}')) {
        // Both delimiters are single-byte ASCII, so the inclusive range ends on a char boundary.
        (Some(start), Some(end)) if start < end => &s[start..=end],
        _ => s,
    }
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

// Unit tests for the evaluation primitives: prompt assembly, verdict parsing, criterion
// constructors, and the verdict JSON Schema.
#[cfg(test)]
mod tests {
    use super::*;

    // The prompt is a system + user pair; the user message carries the goal, the tagged
    // criterion and the attempt number, and the system message mentions the "skill" field
    // when skill extraction is requested.
    #[test]
    fn build_eval_messages_carries_goal_and_criteria() {
        let msgs = build_eval_messages(
            "Write a function",
            &[Criterion::required("Must handle errors")],
            "fn foo() {}",
            1,
            true,
        );
        assert_eq!(msgs.len(), 2);
        assert!(matches!(msgs[0].role, Role::System));
        let Content::Text(user) = &msgs[1].content else {
            panic!("expected text")
        };
        assert!(user.contains("Write a function"));
        // The tag is rendered flush against the criterion text (no separating space).
        assert!(user.contains("[required]Must handle errors"));
        assert!(user.contains("attempt 1"));
        // skill instruction present when extract_skill_on_pass=true
        let Content::Text(system) = &msgs[0].content else {
            panic!("expected text")
        };
        assert!(system.contains("\"skill\""));
    }

    // With skill extraction off, the skill object template must not appear in the system prompt.
    #[test]
    fn build_eval_messages_omits_skill_instruction_when_disabled() {
        let msgs = build_eval_messages("g", &[], "r", 1, false);
        let Content::Text(system) = &msgs[0].content else {
            panic!("expected text")
        };
        assert!(!system.contains("\"name\":\"snake_case\""));
    }

    // A failing verdict without a "skill" object parses into a failed result with one detail
    // entry and no skill candidate.
    #[test]
    fn parse_verdict_failed_no_skill() {
        let result = parse_verdict(
            r#"{"passed":false,"overall_score":0.2,"feedback":"Missing error handling","details":[{"criterion":"Must handle errors","passed":false,"score":0.2,"feedback":"No error handling found"}]}"#,
        );
        assert!(!result.passed);
        assert_eq!(result.feedback, "Missing error handling");
        assert_eq!(result.details.len(), 1);
        assert!(!result.details[0].passed);
        assert!(result.skill_candidate.is_none());
    }

    // A passing verdict with a complete "skill" object yields both the details and the skill
    // candidate (here without the optional when_to_use hint).
    #[test]
    fn parse_verdict_passed_with_skill_and_details() {
        let json = r#"{"passed":true,"overall_score":0.95,"feedback":"All criteria met","details":[{"criterion":"Must handle errors","passed":true,"score":1.0,"feedback":"Good error handling"}],"skill":{"name":"robust_api_call","description":"How to call APIs with retries","content":"Robust API Call - Always retry on 5xx."}}"#;
        let result = parse_verdict(json);
        assert!(result.passed);
        assert!(result.overall_score > 0.9);
        assert_eq!(result.details.len(), 1);
        assert!(result.details[0].passed);
        let skill = result.skill_candidate.unwrap();
        assert_eq!(skill.name, "robust_api_call");
        assert!(skill.content.contains("retry"));
    }

    // A reply wrapped in a ```json fence is still decoded.
    #[test]
    fn parse_verdict_strips_markdown_fences() {
        let result = parse_verdict("```json\n{\"passed\":true,\"feedback\":\"good\"}\n```");
        assert!(result.passed);
    }

    // Converting a string slice produces a required criterion with the default weight.
    #[test]
    fn criterion_from_string_is_required() {
        let c = Criterion::from("some check");
        assert!(c.required);
        assert!((c.weight - 1.0).abs() < 0.001);
    }

    // `with_weight` overrides the default weight and leaves the optional flag intact.
    #[test]
    fn optional_criterion_with_weight() {
        let c = Criterion::optional("bonus check").with_weight(0.5);
        assert!(!c.required);
        assert!((c.weight - 0.5).abs() < 0.001);
    }

    // The schema is an object schema exposing the verdict properties; "skill" exists only
    // when skill extraction is enabled.
    #[test]
    fn verdict_output_schema_shape() {
        let schema = verdict_output_schema(true);
        assert_eq!(schema["type"], "object");
        assert!(schema["properties"]["passed"].is_object());
        assert!(schema["properties"]["overall_score"].is_object());
        assert!(schema["properties"]["details"].is_object());
        assert!(schema["properties"]["skill"].is_object());
        // skill property dropped when extraction is disabled
        let no_skill = verdict_output_schema(false);
        assert!(no_skill["properties"]["skill"].is_null());
    }
}