//! Orchestration support for the Harn VM (`harn_vm/orchestration.rs`):
//! mutation sessions, tool lifecycle hooks, transcript auto-compaction,
//! and capability/context policies.

1use std::collections::{BTreeMap, BTreeSet};
2use std::path::{Path, PathBuf};
3use std::rc::Rc;
4use std::{cell::RefCell, thread_local};
5
6use serde::{Deserialize, Serialize};
7
8use crate::llm::{extract_llm_options, vm_call_llm_full, vm_value_to_json};
9use crate::value::{VmError, VmValue};
10
/// Current wall-clock time as an RFC 3339 UTC timestamp, e.g.
/// `"2024-01-02T03:04:05Z"`.
///
/// Previously this returned the raw epoch-seconds count as a string despite
/// the `rfc3339` name; it now produces the advertised format.
fn now_rfc3339() -> String {
    use std::time::{SystemTime, UNIX_EPOCH};
    let ts = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .unwrap_or_default()
        .as_secs();
    unix_secs_to_rfc3339(ts)
}

/// Convert seconds since the Unix epoch to `YYYY-MM-DDTHH:MM:SSZ` (UTC).
///
/// Uses Howard Hinnant's civil-from-days algorithm; exact for the full
/// non-negative range used here.
fn unix_secs_to_rfc3339(ts: u64) -> String {
    let secs_of_day = ts % 86_400;
    let (hh, mm, ss) = (secs_of_day / 3_600, (secs_of_day / 60) % 60, secs_of_day % 60);
    // Shift the day count so the era starts on 0000-03-01 (leap day last).
    let z = (ts / 86_400) as i64 + 719_468;
    let era = z / 146_097;
    let doe = z - era * 146_097; // day-of-era in [0, 146096]
    let yoe = (doe - doe / 1_460 + doe / 36_524 - doe / 146_096) / 365;
    let y = yoe + era * 400;
    let doy = doe - (365 * yoe + yoe / 4 - yoe / 100);
    let mp = (5 * doy + 2) / 153; // month index with March = 0
    let d = doy - (153 * mp + 2) / 5 + 1;
    let m = if mp < 10 { mp + 3 } else { mp - 9 };
    let y = if m <= 2 { y + 1 } else { y };
    format!("{y:04}-{m:02}-{d:02}T{hh:02}:{mm:02}:{ss:02}Z")
}
19
20fn new_id(prefix: &str) -> String {
21    format!("{prefix}_{}", uuid::Uuid::now_v7())
22}
23
/// Directory for run artifacts: `$HARN_RUN_DIR` when set, else `.harn-runs`.
fn default_run_dir() -> PathBuf {
    match std::env::var("HARN_RUN_DIR") {
        Ok(dir) => PathBuf::from(dir),
        Err(_) => PathBuf::from(".harn-runs"),
    }
}
29
// Per-thread orchestration state: the active capability-policy stack, the
// registered tool lifecycle hooks, and the current mutation session (if any).
// Thread-local (not global) because the hook callbacks are `Rc`-based and
// therefore not `Send`.
thread_local! {
    static EXECUTION_POLICY_STACK: RefCell<Vec<CapabilityPolicy>> = const { RefCell::new(Vec::new()) };
    static TOOL_HOOKS: RefCell<Vec<ToolHook>> = const { RefCell::new(Vec::new()) };
    static CURRENT_MUTATION_SESSION: RefCell<Option<MutationSessionRecord>> = const { RefCell::new(None) };
}
35
/// Identity and mutation-permission metadata for the current VM session.
/// `#[serde(default)]` lets every field be omitted when deserializing.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct MutationSessionRecord {
    // Unique session id; `normalize` fills it with a fresh `session_*` id when empty.
    pub session_id: String,
    // Id of the spawning session, if this is a child session.
    pub parent_session_id: Option<String>,
    // Orchestration run this session belongs to, if any.
    pub run_id: Option<String>,
    // Worker executing this session, if any.
    pub worker_id: Option<String>,
    // Kind of execution context (free-form string — TODO confirm valid values).
    pub execution_kind: Option<String>,
    // Mutation scope; `normalize` defaults it to "read_only".
    pub mutation_scope: String,
    // Approval mode; `normalize` defaults it to "host_enforced".
    pub approval_mode: String,
}
47
48impl MutationSessionRecord {
49    pub fn normalize(mut self) -> Self {
50        if self.session_id.is_empty() {
51            self.session_id = new_id("session");
52        }
53        if self.mutation_scope.is_empty() {
54            self.mutation_scope = "read_only".to_string();
55        }
56        if self.approval_mode.is_empty() {
57            self.approval_mode = "host_enforced".to_string();
58        }
59        self
60    }
61}
62
63pub fn install_current_mutation_session(session: Option<MutationSessionRecord>) {
64    CURRENT_MUTATION_SESSION.with(|slot| {
65        *slot.borrow_mut() = session.map(MutationSessionRecord::normalize);
66    });
67}
68
69pub fn current_mutation_session() -> Option<MutationSessionRecord> {
70    CURRENT_MUTATION_SESSION.with(|slot| slot.borrow().clone())
71}
72
73// ── Tool lifecycle hooks ──────────────────────────────────────────────
74
/// Action returned by a PreToolUse hook.
///
/// When several hooks match, `Modify` feeds the rewritten arguments into the
/// next hook in registration order; `Deny` short-circuits the chain.
#[derive(Clone, Debug)]
pub enum PreToolAction {
    /// Allow the tool call to proceed unchanged.
    Allow,
    /// Deny the tool call with an explanation.
    Deny(String),
    /// Allow but replace the arguments.
    Modify(serde_json::Value),
}
85
/// Action returned by a PostToolUse hook.
///
/// With multiple matching hooks, each `Modify` replaces the text seen by the
/// next hook in registration order.
#[derive(Clone, Debug)]
pub enum PostToolAction {
    /// Pass the result through unchanged.
    Pass,
    /// Replace the result text.
    Modify(String),
}
94
/// Callback types for tool lifecycle hooks.
/// Pre hook: `(tool_name, args) -> PreToolAction`.
pub type PreToolHookFn = Rc<dyn Fn(&str, &serde_json::Value) -> PreToolAction>;
/// Post hook: `(tool_name, result_text) -> PostToolAction`.
pub type PostToolHookFn = Rc<dyn Fn(&str, &str) -> PostToolAction>;
98
/// A registered tool hook with a name pattern and callbacks.
///
/// Callbacks are `Rc`-based, so hooks live in thread-local storage and are
/// not shared across threads.
#[derive(Clone)]
pub struct ToolHook {
    /// Glob-style pattern matched against tool names (e.g. `"*"`, `"exec*"`, `"read_file"`).
    pub pattern: String,
    /// Called before tool execution. Return `Deny` to reject, `Modify` to rewrite args.
    pub pre: Option<PreToolHookFn>,
    /// Called after tool execution with the result text. Return `Modify` to rewrite.
    pub post: Option<PostToolHookFn>,
}
109
110impl std::fmt::Debug for ToolHook {
111    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
112        f.debug_struct("ToolHook")
113            .field("pattern", &self.pattern)
114            .field("has_pre", &self.pre.is_some())
115            .field("has_post", &self.post.is_some())
116            .finish()
117    }
118}
119
/// Minimal glob matching for tool-name patterns.
///
/// Supported forms: `"*"` (everything), `"prefix*"`, `"*suffix"`,
/// `"*substring*"`, and exact equality. The previous version mishandled
/// `"*substring*"`: `strip_suffix('*')` matched first, leaving a literal
/// `*` at the start of the prefix so such patterns never matched anything.
fn glob_match(pattern: &str, name: &str) -> bool {
    if pattern == "*" {
        return true;
    }
    // "*substring*" — check first so neither single-star branch swallows it.
    if let Some(inner) = pattern
        .strip_prefix('*')
        .and_then(|rest| rest.strip_suffix('*'))
    {
        if !inner.is_empty() {
            return name.contains(inner);
        }
    }
    if let Some(prefix) = pattern.strip_suffix('*') {
        return name.starts_with(prefix);
    }
    if let Some(suffix) = pattern.strip_prefix('*') {
        return name.ends_with(suffix);
    }
    pattern == name
}
132
133pub fn register_tool_hook(hook: ToolHook) {
134    TOOL_HOOKS.with(|hooks| hooks.borrow_mut().push(hook));
135}
136
137pub fn clear_tool_hooks() {
138    TOOL_HOOKS.with(|hooks| hooks.borrow_mut().clear());
139}
140
141/// Run all matching PreToolUse hooks. Returns the final action.
142pub fn run_pre_tool_hooks(tool_name: &str, args: &serde_json::Value) -> PreToolAction {
143    TOOL_HOOKS.with(|hooks| {
144        let hooks = hooks.borrow();
145        let mut current_args = args.clone();
146        for hook in hooks.iter() {
147            if !glob_match(&hook.pattern, tool_name) {
148                continue;
149            }
150            if let Some(ref pre) = hook.pre {
151                match pre(tool_name, &current_args) {
152                    PreToolAction::Allow => {}
153                    PreToolAction::Deny(reason) => return PreToolAction::Deny(reason),
154                    PreToolAction::Modify(new_args) => {
155                        current_args = new_args;
156                    }
157                }
158            }
159        }
160        if current_args != *args {
161            PreToolAction::Modify(current_args)
162        } else {
163            PreToolAction::Allow
164        }
165    })
166}
167
168/// Run all matching PostToolUse hooks. Returns the (possibly modified) result.
169pub fn run_post_tool_hooks(tool_name: &str, result: &str) -> String {
170    TOOL_HOOKS.with(|hooks| {
171        let hooks = hooks.borrow();
172        let mut current = result.to_string();
173        for hook in hooks.iter() {
174            if !glob_match(&hook.pattern, tool_name) {
175                continue;
176            }
177            if let Some(ref post) = hook.post {
178                match post(tool_name, &current) {
179                    PostToolAction::Pass => {}
180                    PostToolAction::Modify(new_result) => {
181                        current = new_result;
182                    }
183                }
184            }
185        }
186        current
187    })
188}
189
190// ── Auto-compaction ───────────────────────────────────────────────────
191
/// How archived transcript messages are summarized during auto-compaction.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum CompactStrategy {
    // Ask the LLM to summarize the archived messages.
    Llm,
    // Mechanically clip and list the archived messages (no model call).
    Truncate,
    // Invoke a user-supplied Harn closure to produce the summary.
    Custom,
}
198
199pub fn parse_compact_strategy(value: &str) -> Result<CompactStrategy, VmError> {
200    match value {
201        "llm" => Ok(CompactStrategy::Llm),
202        "truncate" => Ok(CompactStrategy::Truncate),
203        "custom" => Ok(CompactStrategy::Custom),
204        other => Err(VmError::Runtime(format!(
205            "unknown compact_strategy '{other}' (expected 'llm', 'truncate', or 'custom')"
206        ))),
207    }
208}
209
/// Configuration for automatic transcript compaction in agent loops.
/// Concrete default values live in the `Default` impl below.
#[derive(Clone, Debug)]
pub struct AutoCompactConfig {
    /// Maximum estimated tokens before triggering auto-compaction.
    pub token_threshold: usize,
    /// Maximum character length for a single tool result before microcompaction.
    pub tool_output_max_chars: usize,
    /// Number of recent messages to keep during auto-compaction.
    pub keep_last: usize,
    /// Strategy used to summarize archived messages.
    pub compact_strategy: CompactStrategy,
    /// Optional Harn callback used when `compact_strategy` is `custom`.
    pub custom_compactor: Option<VmValue>,
}
224
impl Default for AutoCompactConfig {
    // Defaults: compact near 80k estimated tokens, microcompact tool outputs
    // above 20k chars, keep the 8 most recent messages, summarize via LLM.
    fn default() -> Self {
        Self {
            token_threshold: 80_000,
            tool_output_max_chars: 20_000,
            keep_last: 8,
            compact_strategy: CompactStrategy::Llm,
            custom_compactor: None,
        }
    }
}
236
237/// Estimate token count from a list of JSON messages (chars / 4 heuristic).
238pub fn estimate_message_tokens(messages: &[serde_json::Value]) -> usize {
239    messages
240        .iter()
241        .map(|m| {
242            m.get("content")
243                .and_then(|c| c.as_str())
244                .map(|s| s.len())
245                .unwrap_or(0)
246        })
247        .sum::<usize>()
248        / 4
249}
250
/// Microcompact a tool result: if it exceeds `max_chars`, keep the first and
/// last portions with a snip marker in between. Lines that look like
/// compiler/test diagnostics are preserved verbatim when the budget allows.
///
/// All slicing happens on UTF-8 character boundaries, so multi-byte content
/// cannot panic (the previous version byte-sliced with `&output[..keep]`).
pub fn microcompact_tool_output(output: &str, max_chars: usize) -> String {
    // Largest index <= i that is a char boundary of `s`.
    fn floor_boundary(s: &str, mut i: usize) -> usize {
        while i > 0 && !s.is_char_boundary(i) {
            i -= 1;
        }
        i
    }
    // Smallest index >= i that is a char boundary of `s`.
    fn ceil_boundary(s: &str, mut i: usize) -> usize {
        while i < s.len() && !s.is_char_boundary(i) {
            i += 1;
        }
        i
    }
    // Below 200 chars the head/tail split is useless noise; return as-is.
    if output.len() <= max_chars || max_chars < 200 {
        return output.to_string();
    }
    let diagnostic_lines = output
        .lines()
        .filter(|line| {
            let trimmed = line.trim();
            let lower = trimmed.to_lowercase();
            // file:line pattern (e.g. "src/main.rs:42:" or "foo.go:10:5:")
            let has_file_line = {
                let bytes = trimmed.as_bytes();
                let mut i = 0;
                let mut found_colon = false;
                while i < bytes.len() {
                    if bytes[i] == b':' {
                        found_colon = true;
                        break;
                    }
                    i += 1;
                }
                found_colon && i + 1 < bytes.len() && bytes[i + 1].is_ascii_digit()
            };
            // keyword-based diagnostics
            let has_keyword = trimmed.contains("error")
                || trimmed.contains("FAIL")
                || trimmed.contains("panic")
                || trimmed.contains("undefined")
                || trimmed.contains("expected")
                || trimmed.contains("got")
                || lower.contains("cannot find")
                || lower.contains("not found")
                || lower.contains("no such")
                || lower.contains("unresolved")
                || lower.contains("missing")
                || lower.contains("declared but not used")
                || lower.contains("unused")
                || lower.contains("mismatch");
            let positional = lower.contains(" error ")
                || lower.starts_with("error:")
                || lower.starts_with("fail")
                || lower.contains("panic:");
            (has_file_line && has_keyword) || positional
        })
        .take(32)
        .collect::<Vec<_>>();
    if !diagnostic_lines.is_empty() {
        let diagnostics = diagnostic_lines.join("\n");
        // Reserve room for the diagnostics plus the marker text (~64 chars).
        let budget = max_chars.saturating_sub(diagnostics.len() + 64);
        let keep = budget / 2;
        if keep >= 80 && output.len() > keep * 2 {
            let head = &output[..floor_boundary(output, keep)];
            let tail = &output[ceil_boundary(output, output.len() - keep)..];
            return format!(
                "{head}\n\n[diagnostic lines preserved]\n{diagnostics}\n\n[... output compacted ...]\n\n{tail}"
            );
        }
    }
    // Plain head/tail split with an accurate snipped-character count
    // (previously reported `len - max_chars` regardless of what was kept).
    let keep = max_chars / 2;
    let head = &output[..floor_boundary(output, keep)];
    let tail = &output[ceil_boundary(output, output.len() - keep)..];
    let snipped = output.len() - head.len() - tail.len();
    format!("{head}\n\n[... {snipped} characters snipped ...]\n\n{tail}")
}
317
318fn format_compaction_messages(messages: &[serde_json::Value]) -> String {
319    messages
320        .iter()
321        .map(|msg| {
322            let role = msg
323                .get("role")
324                .and_then(|v| v.as_str())
325                .unwrap_or("user")
326                .to_uppercase();
327            let content = msg
328                .get("content")
329                .and_then(|v| v.as_str())
330                .unwrap_or_default();
331            format!("{role}: {content}")
332        })
333        .collect::<Vec<_>>()
334        .join("\n")
335}
336
337fn truncate_compaction_summary(
338    old_messages: &[serde_json::Value],
339    archived_count: usize,
340) -> String {
341    let summary_parts: Vec<String> = old_messages
342        .iter()
343        .filter_map(|m| {
344            let role = m.get("role")?.as_str()?;
345            let content = m.get("content")?.as_str()?;
346            if content.is_empty() {
347                return None;
348            }
349            let truncated = if content.len() > 500 {
350                format!("{}...", &content[..500])
351            } else {
352                content.to_string()
353            };
354            Some(format!("[{role}] {truncated}"))
355        })
356        .take(15)
357        .collect();
358    format!(
359        "[auto-compacted {archived_count} older messages]\n{}{}",
360        summary_parts.join("\n"),
361        if archived_count > 15 {
362            format!("\n... and {} more", archived_count - 15)
363        } else {
364            String::new()
365        }
366    )
367}
368
369fn compact_summary_text_from_value(value: &VmValue) -> Result<String, VmError> {
370    if let Some(map) = value.as_dict() {
371        if let Some(summary) = map.get("summary").or_else(|| map.get("text")) {
372            return Ok(summary.display());
373        }
374    }
375    match value {
376        VmValue::String(text) => Ok(text.to_string()),
377        VmValue::Nil => Ok(String::new()),
378        _ => serde_json::to_string_pretty(&vm_value_to_json(value))
379            .map_err(|e| VmError::Runtime(format!("custom compactor encode error: {e}"))),
380    }
381}
382
/// Summarize archived messages with a one-shot LLM call derived from the
/// active call options. Falls back to `truncate_compaction_summary` when
/// the model returns empty text.
async fn llm_compaction_summary(
    old_messages: &[serde_json::Value],
    archived_count: usize,
    llm_opts: &crate::llm::api::LlmCallOptions,
) -> Result<String, VmError> {
    let mut compact_opts = llm_opts.clone();
    let formatted = format_compaction_messages(old_messages);
    // Strip everything conversation- or tool-specific from the cloned options
    // so the compaction call is a clean, standalone summarization request.
    compact_opts.system = None;
    compact_opts.transcript_id = None;
    compact_opts.transcript_summary = None;
    compact_opts.transcript_metadata = None;
    compact_opts.native_tools = None;
    compact_opts.tool_choice = None;
    compact_opts.response_format = None;
    compact_opts.json_schema = None;
    compact_opts.messages = vec![serde_json::json!({
        "role": "user",
        "content": format!(
            "Summarize these archived conversation messages for a follow-on coding agent. Preserve goals, constraints, decisions, completed tool work, unresolved issues, and next actions. Output only the summary text.\n\nArchived message count: {archived_count}\n\nConversation:\n{formatted}"
        ),
    })];
    let result = vm_call_llm_full(&compact_opts).await?;
    let summary = result.text.trim();
    if summary.is_empty() {
        // Model produced nothing useful; use the mechanical fallback.
        Ok(truncate_compaction_summary(old_messages, archived_count))
    } else {
        Ok(format!(
            "[auto-compacted {archived_count} older messages]\n{summary}"
        ))
    }
}
414
415async fn custom_compaction_summary(
416    old_messages: &[serde_json::Value],
417    archived_count: usize,
418    callback: &VmValue,
419) -> Result<String, VmError> {
420    let Some(VmValue::Closure(closure)) = Some(callback.clone()) else {
421        return Err(VmError::Runtime(
422            "compact_callback must be a closure when compact_strategy is 'custom'".to_string(),
423        ));
424    };
425    let mut vm = crate::vm::take_async_builtin_child_vm().ok_or_else(|| {
426        VmError::Runtime(
427            "custom transcript compaction requires an async builtin VM context".to_string(),
428        )
429    })?;
430    let messages_vm = VmValue::List(Rc::new(
431        old_messages
432            .iter()
433            .map(crate::stdlib::json_to_vm_value)
434            .collect(),
435    ));
436    let result = vm.call_closure_pub(&closure, &[messages_vm], &[]).await;
437    crate::vm::restore_async_builtin_child_vm(vm);
438    let summary = compact_summary_text_from_value(&result?)?;
439    if summary.trim().is_empty() {
440        Ok(truncate_compaction_summary(old_messages, archived_count))
441    } else {
442        Ok(format!(
443            "[auto-compacted {archived_count} older messages]\n{summary}"
444        ))
445    }
446}
447
448/// Auto-compact a message list in place: summarize older messages into a
449/// note and keep the most recent `keep_last` messages.
450pub(crate) async fn auto_compact_messages(
451    messages: &mut Vec<serde_json::Value>,
452    config: &AutoCompactConfig,
453    llm_opts: Option<&crate::llm::api::LlmCallOptions>,
454) -> Result<bool, VmError> {
455    if messages.len() <= config.keep_last {
456        return Ok(false);
457    }
458    let split_at = messages.len().saturating_sub(config.keep_last);
459    let old_messages: Vec<_> = messages.drain(..split_at).collect();
460    let archived_count = old_messages.len();
461    let summary = match config.compact_strategy {
462        CompactStrategy::Truncate => truncate_compaction_summary(&old_messages, archived_count),
463        CompactStrategy::Llm => {
464            llm_compaction_summary(
465                &old_messages,
466                archived_count,
467                llm_opts.ok_or_else(|| {
468                    VmError::Runtime(
469                        "LLM transcript compaction requires active LLM call options".to_string(),
470                    )
471                })?,
472            )
473            .await?
474        }
475        CompactStrategy::Custom => {
476            custom_compaction_summary(
477                &old_messages,
478                archived_count,
479                config.custom_compactor.as_ref().ok_or_else(|| {
480                    VmError::Runtime(
481                        "compact_callback is required when compact_strategy is 'custom'"
482                            .to_string(),
483                    )
484                })?,
485            )
486            .await?
487        }
488    };
489    messages.insert(
490        0,
491        serde_json::json!({
492            "role": "user",
493            "content": summary,
494        }),
495    );
496    Ok(true)
497}
498
499// ── Adaptive context assembly ─────────────────────────────────────────
500
501/// Snip an artifact's text to fit within a token budget.
502pub fn microcompact_artifact(artifact: &mut ArtifactRecord, max_tokens: usize) {
503    let max_chars = max_tokens * 4;
504    if let Some(ref text) = artifact.text {
505        if text.len() > max_chars && max_chars >= 200 {
506            artifact.text = Some(microcompact_tool_output(text, max_chars));
507            artifact.estimated_tokens = Some(max_tokens);
508        }
509    }
510}
511
/// Deduplicate artifacts by removing those with identical text content.
/// The FIRST occurrence in input order is kept; artifacts with empty or
/// missing text are always retained.
/// NOTE(review): a previous doc claimed the higher-priority duplicate is
/// kept — the code keeps the earliest one, so callers wanting priority-based
/// retention must sort by priority first (confirm upstream ordering).
pub fn dedup_artifacts(artifacts: &mut Vec<ArtifactRecord>) {
    let mut seen_hashes: BTreeSet<u64> = BTreeSet::new();
    artifacts.retain(|artifact| {
        let text = artifact.text.as_deref().unwrap_or("");
        if text.is_empty() {
            return true;
        }
        // Simple hash for dedup (DefaultHasher; collisions theoretically
        // possible but acceptable here).
        let hash = {
            use std::hash::{Hash, Hasher};
            let mut hasher = std::collections::hash_map::DefaultHasher::new();
            text.hash(&mut hasher);
            hasher.finish()
        };
        // `insert` returns false for an already-seen hash -> drop duplicate.
        seen_hashes.insert(hash)
    });
}
531
532/// Enhanced artifact selection: dedup, microcompact oversized artifacts,
533/// then delegate to the standard `select_artifacts`.
534pub fn select_artifacts_adaptive(
535    mut artifacts: Vec<ArtifactRecord>,
536    policy: &ContextPolicy,
537) -> Vec<ArtifactRecord> {
538    // Phase 1: deduplicate
539    dedup_artifacts(&mut artifacts);
540
541    // Phase 2: microcompact oversized artifacts relative to budget.
542    // Cap individual artifacts to a fraction of the total budget, but don't
543    // let the per-artifact cap exceed the total budget (avoid overrun).
544    if let Some(max_tokens) = policy.max_tokens {
545        let count = artifacts.len().max(1);
546        let per_artifact_budget = max_tokens / count;
547        // Floor of 500 tokens, but never more than total budget
548        let cap = per_artifact_budget.max(500).min(max_tokens);
549        for artifact in &mut artifacts {
550            let est = artifact.estimated_tokens.unwrap_or(0);
551            if est > cap * 2 {
552                microcompact_artifact(artifact, cap);
553            }
554        }
555    }
556
557    // Phase 3: standard selection with budget
558    select_artifacts(artifacts, policy)
559}
560
561// ── Per-agent policy with argument patterns ───────────────────────────
562
/// Extended policy that supports argument-level constraints.
/// Evaluated by `enforce_tool_arg_constraints` against the first
/// string-valued argument of a tool call.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ToolArgConstraint {
    /// Tool name to constrain (glob-matched against the called tool's name).
    pub tool: String,
    /// Glob patterns that the first string argument must match.
    /// If empty, no argument constraint is applied.
    pub arg_patterns: Vec<String>,
}
573
574/// Check if a tool call satisfies argument constraints in the policy.
575pub fn enforce_tool_arg_constraints(
576    policy: &CapabilityPolicy,
577    tool_name: &str,
578    args: &serde_json::Value,
579) -> Result<(), VmError> {
580    for constraint in &policy.tool_arg_constraints {
581        if !glob_match(&constraint.tool, tool_name) {
582            continue;
583        }
584        if constraint.arg_patterns.is_empty() {
585            continue;
586        }
587        // Extract the first string-like argument for pattern matching
588        let first_arg = args
589            .as_object()
590            .and_then(|o| o.values().next())
591            .and_then(|v| v.as_str())
592            .or_else(|| args.as_str())
593            .unwrap_or("");
594        let matches = constraint
595            .arg_patterns
596            .iter()
597            .any(|pattern| glob_match(pattern, first_arg));
598        if !matches {
599            return reject_policy(format!(
600                "tool '{tool_name}' argument '{first_arg}' does not match allowed patterns: {:?}",
601                constraint.arg_patterns
602            ));
603        }
604    }
605    Ok(())
606}
607
/// Normalize an artifact kind name: map short aliases to their canonical
/// forms, turn blank kinds into "artifact", and pass everything else through
/// unchanged.
///
/// Canonical kinds include: resource, workspace_file, editor_selection,
/// workspace_snapshot, transcript_summary, summary, plan, diff, git_diff,
/// patch, patch_set, patch_proposal, diff_review, review_decision,
/// verification_bundle, apply_intent, verification_result, test_result,
/// command_result, provider_payload, worker_result, worker_notification,
/// artifact — all of which pass through as-is, like any other non-blank kind.
fn normalize_artifact_kind(kind: &str) -> String {
    let canonical = match kind {
        "file" => "workspace_file",
        "transcript" => "transcript_summary",
        "verification" => "verification_result",
        "test" => "test_result",
        blank if blank.trim().is_empty() => "artifact",
        other => other,
    };
    canonical.to_string()
}
641
/// Default selection priority for an artifact kind (higher = kept first).
/// Verification and test results rank highest, diffs/patches next, then
/// plans, workspace context, summaries, and command output; anything
/// unrecognized gets the baseline 40.
fn default_artifact_priority(kind: &str) -> i64 {
    if matches!(kind, "verification_result" | "test_result") {
        return 100;
    }
    if kind == "verification_bundle" {
        return 95;
    }
    let diff_like = [
        "diff",
        "git_diff",
        "patch",
        "patch_set",
        "patch_proposal",
        "diff_review",
        "review_decision",
        "apply_intent",
    ];
    if diff_like.contains(&kind) {
        return 90;
    }
    if kind == "plan" {
        return 80;
    }
    if matches!(
        kind,
        "workspace_file" | "workspace_snapshot" | "editor_selection" | "resource"
    ) {
        return 70;
    }
    if matches!(kind, "summary" | "transcript_summary") {
        return 60;
    }
    if kind == "command_result" {
        return 50;
    }
    40
}
655
/// Rank a freshness label for sorting: fresh/live (3) > recent (2) >
/// unknown/other (1) > stale (0). `None` counts as unknown.
fn freshness_rank(value: Option<&str>) -> i64 {
    let label = value.unwrap_or("");
    if label == "fresh" || label == "live" {
        3
    } else if label == "recent" {
        2
    } else if label == "stale" {
        0
    } else {
        1
    }
}
664
/// Per-tool runtime metadata carried alongside a capability policy.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ToolRuntimePolicyMetadata {
    // Capability name -> allowed operations for this tool.
    pub capabilities: BTreeMap<String, Vec<String>>,
    // Side-effect level label (same vocabulary as `min_side_effect`:
    // none/read_only/workspace_write/process_exec/network — TODO confirm).
    pub side_effect_level: Option<String>,
    // Names of tool parameters that contain filesystem paths (presumably for
    // workspace-root enforcement — verify against callers).
    pub path_params: Vec<String>,
    // Mutation classification label, if assigned (free-form — TODO confirm).
    pub mutation_classification: Option<String>,
}
673
/// A capability policy: which tools, capability operations, and workspace
/// roots an execution may use. Empty collections mean "unrestricted" when
/// intersecting (see `intersect`).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct CapabilityPolicy {
    // Allowed tool names; empty = no tool ceiling.
    pub tools: Vec<String>,
    // Capability name -> allowed operations; empty map = no capability ceiling.
    pub capabilities: BTreeMap<String, Vec<String>>,
    // Filesystem roots the execution may touch; empty = no root ceiling.
    pub workspace_roots: Vec<String>,
    // Maximum side-effect level (vocabulary of `min_side_effect`).
    pub side_effect_level: Option<String>,
    // Maximum sub-execution recursion depth, if limited.
    pub recursion_limit: Option<usize>,
    /// Argument-level constraints for specific tools.
    #[serde(default)]
    pub tool_arg_constraints: Vec<ToolArgConstraint>,
    // Per-tool runtime metadata, keyed by tool name.
    #[serde(default)]
    pub tool_metadata: BTreeMap<String, ToolRuntimePolicyMetadata>,
}
688
impl CapabilityPolicy {
    /// Intersect this policy (the host ceiling) with a requested policy,
    /// producing the effective policy or an error when the request exceeds
    /// the ceiling.
    ///
    /// Conventions: an empty `tools`/`capabilities`/`workspace_roots` on
    /// either side means "unrestricted" for that axis, so the other side's
    /// value wins; non-empty on both sides intersects. A request that names
    /// tools or capability operations outside a non-empty ceiling is an
    /// error rather than being silently narrowed.
    pub fn intersect(&self, requested: &CapabilityPolicy) -> Result<CapabilityPolicy, String> {
        // Side-effect ceiling: take the more restrictive of the two levels.
        let side_effect_level = match (&self.side_effect_level, &requested.side_effect_level) {
            (Some(a), Some(b)) => Some(min_side_effect(a, b).to_string()),
            (Some(a), None) => Some(a.clone()),
            (None, Some(b)) => Some(b.clone()),
            (None, None) => None,
        };

        // Reject requested tools not present in a non-empty host ceiling.
        if !self.tools.is_empty() {
            let denied: Vec<String> = requested
                .tools
                .iter()
                .filter(|tool| !self.tools.contains(*tool))
                .cloned()
                .collect();
            if !denied.is_empty() {
                return Err(format!(
                    "requested tools exceed host ceiling: {}",
                    denied.join(", ")
                ));
            }
        }

        // Reject requested capability operations outside the host ceiling.
        for (capability, requested_ops) in &requested.capabilities {
            if let Some(allowed_ops) = self.capabilities.get(capability) {
                let denied: Vec<String> = requested_ops
                    .iter()
                    .filter(|op| !allowed_ops.contains(*op))
                    .cloned()
                    .collect();
                if !denied.is_empty() {
                    return Err(format!(
                        "requested capability operations exceed host ceiling: {}.{}",
                        capability,
                        denied.join(",")
                    ));
                }
            } else if !self.capabilities.is_empty() {
                // Non-empty ceiling without this capability => denied outright.
                return Err(format!(
                    "requested capability exceeds host ceiling: {capability}"
                ));
            }
        }

        // Effective tools: empty side defers to the other; otherwise intersect.
        let tools = if self.tools.is_empty() {
            requested.tools.clone()
        } else if requested.tools.is_empty() {
            self.tools.clone()
        } else {
            requested
                .tools
                .iter()
                .filter(|tool| self.tools.contains(*tool))
                .cloned()
                .collect()
        };

        // Effective capabilities: same empty-defers rule, intersecting the
        // per-capability operation lists.
        let capabilities = if self.capabilities.is_empty() {
            requested.capabilities.clone()
        } else if requested.capabilities.is_empty() {
            self.capabilities.clone()
        } else {
            requested
                .capabilities
                .iter()
                .filter_map(|(capability, requested_ops)| {
                    self.capabilities.get(capability).map(|allowed_ops| {
                        (
                            capability.clone(),
                            requested_ops
                                .iter()
                                .filter(|op| allowed_ops.contains(*op))
                                .cloned()
                                .collect::<Vec<_>>(),
                        )
                    })
                })
                .collect()
        };

        // Effective workspace roots: same empty-defers rule. Note this is
        // exact string intersection, not path-prefix containment.
        let workspace_roots = if self.workspace_roots.is_empty() {
            requested.workspace_roots.clone()
        } else if requested.workspace_roots.is_empty() {
            self.workspace_roots.clone()
        } else {
            requested
                .workspace_roots
                .iter()
                .filter(|root| self.workspace_roots.contains(*root))
                .cloned()
                .collect()
        };

        // Recursion limit: the smaller of the two when both are set.
        let recursion_limit = match (self.recursion_limit, requested.recursion_limit) {
            (Some(a), Some(b)) => Some(a.min(b)),
            (Some(a), None) => Some(a),
            (None, Some(b)) => Some(b),
            (None, None) => None,
        };

        // Merge arg constraints from both sides
        let mut tool_arg_constraints = self.tool_arg_constraints.clone();
        tool_arg_constraints.extend(requested.tool_arg_constraints.clone());

        // Tool metadata for the effective tool set, preferring the requested
        // side's entry over the host's when both exist.
        let tool_metadata = tools
            .iter()
            .filter_map(|tool| {
                requested
                    .tool_metadata
                    .get(tool)
                    .or_else(|| self.tool_metadata.get(tool))
                    .cloned()
                    .map(|metadata| (tool.clone(), metadata))
            })
            .collect();

        Ok(CapabilityPolicy {
            tools,
            capabilities,
            workspace_roots,
            side_effect_level,
            recursion_limit,
            tool_arg_constraints,
            tool_metadata,
        })
    }
}
817
/// Return the more restrictive of two side-effect levels, ordered
/// none < read_only < workspace_write < process_exec < network < unknown.
/// Ties resolve to `a`.
fn min_side_effect<'a>(a: &'a str, b: &'a str) -> &'a str {
    // Position in the known ordering; unknown labels rank last (5).
    fn rank(level: &str) -> usize {
        ["none", "read_only", "workspace_write", "process_exec", "network"]
            .iter()
            .position(|&known| known == level)
            .unwrap_or(5)
    }
    if rank(b) < rank(a) {
        b
    } else {
        a
    }
}
835
/// Model-selection overrides for an execution; every field is optional and
/// deserializes to `None`/default when omitted.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct ModelPolicy {
    // Provider name override (e.g. a vendor id — TODO confirm vocabulary).
    pub provider: Option<String>,
    // Concrete model name override.
    pub model: Option<String>,
    // Abstract tier override (free-form label — TODO confirm valid values).
    pub model_tier: Option<String>,
    // Sampling temperature override.
    pub temperature: Option<f64>,
    // Maximum completion tokens override.
    pub max_tokens: Option<i64>,
}
845
/// How an execution's transcript is recorded, shared, and condensed.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct TranscriptPolicy {
    // Transcript mode label (free-form — TODO confirm valid values).
    pub mode: Option<String>,
    // Who may see the transcript (free-form label — TODO confirm).
    pub visibility: Option<String>,
    // Whether to produce a summary of the transcript.
    pub summarize: bool,
    // Whether to auto-compact the transcript (see auto_compact_messages).
    pub compact: bool,
    // Number of most-recent messages to keep when compacting.
    pub keep_last: Option<usize>,
}
855
/// Budget and filtering rules for assembling artifact context
/// (consumed by `select_artifacts` / `select_artifacts_adaptive`).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ContextPolicy {
    // Maximum number of artifacts to include.
    pub max_artifacts: Option<usize>,
    // Total estimated-token budget for included artifacts.
    pub max_tokens: Option<usize>,
    // Tokens held back from the budget (presumably for the prompt/response —
    // verify against select_artifacts).
    pub reserve_tokens: Option<usize>,
    // Only artifacts of these kinds are eligible (empty = all).
    pub include_kinds: Vec<String>,
    // Artifacts of these kinds are excluded.
    pub exclude_kinds: Vec<String>,
    // Kinds given selection priority.
    pub prioritize_kinds: Vec<String>,
    // Artifact ids that must always be included.
    pub pinned_ids: Vec<String>,
    // Only artifacts from these pipeline stages are eligible (empty = all).
    pub include_stages: Vec<String>,
    // Prefer more recently produced artifacts.
    pub prefer_recent: bool,
    // Prefer artifacts with higher freshness rank (see freshness_rank).
    pub prefer_fresh: bool,
    // Rendering mode for the assembled context (free-form — TODO confirm).
    pub render: Option<String>,
}
871
/// Retry behavior for a stage: attempt budget plus whether to verify the
/// output and attempt repair between retries.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RetryPolicy {
    // Maximum number of attempts (0 default — semantics of 0 vs 1 TODO confirm).
    pub max_attempts: usize,
    // Run verification after each attempt.
    pub verify: bool,
    // Attempt a repair pass when verification fails.
    pub repair: bool,
}
879
/// Declares the artifact kinds a stage consumes/produces plus input-count
/// bounds. `validate_workflow` rejects contracts where `min_inputs`
/// exceeds `max_inputs`.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct StageContract {
    pub input_kinds: Vec<String>,
    pub output_kinds: Vec<String>,
    pub min_inputs: Option<usize>,
    pub max_inputs: Option<usize>,
    pub require_transcript: bool,
    // Optional JSON schema for output validation — enforced outside this chunk.
    pub schema: Option<serde_json::Value>,
}
890
/// Maps logical stage outcomes (success/failure, verify pass/fail,
/// condition truth, loop progress, escalation) to branch labels used on
/// outgoing edges. NOTE(review): the consumer of these mappings is not
/// visible in this chunk — confirm against the executor.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct BranchSemantics {
    pub success: Option<String>,
    pub failure: Option<String>,
    pub verify_pass: Option<String>,
    pub verify_fail: Option<String>,
    pub condition_true: Option<String>,
    pub condition_false: Option<String>,
    pub loop_continue: Option<String>,
    pub loop_exit: Option<String>,
    pub escalation: Option<String>,
}
904
/// Fan-out configuration for `map` nodes. `validate_workflow` requires at
/// least one of `items`, `item_artifact_kind`, or the node's
/// `input_contract.input_kinds` to be set.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct MapPolicy {
    pub items: Vec<serde_json::Value>,
    pub item_artifact_kind: Option<String>,
    // Default output kind for map results; falls back to "artifact"
    // in `normalize_workflow_value`.
    pub output_kind: Option<String>,
    pub max_items: Option<usize>,
}
913
/// Fan-in configuration for `join` nodes. An empty `strategy` is
/// normalized to "all" by `normalize_workflow_value`.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct JoinPolicy {
    pub strategy: String,
    pub require_all_inputs: bool,
    pub min_completed: Option<usize>,
}
921
/// Aggregation configuration for `reduce` nodes. An empty `strategy` is
/// normalized to "concat"; `output_kind` defaults to "summary" when the
/// node's output contract is empty (see `normalize_workflow_value`).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReducePolicy {
    pub strategy: String,
    pub separator: Option<String>,
    pub output_kind: Option<String>,
}
929
/// Routing information for escalation nodes — all fields optional.
/// NOTE(review): `level`/`queue` semantics are defined by the escalation
/// handler, which is outside this chunk.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct EscalationPolicy {
    pub level: Option<String>,
    pub queue: Option<String>,
    pub reason: Option<String>,
}
937
/// A single artifact produced or consumed by workflow stages, serialized
/// with a `_type` tag. `normalize` fills in identity defaults, canonicalizes
/// `kind`, and derives `estimated_tokens`/`priority` when absent.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct ArtifactRecord {
    // Serialized as "_type"; defaults to "artifact" in `normalize`.
    #[serde(rename = "_type")]
    pub type_name: String,
    pub id: String,
    pub kind: String,
    pub title: Option<String>,
    pub text: Option<String>,
    pub data: Option<serde_json::Value>,
    pub source: Option<String>,
    pub created_at: String,
    pub freshness: Option<String>,
    pub priority: Option<i64>,
    // Ids of ancestor artifacts this one was derived from — TODO confirm.
    pub lineage: Vec<String>,
    pub relevance: Option<f64>,
    // Rough size estimate; derived from text length (~4 bytes/token) in `normalize`.
    pub estimated_tokens: Option<usize>,
    pub stage: Option<String>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
958
959impl ArtifactRecord {
960    pub fn normalize(mut self) -> Self {
961        if self.type_name.is_empty() {
962            self.type_name = "artifact".to_string();
963        }
964        if self.id.is_empty() {
965            self.id = new_id("artifact");
966        }
967        if self.created_at.is_empty() {
968            self.created_at = now_rfc3339();
969        }
970        if self.kind.is_empty() {
971            self.kind = "artifact".to_string();
972        }
973        self.kind = normalize_artifact_kind(&self.kind);
974        if self.estimated_tokens.is_none() {
975            self.estimated_tokens = self
976                .text
977                .as_ref()
978                .map(|text| ((text.len() as f64) / 4.0).ceil() as usize);
979        }
980        if self.priority.is_none() {
981            self.priority = Some(default_artifact_priority(&self.kind));
982        }
983        self
984    }
985}
986
/// A node in a workflow graph: an optional prompt plus the full set of
/// per-node policies and contracts. Missing pieces (kind, join/reduce
/// strategies, output contract, retry attempts) are defaulted by
/// `normalize_workflow_value`.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct WorkflowNode {
    // Backfilled with the graph-map key when absent.
    pub id: Option<String>,
    // Node kind, e.g. "stage", "verify", "condition", "fork", "join",
    // "map", "reduce", "escalation" (see `validate_workflow`).
    pub kind: String,
    pub mode: Option<String>,
    pub prompt: Option<String>,
    pub system: Option<String>,
    pub task_label: Option<String>,
    pub done_sentinel: Option<String>,
    // Tool declarations; shape handled by `workflow_tool_names` /
    // `workflow_tool_metadata` (array, single object, or tool_registry).
    pub tools: serde_json::Value,
    pub model_policy: ModelPolicy,
    pub transcript_policy: TranscriptPolicy,
    pub context_policy: ContextPolicy,
    pub retry_policy: RetryPolicy,
    pub capability_policy: CapabilityPolicy,
    pub input_contract: StageContract,
    pub output_contract: StageContract,
    pub branch_semantics: BranchSemantics,
    pub map_policy: MapPolicy,
    pub join_policy: JoinPolicy,
    pub reduce_policy: ReducePolicy,
    pub escalation_policy: EscalationPolicy,
    pub verify: Option<serde_json::Value>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
1013
1014pub fn workflow_tool_names(value: &serde_json::Value) -> Vec<String> {
1015    match value {
1016        serde_json::Value::Null => Vec::new(),
1017        serde_json::Value::Array(items) => items
1018            .iter()
1019            .filter_map(|item| match item {
1020                serde_json::Value::Object(map) => map
1021                    .get("name")
1022                    .and_then(|value| value.as_str())
1023                    .filter(|name| !name.is_empty())
1024                    .map(|name| name.to_string()),
1025                _ => None,
1026            })
1027            .collect(),
1028        serde_json::Value::Object(map) => {
1029            if map.get("_type").and_then(|value| value.as_str()) == Some("tool_registry") {
1030                return map
1031                    .get("tools")
1032                    .map(workflow_tool_names)
1033                    .unwrap_or_default();
1034            }
1035            map.get("name")
1036                .and_then(|value| value.as_str())
1037                .filter(|name| !name.is_empty())
1038                .map(|name| vec![name.to_string()])
1039                .unwrap_or_default()
1040        }
1041        _ => Vec::new(),
1042    }
1043}
1044
/// Returns the highest-ranked side-effect level from `levels`, or `None`
/// when the iterator is empty.
///
/// Ranking (ascending): `none < read_only < workspace_write < process_exec
/// < network`; unknown strings rank above all known levels. Ties resolve
/// to the last occurrence (inherited from `Iterator::max_by_key`).
fn max_side_effect_level(levels: impl Iterator<Item = String>) -> Option<String> {
    const ORDER: [&str; 5] = ["none", "read_only", "workspace_write", "process_exec", "network"];
    let rank = |level: &String| {
        ORDER
            .iter()
            .position(|&known| known == level)
            .unwrap_or(ORDER.len())
    };
    levels.max_by_key(rank)
}
1058
1059fn parse_tool_runtime_policy(
1060    map: &serde_json::Map<String, serde_json::Value>,
1061) -> ToolRuntimePolicyMetadata {
1062    let Some(policy) = map.get("policy").and_then(|value| value.as_object()) else {
1063        return ToolRuntimePolicyMetadata::default();
1064    };
1065
1066    let capabilities = policy
1067        .get("capabilities")
1068        .and_then(|value| value.as_object())
1069        .map(|caps| {
1070            caps.iter()
1071                .map(|(capability, ops)| {
1072                    let values = ops
1073                        .as_array()
1074                        .map(|items| {
1075                            items
1076                                .iter()
1077                                .filter_map(|item| item.as_str().map(|s| s.to_string()))
1078                                .collect::<Vec<_>>()
1079                        })
1080                        .unwrap_or_default();
1081                    (capability.clone(), values)
1082                })
1083                .collect::<BTreeMap<_, _>>()
1084        })
1085        .unwrap_or_default();
1086
1087    let path_params = policy
1088        .get("path_params")
1089        .and_then(|value| value.as_array())
1090        .map(|items| {
1091            items
1092                .iter()
1093                .filter_map(|item| item.as_str().map(|s| s.to_string()))
1094                .collect::<Vec<_>>()
1095        })
1096        .unwrap_or_default();
1097
1098    ToolRuntimePolicyMetadata {
1099        capabilities,
1100        side_effect_level: policy
1101            .get("side_effect_level")
1102            .and_then(|value| value.as_str())
1103            .map(|s| s.to_string()),
1104        path_params,
1105        mutation_classification: policy
1106            .get("mutation_classification")
1107            .and_then(|value| value.as_str())
1108            .map(|s| s.to_string()),
1109    }
1110}
1111
1112pub fn workflow_tool_metadata(
1113    value: &serde_json::Value,
1114) -> BTreeMap<String, ToolRuntimePolicyMetadata> {
1115    match value {
1116        serde_json::Value::Null => BTreeMap::new(),
1117        serde_json::Value::Array(items) => items
1118            .iter()
1119            .filter_map(|item| match item {
1120                serde_json::Value::Object(map) => map
1121                    .get("name")
1122                    .and_then(|value| value.as_str())
1123                    .filter(|name| !name.is_empty())
1124                    .map(|name| (name.to_string(), parse_tool_runtime_policy(map))),
1125                _ => None,
1126            })
1127            .collect(),
1128        serde_json::Value::Object(map) => {
1129            if map.get("_type").and_then(|value| value.as_str()) == Some("tool_registry") {
1130                return map
1131                    .get("tools")
1132                    .map(workflow_tool_metadata)
1133                    .unwrap_or_default();
1134            }
1135            map.get("name")
1136                .and_then(|value| value.as_str())
1137                .filter(|name| !name.is_empty())
1138                .map(|name| {
1139                    let mut metadata = BTreeMap::new();
1140                    metadata.insert(name.to_string(), parse_tool_runtime_policy(map));
1141                    metadata
1142                })
1143                .unwrap_or_default()
1144        }
1145        _ => BTreeMap::new(),
1146    }
1147}
1148
1149pub fn workflow_tool_policy_from_tools(value: &serde_json::Value) -> CapabilityPolicy {
1150    let tools = workflow_tool_names(value);
1151    let tool_metadata = workflow_tool_metadata(value);
1152    let mut capabilities: BTreeMap<String, Vec<String>> = BTreeMap::new();
1153    for metadata in tool_metadata.values() {
1154        for (capability, ops) in &metadata.capabilities {
1155            let entry = capabilities.entry(capability.clone()).or_default();
1156            for op in ops {
1157                if !entry.contains(op) {
1158                    entry.push(op.clone());
1159                }
1160            }
1161            entry.sort();
1162        }
1163    }
1164    let side_effect_level = max_side_effect_level(
1165        tool_metadata
1166            .values()
1167            .filter_map(|metadata| metadata.side_effect_level.clone()),
1168    );
1169    CapabilityPolicy {
1170        tools,
1171        capabilities,
1172        workspace_roots: Vec::new(),
1173        side_effect_level,
1174        recursion_limit: None,
1175        tool_arg_constraints: Vec::new(),
1176        tool_metadata,
1177    }
1178}
1179
/// Directed edge between workflow nodes. `branch`, when set, restricts the
/// edge to a named branch outcome (e.g. "true"/"false" on condition nodes,
/// checked by `validate_workflow`).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct WorkflowEdge {
    pub from: String,
    pub to: String,
    pub branch: Option<String>,
    pub label: Option<String>,
}
1188
/// A complete workflow definition: entry node, node map, edges, and a
/// graph-level capability policy. Serialized with `_type` =
/// "workflow_graph"; structural defaults are applied by
/// `normalize_workflow_value` and checked by `validate_workflow`.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct WorkflowGraph {
    #[serde(rename = "_type")]
    pub type_name: String,
    pub id: String,
    pub name: Option<String>,
    // Normalized to at least 1.
    pub version: usize,
    // Id of the node execution starts from; must exist in `nodes`.
    pub entry: String,
    pub nodes: BTreeMap<String, WorkflowNode>,
    pub edges: Vec<WorkflowEdge>,
    pub capability_policy: CapabilityPolicy,
    pub metadata: BTreeMap<String, serde_json::Value>,
    pub audit_log: Vec<WorkflowAuditEntry>,
}
1204
/// One entry in a workflow's audit log: an operation name, the node it
/// touched (if any), a timestamp, and free-form metadata.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct WorkflowAuditEntry {
    pub id: String,
    pub op: String,
    pub node_id: Option<String>,
    pub timestamp: String,
    pub reason: Option<String>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
1215
/// Aggregated LLM usage counters for a stage or run.
/// NOTE(review): aggregation (summing across calls) happens outside this
/// chunk — confirm units of `total_cost` against the accounting code.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct LlmUsageRecord {
    pub input_tokens: i64,
    pub output_tokens: i64,
    pub total_duration_ms: i64,
    pub call_count: i64,
    pub total_cost: f64,
    // Distinct model identifiers observed across calls — TODO confirm.
    pub models: Vec<String>,
}
1226
/// The recorded execution of one workflow node within a run: status,
/// outcome/branch taken, transcript, verification output, usage, and the
/// artifacts consumed/produced.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct RunStageRecord {
    pub id: String,
    pub node_id: String,
    pub kind: String,
    pub status: String,
    pub outcome: String,
    // Branch label chosen for the outgoing transition, if any.
    pub branch: Option<String>,
    pub started_at: String,
    pub finished_at: Option<String>,
    pub visible_text: Option<String>,
    pub private_reasoning: Option<String>,
    pub transcript: Option<serde_json::Value>,
    pub verification: Option<serde_json::Value>,
    pub usage: Option<LlmUsageRecord>,
    pub artifacts: Vec<ArtifactRecord>,
    pub consumed_artifact_ids: Vec<String>,
    pub produced_artifact_ids: Vec<String>,
    // One entry per retry attempt (see RetryPolicy.max_attempts).
    pub attempts: Vec<RunStageAttemptRecord>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
1249
/// One attempt of a stage execution (retries produce multiple records),
/// capturing its status, outcome, error, and timing.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct RunStageAttemptRecord {
    // 1-based? TODO confirm against the executor that populates this.
    pub attempt: usize,
    pub status: String,
    pub outcome: String,
    pub branch: Option<String>,
    pub error: Option<String>,
    pub verification: Option<serde_json::Value>,
    pub started_at: String,
    pub finished_at: Option<String>,
}
1262
/// A recorded edge traversal during a run: which stage/node we left, which
/// node we entered, the branch taken, and the artifact ids involved.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunTransitionRecord {
    pub id: String,
    // None for the initial transition into the entry node — TODO confirm.
    pub from_stage_id: Option<String>,
    pub from_node_id: Option<String>,
    pub to_node_id: String,
    pub branch: Option<String>,
    pub timestamp: String,
    pub consumed_artifact_ids: Vec<String>,
    pub produced_artifact_ids: Vec<String>,
}
1275
/// A persisted snapshot of run progress (ready/completed node sets) that
/// presumably supports resuming an interrupted run — confirm against the
/// checkpoint/restore code outside this chunk.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunCheckpointRecord {
    pub id: String,
    pub ready_nodes: Vec<String>,
    pub completed_nodes: Vec<String>,
    pub last_stage_id: Option<String>,
    pub persisted_at: String,
    pub reason: String,
}
1286
/// A replay fixture derived from a completed run: the expected final
/// status plus per-stage assertions, serialized with a `_type` tag.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayFixture {
    #[serde(rename = "_type")]
    pub type_name: String,
    pub id: String,
    pub source_run_id: String,
    pub workflow_id: String,
    pub workflow_name: Option<String>,
    pub created_at: String,
    pub expected_status: String,
    pub stage_assertions: Vec<ReplayStageAssertion>,
}
1300
/// Expected properties of a single stage when replaying a run: status,
/// outcome, branch, required artifact kinds, and an optional substring the
/// visible text must contain.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayStageAssertion {
    pub node_id: String,
    pub expected_status: String,
    pub expected_outcome: String,
    pub expected_branch: Option<String>,
    pub required_artifact_kinds: Vec<String>,
    pub visible_text_contains: Option<String>,
}
1311
/// Outcome of evaluating one replay fixture: overall pass flag, the list
/// of failure messages, and how many stages were checked.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayEvalReport {
    pub pass: bool,
    pub failures: Vec<String>,
    pub stage_count: usize,
}
1319
/// Per-case result within a replay evaluation suite, optionally carrying a
/// diff against a comparison run.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayEvalCaseReport {
    pub run_id: String,
    pub workflow_id: String,
    pub label: Option<String>,
    pub pass: bool,
    pub failures: Vec<String>,
    pub stage_count: usize,
    pub source_path: Option<String>,
    // Present when the case was compared against another run (see
    // EvalSuiteCase.compare_to).
    pub comparison: Option<RunDiffReport>,
}
1332
/// Aggregate result of a replay evaluation suite: totals plus the
/// individual case reports.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayEvalSuiteReport {
    pub pass: bool,
    pub total: usize,
    pub passed: usize,
    pub failed: usize,
    pub cases: Vec<ReplayEvalCaseReport>,
}
1342
/// A per-node difference found when diffing two runs: the kind of change
/// plus human-readable detail lines.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunStageDiffRecord {
    pub node_id: String,
    // Change category — values produced by the diffing code outside this chunk.
    pub change: String,
    pub details: Vec<String>,
}
1350
/// Structured comparison of two runs: status change, per-stage diffs, and
/// count deltas (right minus left, presumably — confirm against the
/// diffing code).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunDiffReport {
    pub left_run_id: String,
    pub right_run_id: String,
    pub identical: bool,
    pub status_changed: bool,
    pub left_status: String,
    pub right_status: String,
    pub stage_diffs: Vec<RunStageDiffRecord>,
    pub transition_count_delta: isize,
    pub artifact_count_delta: isize,
    pub checkpoint_count_delta: isize,
}
1365
/// On-disk manifest describing an evaluation suite: a set of cases with an
/// optional base directory for resolving their relative paths. Serialized
/// with a `_type` tag.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct EvalSuiteManifest {
    #[serde(rename = "_type")]
    pub type_name: String,
    pub id: String,
    pub name: Option<String>,
    pub base_dir: Option<String>,
    pub cases: Vec<EvalSuiteCase>,
}
1376
/// One case in an evaluation suite: the run to evaluate, an optional
/// fixture to assert against, and an optional second run to diff with.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct EvalSuiteCase {
    pub label: Option<String>,
    pub run_path: String,
    pub fixture_path: Option<String>,
    pub compare_to: Option<String>,
}
1385
/// The full record of one workflow run: identity and lineage, overall
/// status and timing, per-stage records, transitions, checkpoints, child
/// runs, artifacts, the effective policy, and persistence details.
/// Serialized with a `_type` tag.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct RunRecord {
    #[serde(rename = "_type")]
    pub type_name: String,
    pub id: String,
    pub workflow_id: String,
    pub workflow_name: Option<String>,
    pub task: String,
    pub status: String,
    pub started_at: String,
    pub finished_at: Option<String>,
    // Lineage: immediate parent and the root of the run tree, for nested runs.
    pub parent_run_id: Option<String>,
    pub root_run_id: Option<String>,
    pub stages: Vec<RunStageRecord>,
    pub transitions: Vec<RunTransitionRecord>,
    pub checkpoints: Vec<RunCheckpointRecord>,
    pub pending_nodes: Vec<String>,
    pub completed_nodes: Vec<String>,
    pub child_runs: Vec<RunChildRecord>,
    pub artifacts: Vec<ArtifactRecord>,
    pub policy: CapabilityPolicy,
    pub execution: Option<RunExecutionRecord>,
    pub transcript: Option<serde_json::Value>,
    pub usage: Option<LlmUsageRecord>,
    pub replay_fixture: Option<ReplayFixture>,
    pub metadata: BTreeMap<String, serde_json::Value>,
    // Filesystem path this record was persisted to, when saved.
    pub persisted_path: Option<String>,
}
1415
/// A child (worker) run spawned from a parent run's stage: worker identity,
/// mutation-session linkage, task/status/timing, and pointers to the
/// child's persisted run data.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunChildRecord {
    pub worker_id: String,
    pub worker_name: String,
    pub parent_stage_id: Option<String>,
    // Mutation-session fields mirror MutationSessionRecord.
    pub session_id: Option<String>,
    pub parent_session_id: Option<String>,
    pub mutation_scope: Option<String>,
    pub approval_mode: Option<String>,
    pub task: String,
    pub status: String,
    pub started_at: String,
    pub finished_at: Option<String>,
    pub run_id: Option<String>,
    pub run_path: Option<String>,
    pub snapshot_path: Option<String>,
    pub execution: Option<RunExecutionRecord>,
}
1435
/// Describes the execution environment a run used: working directory,
/// environment variables, and (for git-worktree style isolation) repo /
/// worktree / branch details plus a cleanup mode.
/// NOTE(review): the adapter values and cleanup semantics are defined by
/// code outside this chunk — confirm there.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunExecutionRecord {
    pub cwd: Option<String>,
    pub source_dir: Option<String>,
    pub env: BTreeMap<String, String>,
    pub adapter: Option<String>,
    pub repo_path: Option<String>,
    pub worktree_path: Option<String>,
    pub branch: Option<String>,
    pub base_ref: Option<String>,
    pub cleanup: Option<String>,
}
1449
/// Result of `validate_workflow`: `valid` is true iff `errors` is empty;
/// `warnings` are non-fatal; `reachable_nodes` lists the nodes reachable
/// from the graph's entry.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct WorkflowValidationReport {
    pub valid: bool,
    pub errors: Vec<String>,
    pub warnings: Vec<String>,
    pub reachable_nodes: Vec<String>,
}
1458
1459fn parse_json_payload<T: for<'de> Deserialize<'de>>(
1460    json: serde_json::Value,
1461    label: &str,
1462) -> Result<T, VmError> {
1463    let payload = json.to_string();
1464    let mut deserializer = serde_json::Deserializer::from_str(&payload);
1465    let mut tracker = serde_path_to_error::Track::new();
1466    let path_deserializer = serde_path_to_error::Deserializer::new(&mut deserializer, &mut tracker);
1467    T::deserialize(path_deserializer).map_err(|error| {
1468        let snippet = if payload.len() > 600 {
1469            format!("{}...", &payload[..600])
1470        } else {
1471            payload.clone()
1472        };
1473        VmError::Runtime(format!(
1474            "{label} parse error at {}: {} | payload={}",
1475            tracker.path(),
1476            error,
1477            snippet
1478        ))
1479    })
1480}
1481
1482fn parse_json_value<T: for<'de> Deserialize<'de>>(value: &VmValue) -> Result<T, VmError> {
1483    parse_json_payload(vm_value_to_json(value), "orchestration")
1484}
1485
1486pub fn parse_workflow_node_value(value: &VmValue, label: &str) -> Result<WorkflowNode, VmError> {
1487    parse_json_payload(vm_value_to_json(value), label)
1488}
1489
1490pub fn parse_workflow_node_json(
1491    json: serde_json::Value,
1492    label: &str,
1493) -> Result<WorkflowNode, VmError> {
1494    parse_json_payload(json, label)
1495}
1496
1497pub fn parse_workflow_edge_json(
1498    json: serde_json::Value,
1499    label: &str,
1500) -> Result<WorkflowEdge, VmError> {
1501    parse_json_payload(json, label)
1502}
1503
/// Parses a `VmValue` into a [`WorkflowGraph`] and fills in structural
/// defaults so downstream code sees a well-formed graph.
///
/// Two input shapes are accepted:
/// * a full graph (explicit `nodes`/`edges`), or
/// * a shorthand dict with top-level `act`/`verify`/`repair` entries, from
///   which a small graph is synthesized
///   (act -> verify, verify --"failed"--> repair --"retry"--> verify).
///
/// # Errors
///
/// Returns `VmError` when the value (or an embedded node) fails to
/// deserialize into the workflow types.
pub fn normalize_workflow_value(value: &VmValue) -> Result<WorkflowGraph, VmError> {
    let mut graph: WorkflowGraph = parse_json_value(value)?;
    let as_dict = value.as_dict().cloned().unwrap_or_default();

    // Shorthand form: no explicit nodes, but possible top-level
    // act/verify/repair entries. Build the node map from those.
    if graph.nodes.is_empty() {
        for key in ["act", "verify", "repair"] {
            if let Some(node_value) = as_dict.get(key) {
                let mut node: WorkflowNode = parse_json_value(node_value)?;
                let raw_node = node_value.as_dict().cloned().unwrap_or_default();
                node.id = Some(key.to_string());
                if node.kind.is_empty() {
                    node.kind = if key == "verify" {
                        "verify".to_string()
                    } else {
                        "stage".to_string()
                    };
                }
                // Model settings may be given once at the top level of the
                // shorthand dict; inherit them per-node when the node did
                // not set its own (empty display strings are treated as unset).
                if node.model_policy.provider.is_none() {
                    node.model_policy.provider = as_dict
                        .get("provider")
                        .map(|value| value.display())
                        .filter(|value| !value.is_empty());
                }
                if node.model_policy.model.is_none() {
                    node.model_policy.model = as_dict
                        .get("model")
                        .map(|value| value.display())
                        .filter(|value| !value.is_empty());
                }
                if node.model_policy.model_tier.is_none() {
                    // Accept either "model_tier" or the shorter "tier" key.
                    node.model_policy.model_tier = as_dict
                        .get("model_tier")
                        .or_else(|| as_dict.get("tier"))
                        .map(|value| value.display())
                        .filter(|value| !value.is_empty());
                }
                if node.model_policy.temperature.is_none() {
                    // Temperature may arrive as a float or an int; ints are
                    // widened to f64.
                    node.model_policy.temperature = as_dict.get("temperature").and_then(|value| {
                        if let VmValue::Float(number) = value {
                            Some(*number)
                        } else {
                            value.as_int().map(|number| number as f64)
                        }
                    });
                }
                if node.model_policy.max_tokens.is_none() {
                    node.model_policy.max_tokens =
                        as_dict.get("max_tokens").and_then(|value| value.as_int());
                }
                if node.mode.is_none() {
                    node.mode = as_dict
                        .get("mode")
                        .map(|value| value.display())
                        .filter(|value| !value.is_empty());
                }
                if node.done_sentinel.is_none() {
                    node.done_sentinel = as_dict
                        .get("done_sentinel")
                        .map(|value| value.display())
                        .filter(|value| !value.is_empty());
                }
                // Shorthand verify nodes may carry their assertion fields
                // inline; lift them into the structured `verify` JSON blob.
                if key == "verify"
                    && node.verify.is_none()
                    && (raw_node.contains_key("assert_text")
                        || raw_node.contains_key("command")
                        || raw_node.contains_key("expect_status")
                        || raw_node.contains_key("expect_text"))
                {
                    node.verify = Some(serde_json::json!({
                        "assert_text": raw_node.get("assert_text").map(vm_value_to_json),
                        "command": raw_node.get("command").map(vm_value_to_json),
                        "expect_status": raw_node.get("expect_status").map(vm_value_to_json),
                        "expect_text": raw_node.get("expect_text").map(vm_value_to_json),
                    }));
                }
                graph.nodes.insert(key.to_string(), node);
            }
        }
        if graph.entry.is_empty() && graph.nodes.contains_key("act") {
            graph.entry = "act".to_string();
        }
        // Synthesize the default act -> verify -> repair wiring.
        if graph.edges.is_empty() && graph.nodes.contains_key("act") {
            if graph.nodes.contains_key("verify") {
                graph.edges.push(WorkflowEdge {
                    from: "act".to_string(),
                    to: "verify".to_string(),
                    branch: None,
                    label: None,
                });
            }
            if graph.nodes.contains_key("repair") {
                graph.edges.push(WorkflowEdge {
                    from: "verify".to_string(),
                    to: "repair".to_string(),
                    branch: Some("failed".to_string()),
                    label: None,
                });
                graph.edges.push(WorkflowEdge {
                    from: "repair".to_string(),
                    to: "verify".to_string(),
                    branch: Some("retry".to_string()),
                    label: None,
                });
            }
        }
    }

    // Graph-level identity defaults.
    if graph.type_name.is_empty() {
        graph.type_name = "workflow_graph".to_string();
    }
    if graph.id.is_empty() {
        graph.id = new_id("workflow");
    }
    if graph.version == 0 {
        graph.version = 1;
    }
    if graph.entry.is_empty() {
        // Fall back to the first node key (BTreeMap order), or "act".
        graph.entry = graph
            .nodes
            .keys()
            .next()
            .cloned()
            .unwrap_or_else(|| "act".to_string());
    }
    // Per-node defaults applied to every node, regardless of input shape.
    for (node_id, node) in &mut graph.nodes {
        if node.id.is_none() {
            node.id = Some(node_id.clone());
        }
        if node.kind.is_empty() {
            node.kind = "stage".to_string();
        }
        if node.join_policy.strategy.is_empty() {
            node.join_policy.strategy = "all".to_string();
        }
        if node.reduce_policy.strategy.is_empty() {
            node.reduce_policy.strategy = "concat".to_string();
        }
        // Default the output contract by node kind when unspecified.
        if node.output_contract.output_kinds.is_empty() {
            node.output_contract.output_kinds = vec![match node.kind.as_str() {
                "verify" => "verification_result".to_string(),
                "reduce" => node
                    .reduce_policy
                    .output_kind
                    .clone()
                    .unwrap_or_else(|| "summary".to_string()),
                "map" => node
                    .map_policy
                    .output_kind
                    .clone()
                    .unwrap_or_else(|| "artifact".to_string()),
                "escalation" => "plan".to_string(),
                _ => "artifact".to_string(),
            }];
        }
        // Every node gets at least one attempt.
        if node.retry_policy.max_attempts == 0 {
            node.retry_policy.max_attempts = 1;
        }
    }
    Ok(graph)
}
1664
/// Structurally validates a workflow graph, optionally checking its
/// capability policies against a `ceiling` policy.
///
/// Errors (fatal): missing entry node, dangling edges, inconsistent input
/// contracts, kind-specific shape violations (condition/fork/map), and
/// capability policies that exceed the ceiling. Warnings (non-fatal):
/// unreachable nodes, under-connected joins, unconstrained reduces.
/// The report's `valid` flag is true iff no errors were collected.
pub fn validate_workflow(
    graph: &WorkflowGraph,
    ceiling: Option<&CapabilityPolicy>,
) -> WorkflowValidationReport {
    let mut errors = Vec::new();
    let mut warnings = Vec::new();

    if !graph.nodes.contains_key(&graph.entry) {
        errors.push(format!("entry node does not exist: {}", graph.entry));
    }

    // Every edge endpoint must name an existing node.
    let node_ids: BTreeSet<String> = graph.nodes.keys().cloned().collect();
    for edge in &graph.edges {
        if !node_ids.contains(&edge.from) {
            errors.push(format!("edge.from references unknown node: {}", edge.from));
        }
        if !node_ids.contains(&edge.to) {
            errors.push(format!("edge.to references unknown node: {}", edge.to));
        }
    }

    // Unreachable nodes are allowed, but flagged.
    let reachable_nodes = reachable_nodes(graph);
    for node_id in &node_ids {
        if !reachable_nodes.contains(node_id) {
            warnings.push(format!("node is unreachable: {node_id}"));
        }
    }

    // Per-node checks: contract sanity plus kind-specific edge shape.
    for (node_id, node) in &graph.nodes {
        let incoming = graph
            .edges
            .iter()
            .filter(|edge| edge.to == *node_id)
            .count();
        let outgoing: Vec<&WorkflowEdge> = graph
            .edges
            .iter()
            .filter(|edge| edge.from == *node_id)
            .collect();
        if let Some(min_inputs) = node.input_contract.min_inputs {
            if let Some(max_inputs) = node.input_contract.max_inputs {
                if min_inputs > max_inputs {
                    errors.push(format!(
                        "node {node_id}: input contract min_inputs exceeds max_inputs"
                    ));
                }
            }
        }
        match node.kind.as_str() {
            // Condition nodes must be able to branch either way.
            "condition" => {
                let has_true = outgoing
                    .iter()
                    .any(|edge| edge.branch.as_deref() == Some("true"));
                let has_false = outgoing
                    .iter()
                    .any(|edge| edge.branch.as_deref() == Some("false"));
                if !has_true || !has_false {
                    errors.push(format!(
                        "node {node_id}: condition nodes require both 'true' and 'false' branch edges"
                    ));
                }
            }
            // A fork that doesn't split is meaningless.
            "fork" => {
                if outgoing.len() < 2 {
                    errors.push(format!(
                        "node {node_id}: fork nodes require at least two outgoing edges"
                    ));
                }
            }
            // A join with <2 inputs is suspicious but not fatal.
            "join" => {
                if incoming < 2 {
                    warnings.push(format!(
                        "node {node_id}: join node has fewer than two incoming edges"
                    ));
                }
            }
            // Map nodes need *some* source of items to fan out over.
            "map" => {
                if node.map_policy.items.is_empty()
                    && node.map_policy.item_artifact_kind.is_none()
                    && node.input_contract.input_kinds.is_empty()
                {
                    errors.push(format!(
                        "node {node_id}: map nodes require items, item_artifact_kind, or input_contract.input_kinds"
                    ));
                }
            }
            "reduce" => {
                if node.input_contract.input_kinds.is_empty() {
                    warnings.push(format!(
                        "node {node_id}: reduce node has no input_contract.input_kinds; it will consume all available artifacts"
                    ));
                }
            }
            _ => {}
        }
    }

    // Graph-level and per-node capability policies must fit under the
    // ceiling; `intersect` reports the violation message on failure.
    if let Some(ceiling) = ceiling {
        if let Err(error) = ceiling.intersect(&graph.capability_policy) {
            errors.push(error);
        }
        for (node_id, node) in &graph.nodes {
            if let Err(error) = ceiling.intersect(&node.capability_policy) {
                errors.push(format!("node {node_id}: {error}"));
            }
        }
    }

    WorkflowValidationReport {
        valid: errors.is_empty(),
        errors,
        warnings,
        reachable_nodes: reachable_nodes.into_iter().collect(),
    }
}
1780
1781fn reachable_nodes(graph: &WorkflowGraph) -> BTreeSet<String> {
1782    let mut seen = BTreeSet::new();
1783    let mut stack = vec![graph.entry.clone()];
1784    while let Some(node_id) = stack.pop() {
1785        if !seen.insert(node_id.clone()) {
1786            continue;
1787        }
1788        for edge in graph.edges.iter().filter(|edge| edge.from == node_id) {
1789            stack.push(edge.to.clone());
1790        }
1791    }
1792    seen
1793}
1794
/// Filters and orders candidate artifacts per `policy`, then greedily packs
/// them into the configured budget.
///
/// Works in three phases:
/// 1. retain only artifacts whose kind/stage pass the include/exclude lists
///    (an empty include list means "no filter on that dimension");
/// 2. sort best-first: pinned ids, then prioritized kinds, then explicit
///    priority, then (when enabled) freshness and recency, then relevance,
///    breaking remaining ties by smaller token estimate;
/// 3. walk the sorted list, skipping any artifact that would overflow the
///    effective token budget (`max_tokens` minus `reserve_tokens`) and
///    stopping once `max_artifacts` entries are selected.
pub fn select_artifacts(
    mut artifacts: Vec<ArtifactRecord>,
    policy: &ContextPolicy,
) -> Vec<ArtifactRecord> {
    // Phase 1: kind/stage filtering.
    artifacts.retain(|artifact| {
        (policy.include_kinds.is_empty() || policy.include_kinds.contains(&artifact.kind))
            && !policy.exclude_kinds.contains(&artifact.kind)
            && (policy.include_stages.is_empty()
                || artifact
                    .stage
                    .as_ref()
                    .is_some_and(|stage| policy.include_stages.contains(stage)))
    });
    // Phase 2: sort best-first. Each comparator compares `b` against `a`
    // (descending) so `true`/larger values sort toward the front; the final
    // token tie-breaker is ascending (cheaper artifacts first).
    artifacts.sort_by(|a, b| {
        let b_pinned = policy.pinned_ids.contains(&b.id);
        let a_pinned = policy.pinned_ids.contains(&a.id);
        b_pinned
            .cmp(&a_pinned)
            .then_with(|| {
                let b_prio_kind = policy.prioritize_kinds.contains(&b.kind);
                let a_prio_kind = policy.prioritize_kinds.contains(&a.kind);
                b_prio_kind.cmp(&a_prio_kind)
            })
            .then_with(|| {
                b.priority
                    .unwrap_or_default()
                    .cmp(&a.priority.unwrap_or_default())
            })
            .then_with(|| {
                if policy.prefer_fresh {
                    freshness_rank(b.freshness.as_deref())
                        .cmp(&freshness_rank(a.freshness.as_deref()))
                } else {
                    std::cmp::Ordering::Equal
                }
            })
            .then_with(|| {
                if policy.prefer_recent {
                    b.created_at.cmp(&a.created_at)
                } else {
                    std::cmp::Ordering::Equal
                }
            })
            .then_with(|| {
                // relevance is a float; treat incomparable (NaN) as equal.
                b.relevance
                    .partial_cmp(&a.relevance)
                    .unwrap_or(std::cmp::Ordering::Equal)
            })
            .then_with(|| {
                // Unknown token estimates sort last.
                a.estimated_tokens
                    .unwrap_or(usize::MAX)
                    .cmp(&b.estimated_tokens.unwrap_or(usize::MAX))
            })
    });

    // Phase 3: greedy packing under the token/count budget.
    let mut selected = Vec::new();
    let mut used_tokens = 0usize;
    let reserve_tokens = policy.reserve_tokens.unwrap_or(0);
    let effective_max_tokens = policy
        .max_tokens
        .map(|max| max.saturating_sub(reserve_tokens));
    for artifact in artifacts {
        if let Some(max_artifacts) = policy.max_artifacts {
            if selected.len() >= max_artifacts {
                break;
            }
        }
        let next_tokens = artifact.estimated_tokens.unwrap_or(0);
        if let Some(max_tokens) = effective_max_tokens {
            // `continue` (not `break`): a later, smaller artifact may still fit.
            if used_tokens + next_tokens > max_tokens {
                continue;
            }
        }
        used_tokens += next_tokens;
        selected.push(artifact);
    }
    selected
}
1873
1874pub fn render_artifacts_context(artifacts: &[ArtifactRecord], policy: &ContextPolicy) -> String {
1875    let mut parts = Vec::new();
1876    for artifact in artifacts {
1877        let title = artifact
1878            .title
1879            .clone()
1880            .unwrap_or_else(|| format!("{} {}", artifact.kind, artifact.id));
1881        let body = artifact
1882            .text
1883            .clone()
1884            .or_else(|| artifact.data.as_ref().map(|v| v.to_string()))
1885            .unwrap_or_default();
1886        match policy.render.as_deref() {
1887            Some("json") => {
1888                parts.push(
1889                    serde_json::json!({
1890                        "id": artifact.id,
1891                        "kind": artifact.kind,
1892                        "title": title,
1893                        "source": artifact.source,
1894                        "freshness": artifact.freshness,
1895                        "priority": artifact.priority,
1896                        "text": body,
1897                    })
1898                    .to_string(),
1899                );
1900            }
1901            _ => parts.push(format!(
1902                "[{title}] kind={} source={} freshness={} priority={}\n{}",
1903                artifact.kind,
1904                artifact
1905                    .source
1906                    .clone()
1907                    .unwrap_or_else(|| "unknown".to_string()),
1908                artifact
1909                    .freshness
1910                    .clone()
1911                    .unwrap_or_else(|| "normal".to_string()),
1912                artifact.priority.unwrap_or_default(),
1913                body
1914            )),
1915        }
1916    }
1917    parts.join("\n\n")
1918}
1919
1920pub fn normalize_artifact(value: &VmValue) -> Result<ArtifactRecord, VmError> {
1921    let artifact: ArtifactRecord = parse_json_value(value)?;
1922    Ok(artifact.normalize())
1923}
1924
1925pub fn normalize_run_record(value: &VmValue) -> Result<RunRecord, VmError> {
1926    let json = vm_value_to_json(value);
1927    let payload = json.to_string();
1928    let mut deserializer = serde_json::Deserializer::from_str(&payload);
1929    let mut tracker = serde_path_to_error::Track::new();
1930    let path_deserializer = serde_path_to_error::Deserializer::new(&mut deserializer, &mut tracker);
1931    let mut run: RunRecord = RunRecord::deserialize(path_deserializer).map_err(|error| {
1932        let snippet = if payload.len() > 600 {
1933            format!("{}...", &payload[..600])
1934        } else {
1935            payload.clone()
1936        };
1937        VmError::Runtime(format!(
1938            "orchestration parse error at {}: {} | payload={}",
1939            tracker.path(),
1940            error,
1941            snippet
1942        ))
1943    })?;
1944    if run.type_name.is_empty() {
1945        run.type_name = "run_record".to_string();
1946    }
1947    if run.id.is_empty() {
1948        run.id = new_id("run");
1949    }
1950    if run.started_at.is_empty() {
1951        run.started_at = now_rfc3339();
1952    }
1953    if run.status.is_empty() {
1954        run.status = "running".to_string();
1955    }
1956    if run.root_run_id.is_none() {
1957        run.root_run_id = Some(run.id.clone());
1958    }
1959    if run.replay_fixture.is_none() {
1960        run.replay_fixture = Some(replay_fixture_from_run(&run));
1961    }
1962    Ok(run)
1963}
1964
1965pub fn normalize_eval_suite_manifest(value: &VmValue) -> Result<EvalSuiteManifest, VmError> {
1966    let mut manifest: EvalSuiteManifest = parse_json_value(value)?;
1967    if manifest.type_name.is_empty() {
1968        manifest.type_name = "eval_suite_manifest".to_string();
1969    }
1970    if manifest.id.is_empty() {
1971        manifest.id = new_id("eval_suite");
1972    }
1973    Ok(manifest)
1974}
1975
1976fn load_replay_fixture(path: &Path) -> Result<ReplayFixture, VmError> {
1977    let content = std::fs::read_to_string(path)
1978        .map_err(|e| VmError::Runtime(format!("failed to read replay fixture: {e}")))?;
1979    serde_json::from_str(&content)
1980        .map_err(|e| VmError::Runtime(format!("failed to parse replay fixture: {e}")))
1981}
1982
/// Resolves a manifest-relative path: absolute paths pass through untouched,
/// relative paths are joined onto `base_dir` when one is provided.
fn resolve_manifest_path(base_dir: Option<&Path>, path: &str) -> PathBuf {
    let candidate = PathBuf::from(path);
    match base_dir {
        Some(base) if candidate.is_relative() => base.join(candidate),
        _ => candidate,
    }
}
1993
/// Runs every case in an eval suite manifest and aggregates the results.
///
/// For each case: loads the run record, resolves its replay fixture (an
/// explicit fixture path, the fixture embedded in the run, or one derived
/// from the run itself), evaluates the run against the fixture, and — when
/// `compare_to` is set — diffs the run against a baseline run, failing the
/// case if the two runs differ.
pub fn evaluate_run_suite_manifest(
    manifest: &EvalSuiteManifest,
) -> Result<ReplayEvalSuiteReport, VmError> {
    // Relative paths in cases are resolved against the manifest's base_dir.
    let base_dir = manifest.base_dir.as_deref().map(Path::new);
    let mut reports = Vec::new();
    for case in &manifest.cases {
        let run_path = resolve_manifest_path(base_dir, &case.run_path);
        let run = load_run_record(&run_path)?;
        // Fixture precedence: explicit path > embedded in run > derived from run.
        let fixture = match &case.fixture_path {
            Some(path) => load_replay_fixture(&resolve_manifest_path(base_dir, path))?,
            None => run
                .replay_fixture
                .clone()
                .unwrap_or_else(|| replay_fixture_from_run(&run)),
        };
        let eval = evaluate_run_against_fixture(&run, &fixture);
        let mut pass = eval.pass;
        let mut failures = eval.failures;
        // Optional baseline comparison: any structural difference fails the case.
        let comparison = match &case.compare_to {
            Some(path) => {
                let baseline_path = resolve_manifest_path(base_dir, path);
                let baseline = load_run_record(&baseline_path)?;
                let diff = diff_run_records(&baseline, &run);
                if !diff.identical {
                    pass = false;
                    failures.push(format!(
                        "run differs from baseline {} with {} stage changes",
                        baseline_path.display(),
                        diff.stage_diffs.len()
                    ));
                }
                Some(diff)
            }
            None => None,
        };
        reports.push(ReplayEvalCaseReport {
            run_id: run.id.clone(),
            workflow_id: run.workflow_id.clone(),
            label: case.label.clone(),
            pass,
            failures,
            stage_count: eval.stage_count,
            source_path: Some(run_path.display().to_string()),
            comparison,
        });
    }
    // Suite-level rollup: passes only when every case passed.
    let total = reports.len();
    let passed = reports.iter().filter(|report| report.pass).count();
    let failed = total.saturating_sub(passed);
    Ok(ReplayEvalSuiteReport {
        pass: failed == 0,
        total,
        passed,
        failed,
        cases: reports,
    })
}
2051
/// Renders a minimal unified-style diff (file header plus ` `/`-`/`+` lines,
/// no hunk headers) between two texts.
///
/// A dynamic-programming LCS table drives the walk so unchanged lines are
/// emitted as context and only real insertions/deletions are marked.
pub fn render_unified_diff(path: Option<&str>, before: &str, after: &str) -> String {
    let old: Vec<&str> = before.lines().collect();
    let new: Vec<&str> = after.lines().collect();

    // lcs[i][j] = length of the longest common subsequence of old[i..] and new[j..].
    let mut lcs = vec![vec![0usize; new.len() + 1]; old.len() + 1];
    for i in (0..old.len()).rev() {
        for j in (0..new.len()).rev() {
            lcs[i][j] = if old[i] == new[j] {
                lcs[i + 1][j + 1] + 1
            } else {
                lcs[i + 1][j].max(lcs[i][j + 1])
            };
        }
    }

    let label = path.unwrap_or("artifact");
    let mut out = format!("--- a/{label}\n+++ b/{label}\n");
    let (mut i, mut j) = (0usize, 0usize);
    loop {
        match (old.get(i), new.get(j)) {
            // Lines match: emit as context, advance both cursors.
            (Some(o), Some(n)) if o == n => {
                out.push(' ');
                out.push_str(o);
                out.push('\n');
                i += 1;
                j += 1;
            }
            // Deleting from `old` preserves at least as much common subsequence.
            (Some(o), Some(_)) if lcs[i + 1][j] >= lcs[i][j + 1] => {
                out.push('-');
                out.push_str(o);
                out.push('\n');
                i += 1;
            }
            // Otherwise take the next line of `new` as an insertion; this arm
            // also drains `new` once `old` is exhausted.
            (Some(_), Some(n)) | (None, Some(n)) => {
                out.push('+');
                out.push_str(n);
                out.push('\n');
                j += 1;
            }
            // `new` exhausted: remaining old lines are deletions.
            (Some(o), None) => {
                out.push('-');
                out.push_str(o);
                out.push('\n');
                i += 1;
            }
            (None, None) => break,
        }
    }
    out
}
2094
2095pub fn save_run_record(run: &RunRecord, path: Option<&str>) -> Result<String, VmError> {
2096    let path = path
2097        .map(PathBuf::from)
2098        .unwrap_or_else(|| default_run_dir().join(format!("{}.json", run.id)));
2099    if let Some(parent) = path.parent() {
2100        std::fs::create_dir_all(parent)
2101            .map_err(|e| VmError::Runtime(format!("failed to create run directory: {e}")))?;
2102    }
2103    let json = serde_json::to_string_pretty(run)
2104        .map_err(|e| VmError::Runtime(format!("failed to encode run record: {e}")))?;
2105    // Atomic write: write to .tmp then rename to prevent corruption on kill.
2106    let tmp_path = path.with_extension("json.tmp");
2107    std::fs::write(&tmp_path, &json)
2108        .map_err(|e| VmError::Runtime(format!("failed to persist run record: {e}")))?;
2109    std::fs::rename(&tmp_path, &path).map_err(|e| {
2110        // Fallback: if rename fails (cross-device), write directly.
2111        let _ = std::fs::write(&path, &json);
2112        VmError::Runtime(format!("failed to finalize run record: {e}"))
2113    })?;
2114    Ok(path.to_string_lossy().to_string())
2115}
2116
2117pub fn load_run_record(path: &Path) -> Result<RunRecord, VmError> {
2118    let content = std::fs::read_to_string(path)
2119        .map_err(|e| VmError::Runtime(format!("failed to read run record: {e}")))?;
2120    serde_json::from_str(&content)
2121        .map_err(|e| VmError::Runtime(format!("failed to parse run record: {e}")))
2122}
2123
2124pub fn replay_fixture_from_run(run: &RunRecord) -> ReplayFixture {
2125    ReplayFixture {
2126        type_name: "replay_fixture".to_string(),
2127        id: new_id("fixture"),
2128        source_run_id: run.id.clone(),
2129        workflow_id: run.workflow_id.clone(),
2130        workflow_name: run.workflow_name.clone(),
2131        created_at: now_rfc3339(),
2132        expected_status: run.status.clone(),
2133        stage_assertions: run
2134            .stages
2135            .iter()
2136            .map(|stage| ReplayStageAssertion {
2137                node_id: stage.node_id.clone(),
2138                expected_status: stage.status.clone(),
2139                expected_outcome: stage.outcome.clone(),
2140                expected_branch: stage.branch.clone(),
2141                required_artifact_kinds: stage
2142                    .artifacts
2143                    .iter()
2144                    .map(|artifact| artifact.kind.clone())
2145                    .collect(),
2146                visible_text_contains: stage
2147                    .visible_text
2148                    .as_ref()
2149                    .filter(|text| !text.is_empty())
2150                    .map(|text| text.chars().take(80).collect()),
2151            })
2152            .collect(),
2153    }
2154}
2155
/// Checks a run against a replay fixture and collects every mismatch.
///
/// Compares the run-level status, then for each fixture stage assertion:
/// the stage must exist (matched by node id) and its status, outcome,
/// branch, required artifact kinds, and visible-text snippet must all
/// agree. Returns a report with the full failure list, pass flag, and the
/// run's stage count.
pub fn evaluate_run_against_fixture(run: &RunRecord, fixture: &ReplayFixture) -> ReplayEvalReport {
    let mut failures = Vec::new();
    if run.status != fixture.expected_status {
        failures.push(format!(
            "run status mismatch: expected {}, got {}",
            fixture.expected_status, run.status
        ));
    }
    for assertion in &fixture.stage_assertions {
        // Stages are matched by node id; a missing stage skips the rest of
        // the checks for this assertion.
        let Some(stage) = run
            .stages
            .iter()
            .find(|stage| stage.node_id == assertion.node_id)
        else {
            failures.push(format!("missing stage {}", assertion.node_id));
            continue;
        };
        if stage.status != assertion.expected_status {
            failures.push(format!(
                "stage {} status mismatch: expected {}, got {}",
                assertion.node_id, assertion.expected_status, stage.status
            ));
        }
        if stage.outcome != assertion.expected_outcome {
            failures.push(format!(
                "stage {} outcome mismatch: expected {}, got {}",
                assertion.node_id, assertion.expected_outcome, stage.outcome
            ));
        }
        if stage.branch != assertion.expected_branch {
            failures.push(format!(
                "stage {} branch mismatch: expected {:?}, got {:?}",
                assertion.node_id, assertion.expected_branch, stage.branch
            ));
        }
        // Every required artifact kind must appear at least once.
        for required_kind in &assertion.required_artifact_kinds {
            if !stage
                .artifacts
                .iter()
                .any(|artifact| &artifact.kind == required_kind)
            {
                failures.push(format!(
                    "stage {} missing artifact kind {}",
                    assertion.node_id, required_kind
                ));
            }
        }
        // Snippet check is substring containment, not equality.
        if let Some(snippet) = &assertion.visible_text_contains {
            let actual = stage.visible_text.clone().unwrap_or_default();
            if !actual.contains(snippet) {
                failures.push(format!(
                    "stage {} visible text does not contain expected snippet {:?}",
                    assertion.node_id, snippet
                ));
            }
        }
    }

    ReplayEvalReport {
        pass: failures.is_empty(),
        failures,
        stage_count: run.stages.len(),
    }
}
2220
2221pub fn evaluate_run_suite(
2222    cases: Vec<(RunRecord, ReplayFixture, Option<String>)>,
2223) -> ReplayEvalSuiteReport {
2224    let mut reports = Vec::new();
2225    for (run, fixture, source_path) in cases {
2226        let report = evaluate_run_against_fixture(&run, &fixture);
2227        reports.push(ReplayEvalCaseReport {
2228            run_id: run.id.clone(),
2229            workflow_id: run.workflow_id.clone(),
2230            label: None,
2231            pass: report.pass,
2232            failures: report.failures,
2233            stage_count: report.stage_count,
2234            source_path,
2235            comparison: None,
2236        });
2237    }
2238    let total = reports.len();
2239    let passed = reports.iter().filter(|report| report.pass).count();
2240    let failed = total.saturating_sub(passed);
2241    ReplayEvalSuiteReport {
2242        pass: failed == 0,
2243        total,
2244        passed,
2245        failed,
2246        cases: reports,
2247    }
2248}
2249
/// Produces a structural diff of two run records.
///
/// Stages are matched by node id across both runs; each stage is reported
/// as "added", "removed", or "changed" (with per-field details for status,
/// outcome, branch, and artifact counts). The report also carries overall
/// status change and count deltas for transitions, artifacts, and
/// checkpoints. `identical` is true only when nothing differs.
pub fn diff_run_records(left: &RunRecord, right: &RunRecord) -> RunDiffReport {
    let mut stage_diffs = Vec::new();
    // Union of node ids from both runs, in sorted order (BTreeSet).
    let mut all_node_ids = BTreeSet::new();
    all_node_ids.extend(left.stages.iter().map(|stage| stage.node_id.clone()));
    all_node_ids.extend(right.stages.iter().map(|stage| stage.node_id.clone()));

    for node_id in all_node_ids {
        let left_stage = left.stages.iter().find(|stage| stage.node_id == node_id);
        let right_stage = right.stages.iter().find(|stage| stage.node_id == node_id);
        match (left_stage, right_stage) {
            (Some(_), None) => stage_diffs.push(RunStageDiffRecord {
                node_id,
                change: "removed".to_string(),
                details: vec!["stage missing from right run".to_string()],
            }),
            (None, Some(_)) => stage_diffs.push(RunStageDiffRecord {
                node_id,
                change: "added".to_string(),
                details: vec!["stage missing from left run".to_string()],
            }),
            (Some(left_stage), Some(right_stage)) => {
                // Present on both sides: collect field-level differences and
                // only record a diff entry when at least one field changed.
                let mut details = Vec::new();
                if left_stage.status != right_stage.status {
                    details.push(format!(
                        "status: {} -> {}",
                        left_stage.status, right_stage.status
                    ));
                }
                if left_stage.outcome != right_stage.outcome {
                    details.push(format!(
                        "outcome: {} -> {}",
                        left_stage.outcome, right_stage.outcome
                    ));
                }
                if left_stage.branch != right_stage.branch {
                    details.push(format!(
                        "branch: {:?} -> {:?}",
                        left_stage.branch, right_stage.branch
                    ));
                }
                // Artifact comparisons are count-only, not content-based.
                if left_stage.produced_artifact_ids.len() != right_stage.produced_artifact_ids.len()
                {
                    details.push(format!(
                        "produced_artifacts: {} -> {}",
                        left_stage.produced_artifact_ids.len(),
                        right_stage.produced_artifact_ids.len()
                    ));
                }
                if left_stage.artifacts.len() != right_stage.artifacts.len() {
                    details.push(format!(
                        "artifact_records: {} -> {}",
                        left_stage.artifacts.len(),
                        right_stage.artifacts.len()
                    ));
                }
                if !details.is_empty() {
                    stage_diffs.push(RunStageDiffRecord {
                        node_id,
                        change: "changed".to_string(),
                        details,
                    });
                }
            }
            // Unreachable: every id in the union exists on at least one side.
            (None, None) => {}
        }
    }

    let status_changed = left.status != right.status;
    let identical = !status_changed
        && stage_diffs.is_empty()
        && left.transitions.len() == right.transitions.len()
        && left.artifacts.len() == right.artifacts.len()
        && left.checkpoints.len() == right.checkpoints.len();

    RunDiffReport {
        left_run_id: left.id.clone(),
        right_run_id: right.id.clone(),
        identical,
        status_changed,
        left_status: left.status.clone(),
        right_status: right.status.clone(),
        stage_diffs,
        transition_count_delta: right.transitions.len() as isize - left.transitions.len() as isize,
        artifact_count_delta: right.artifacts.len() as isize - left.artifacts.len() as isize,
        checkpoint_count_delta: right.checkpoints.len() as isize - left.checkpoints.len() as isize,
    }
}
2337
/// Pushes a capability policy onto the thread-local execution policy stack;
/// the most recently pushed policy becomes the active ceiling.
pub fn push_execution_policy(policy: CapabilityPolicy) {
    EXECUTION_POLICY_STACK.with(|stack| stack.borrow_mut().push(policy));
}
2341
/// Pops the most recently pushed execution policy (no-op on an empty stack).
pub fn pop_execution_policy() {
    EXECUTION_POLICY_STACK.with(|stack| {
        stack.borrow_mut().pop();
    });
}
2347
/// Returns a clone of the active (top-of-stack) execution policy, if any.
pub fn current_execution_policy() -> Option<CapabilityPolicy> {
    EXECUTION_POLICY_STACK.with(|stack| stack.borrow().last().cloned())
}
2351
/// Looks up runtime metadata for `tool` in the active execution policy;
/// `None` when no policy is active or the tool has no metadata entry.
pub fn current_tool_metadata(tool: &str) -> Option<ToolRuntimePolicyMetadata> {
    current_execution_policy().and_then(|policy| policy.tool_metadata.get(tool).cloned())
}
2355
2356fn policy_allows_tool(policy: &CapabilityPolicy, tool: &str) -> bool {
2357    policy.tools.is_empty() || policy.tools.iter().any(|allowed| allowed == tool)
2358}
2359
2360fn policy_allows_capability(policy: &CapabilityPolicy, capability: &str, op: &str) -> bool {
2361    policy.capabilities.is_empty()
2362        || policy
2363            .capabilities
2364            .get(capability)
2365            .is_some_and(|ops| ops.is_empty() || ops.iter().any(|allowed| allowed == op))
2366}
2367
2368fn policy_allows_side_effect(policy: &CapabilityPolicy, requested: &str) -> bool {
2369    fn rank(v: &str) -> usize {
2370        match v {
2371            "none" => 0,
2372            "read_only" => 1,
2373            "workspace_write" => 2,
2374            "process_exec" => 3,
2375            "network" => 4,
2376            _ => 5,
2377        }
2378    }
2379    policy
2380        .side_effect_level
2381        .as_ref()
2382        .map(|allowed| rank(allowed) >= rank(requested))
2383        .unwrap_or(true)
2384}
2385
/// Builds the standard `ToolRejected` categorized error for a policy
/// violation; always returns `Err`.
fn reject_policy(reason: String) -> Result<(), VmError> {
    Err(VmError::CategorizedError {
        message: reason,
        category: crate::value::ErrorCategory::ToolRejected,
    })
}
2392
/// Heuristic mutation classification for tools that carry no explicit
/// metadata, keyed off the (lowercased) tool name. Checks run in priority
/// order: MCP tools, process execution, destructive file ops, workspace
/// writes, and finally a read-only default.
fn fallback_mutation_classification(tool_name: &str) -> String {
    let lower = tool_name.to_ascii_lowercase();
    let classification = if lower.starts_with("mcp_") {
        "host_defined"
    } else if matches!(
        lower.as_str(),
        "exec" | "shell" | "exec_at" | "shell_at" | "run"
    ) || lower.starts_with("run_")
    {
        "ambient_side_effect"
    } else if ["delete", "remove", "move", "rename"]
        .iter()
        .any(|prefix| lower.starts_with(prefix))
    {
        "destructive"
    } else if ["write", "edit", "patch", "create", "scaffold"]
        .iter()
        .any(|fragment| lower.contains(fragment))
        || lower.starts_with("insert")
        || lower.starts_with("replace")
        || lower == "add_import"
    {
        "apply_workspace"
    } else {
        "read_only"
    };
    classification.to_string()
}
2427
2428pub fn current_tool_mutation_classification(tool_name: &str) -> String {
2429    current_tool_metadata(tool_name)
2430        .and_then(|metadata| metadata.mutation_classification)
2431        .unwrap_or_else(|| fallback_mutation_classification(tool_name))
2432}
2433
2434pub fn current_tool_declared_paths(tool_name: &str, args: &serde_json::Value) -> Vec<String> {
2435    let Some(map) = args.as_object() else {
2436        return Vec::new();
2437    };
2438    let path_keys = current_tool_metadata(tool_name)
2439        .map(|metadata| metadata.path_params)
2440        .filter(|keys| !keys.is_empty())
2441        .unwrap_or_else(|| {
2442            vec![
2443                "path".to_string(),
2444                "file".to_string(),
2445                "cwd".to_string(),
2446                "repo".to_string(),
2447                "target".to_string(),
2448                "destination".to_string(),
2449            ]
2450        });
2451    let mut paths = Vec::new();
2452    for key in path_keys {
2453        if let Some(value) = map.get(&key).and_then(|value| value.as_str()) {
2454            if !value.is_empty() {
2455                paths.push(value.to_string());
2456            }
2457        }
2458    }
2459    if let Some(items) = map.get("paths").and_then(|value| value.as_array()) {
2460        for item in items {
2461            if let Some(value) = item.as_str() {
2462                if !value.is_empty() {
2463                    paths.push(value.to_string());
2464                }
2465            }
2466        }
2467    }
2468    paths.sort();
2469    paths.dedup();
2470    paths
2471}
2472
/// Enforces the active execution policy against a builtin invocation.
///
/// With no active policy everything is allowed. Otherwise each builtin is
/// checked against the tool allowlist, the capability/op map, and the
/// side-effect ceiling appropriate to what it does; violations return a
/// `ToolRejected` categorized error via `reject_policy`. Builtins not
/// listed here pass through unchecked.
pub fn enforce_current_policy_for_builtin(name: &str, args: &[VmValue]) -> Result<(), VmError> {
    let Some(policy) = current_execution_policy() else {
        return Ok(());
    };
    match name {
        // Read-only workspace access.
        "read" | "read_file" => {
            if !policy_allows_tool(&policy, name)
                || !policy_allows_capability(&policy, "workspace", "read_text")
            {
                return reject_policy(format!(
                    "builtin '{name}' exceeds workspace.read_text ceiling"
                ));
            }
        }
        "search" | "list_dir" => {
            if !policy_allows_tool(&policy, name)
                || !policy_allows_capability(&policy, "workspace", "list")
            {
                return reject_policy(format!("builtin '{name}' exceeds workspace.list ceiling"));
            }
        }
        "file_exists" | "stat" => {
            if !policy_allows_capability(&policy, "workspace", "exists") {
                return reject_policy(format!("builtin '{name}' exceeds workspace.exists ceiling"));
            }
        }
        // Workspace mutations: all gate on the 'edit' tool plus write capability
        // and the workspace_write side-effect level.
        "edit" | "write_file" | "append_file" | "mkdir" | "copy_file" => {
            if !policy_allows_tool(&policy, "edit")
                || !policy_allows_capability(&policy, "workspace", "write_text")
                || !policy_allows_side_effect(&policy, "workspace_write")
            {
                return reject_policy(format!("builtin '{name}' exceeds workspace write ceiling"));
            }
        }
        "delete_file" => {
            if !policy_allows_capability(&policy, "workspace", "delete")
                || !policy_allows_side_effect(&policy, "workspace_write")
            {
                return reject_policy(
                    "builtin 'delete_file' exceeds workspace.delete ceiling".to_string(),
                );
            }
        }
        "apply_edit" => {
            if !policy_allows_capability(&policy, "workspace", "apply_edit")
                || !policy_allows_side_effect(&policy, "workspace_write")
            {
                return reject_policy(
                    "builtin 'apply_edit' exceeds workspace.apply_edit ceiling".to_string(),
                );
            }
        }
        // Process execution: gated on the 'run' tool and process_exec level.
        "exec" | "exec_at" | "shell" | "shell_at" | "run_command" => {
            if !policy_allows_tool(&policy, "run")
                || !policy_allows_capability(&policy, "process", "exec")
                || !policy_allows_side_effect(&policy, "process_exec")
            {
                return reject_policy(format!("builtin '{name}' exceeds process.exec ceiling"));
            }
        }
        // Network access gates only on the side-effect ceiling.
        "http_get" | "http_post" | "http_put" | "http_patch" | "http_delete" | "http_request" => {
            if !policy_allows_side_effect(&policy, "network") {
                return reject_policy(format!("builtin '{name}' exceeds network ceiling"));
            }
        }
        // MCP builtins are treated like process execution (the server runs as
        // a subprocess), so they share the process.exec gate.
        "mcp_connect"
        | "mcp_call"
        | "mcp_list_tools"
        | "mcp_list_resources"
        | "mcp_list_resource_templates"
        | "mcp_read_resource"
        | "mcp_list_prompts"
        | "mcp_get_prompt"
        | "mcp_server_info"
        | "mcp_disconnect" => {
            if !policy_allows_tool(&policy, "run")
                || !policy_allows_capability(&policy, "process", "exec")
                || !policy_allows_side_effect(&policy, "process_exec")
            {
                return reject_policy(format!("builtin '{name}' exceeds process.exec ceiling"));
            }
        }
        // host_invoke carries its capability/op as the first two arguments;
        // derive the side-effect level to demand from that pair.
        "host_invoke" => {
            let capability = args.first().map(|v| v.display()).unwrap_or_default();
            let op = args.get(1).map(|v| v.display()).unwrap_or_default();
            if !policy_allows_capability(&policy, &capability, &op) {
                return reject_policy(format!(
                    "host_invoke {capability}.{op} exceeds capability ceiling"
                ));
            }
            let requested_side_effect = match (capability.as_str(), op.as_str()) {
                ("workspace", "write_text" | "apply_edit" | "delete") => "workspace_write",
                ("process", "exec") => "process_exec",
                _ => "read_only",
            };
            if !policy_allows_side_effect(&policy, requested_side_effect) {
                return reject_policy(format!(
                    "host_invoke {capability}.{op} exceeds side-effect ceiling"
                ));
            }
        }
        _ => {}
    }
    Ok(())
}
2578
2579pub fn enforce_current_policy_for_bridge_builtin(name: &str) -> Result<(), VmError> {
2580    if current_execution_policy().is_some() {
2581        return reject_policy(format!(
2582            "bridged builtin '{name}' exceeds execution policy; declare an explicit capability/tool surface instead"
2583        ));
2584    }
2585    Ok(())
2586}
2587
2588pub fn enforce_current_policy_for_tool(tool_name: &str) -> Result<(), VmError> {
2589    let Some(policy) = current_execution_policy() else {
2590        return Ok(());
2591    };
2592    if !policy_allows_tool(&policy, tool_name) {
2593        return reject_policy(format!("tool '{tool_name}' exceeds tool ceiling"));
2594    }
2595    if let Some(metadata) = policy.tool_metadata.get(tool_name) {
2596        for (capability, ops) in &metadata.capabilities {
2597            for op in ops {
2598                if !policy_allows_capability(&policy, capability, op) {
2599                    return reject_policy(format!(
2600                        "tool '{tool_name}' exceeds capability ceiling: {capability}.{op}"
2601                    ));
2602                }
2603            }
2604        }
2605        if let Some(side_effect_level) = metadata.side_effect_level.as_deref() {
2606            if !policy_allows_side_effect(&policy, side_effect_level) {
2607                return reject_policy(format!(
2608                    "tool '{tool_name}' exceeds side-effect ceiling: {side_effect_level}"
2609                ));
2610            }
2611        }
2612    }
2613    Ok(())
2614}
2615
2616fn compact_transcript(transcript: &VmValue, keep_last: usize) -> Option<VmValue> {
2617    let dict = transcript.as_dict()?;
2618    let messages = match dict.get("messages") {
2619        Some(VmValue::List(list)) => list.iter().cloned().collect::<Vec<_>>(),
2620        _ => Vec::new(),
2621    };
2622    let retained = messages
2623        .into_iter()
2624        .rev()
2625        .take(keep_last)
2626        .collect::<Vec<_>>()
2627        .into_iter()
2628        .rev()
2629        .collect::<Vec<_>>();
2630    let mut compacted = dict.clone();
2631    compacted.insert(
2632        "messages".to_string(),
2633        VmValue::List(Rc::new(retained.clone())),
2634    );
2635    compacted.insert(
2636        "events".to_string(),
2637        VmValue::List(Rc::new(
2638            crate::llm::helpers::transcript_events_from_messages(&retained),
2639        )),
2640    );
2641    Some(VmValue::Dict(Rc::new(compacted)))
2642}
2643
2644fn redact_transcript_visibility(transcript: &VmValue, visibility: Option<&str>) -> Option<VmValue> {
2645    let Some(visibility) = visibility else {
2646        return Some(transcript.clone());
2647    };
2648    if visibility != "public" && visibility != "public_only" {
2649        return Some(transcript.clone());
2650    }
2651    let dict = transcript.as_dict()?;
2652    let public_messages = match dict.get("messages") {
2653        Some(VmValue::List(list)) => list
2654            .iter()
2655            .filter(|message| {
2656                message
2657                    .as_dict()
2658                    .and_then(|d| d.get("role"))
2659                    .map(|v| v.display())
2660                    .map(|role| role != "tool_result")
2661                    .unwrap_or(true)
2662            })
2663            .cloned()
2664            .collect::<Vec<_>>(),
2665        _ => Vec::new(),
2666    };
2667    let public_events = match dict.get("events") {
2668        Some(VmValue::List(list)) => list
2669            .iter()
2670            .filter(|event| {
2671                event
2672                    .as_dict()
2673                    .and_then(|d| d.get("visibility"))
2674                    .map(|v| v.display())
2675                    .map(|value| value == "public")
2676                    .unwrap_or(true)
2677            })
2678            .cloned()
2679            .collect::<Vec<_>>(),
2680        _ => Vec::new(),
2681    };
2682    let mut redacted = dict.clone();
2683    redacted.insert(
2684        "messages".to_string(),
2685        VmValue::List(Rc::new(public_messages)),
2686    );
2687    redacted.insert("events".to_string(), VmValue::List(Rc::new(public_events)));
2688    Some(VmValue::Dict(Rc::new(redacted)))
2689}
2690
2691pub(crate) fn apply_input_transcript_policy(
2692    transcript: Option<VmValue>,
2693    policy: &TranscriptPolicy,
2694) -> Option<VmValue> {
2695    let mut transcript = transcript;
2696    match policy.mode.as_deref() {
2697        Some("reset") => return None,
2698        Some("fork") => {
2699            if let Some(VmValue::Dict(dict)) = transcript.as_ref() {
2700                let mut forked = dict.as_ref().clone();
2701                forked.insert(
2702                    "id".to_string(),
2703                    VmValue::String(Rc::from(new_id("transcript"))),
2704                );
2705                transcript = Some(VmValue::Dict(Rc::new(forked)));
2706            }
2707        }
2708        _ => {}
2709    }
2710    if policy.compact {
2711        let keep_last = policy.keep_last.unwrap_or(6);
2712        transcript = transcript.and_then(|value| compact_transcript(&value, keep_last));
2713    }
2714    transcript
2715}
2716
2717fn apply_output_transcript_policy(
2718    transcript: Option<VmValue>,
2719    policy: &TranscriptPolicy,
2720) -> Option<VmValue> {
2721    let mut transcript = transcript;
2722    if policy.compact {
2723        let keep_last = policy.keep_last.unwrap_or(6);
2724        transcript = transcript.and_then(|value| compact_transcript(&value, keep_last));
2725    }
2726    transcript.and_then(|value| redact_transcript_visibility(&value, policy.visibility.as_deref()))
2727}
2728
/// Execute a single workflow node against the current task and artifact set.
///
/// Pipeline: select input artifacts per the node's context policy (falling
/// back to the input contract's kinds), apply the input transcript policy,
/// enforce the input contract (transcript requirement plus min/max artifact
/// counts), build the prompt, then run one of three execution paths:
/// a shell verification command (for "verify" nodes with a command), an
/// agent loop (agent mode or tools declared), or a single LLM call. The
/// resulting JSON payload is annotated with provenance fields and wrapped
/// into exactly one output artifact whose lineage is the selected inputs.
///
/// Returns `(raw JSON result, produced artifacts, post-policy transcript)`.
/// Errors when the input contract is violated or the verify command cannot
/// be spawned.
pub async fn execute_stage_node(
    node_id: &str,
    node: &WorkflowNode,
    task: &str,
    artifacts: &[ArtifactRecord],
    transcript: Option<VmValue>,
) -> Result<(serde_json::Value, Vec<ArtifactRecord>, Option<VmValue>), VmError> {
    // An explicit include_kinds on the context policy wins; otherwise inherit
    // the input contract's kinds so selection and contract stay aligned.
    let mut selection_policy = node.context_policy.clone();
    if selection_policy.include_kinds.is_empty() && !node.input_contract.input_kinds.is_empty() {
        selection_policy.include_kinds = node.input_contract.input_kinds.clone();
    }
    let selected = select_artifacts_adaptive(artifacts.to_vec(), &selection_policy);
    let rendered_context = render_artifacts_context(&selected, &node.context_policy);
    // Input transcript policy runs before contract checks so a "reset" mode
    // can legitimately fail a require_transcript contract.
    let transcript = apply_input_transcript_policy(transcript, &node.transcript_policy);
    if node.input_contract.require_transcript && transcript.is_none() {
        return Err(VmError::Runtime(format!(
            "workflow stage {node_id} requires transcript input"
        )));
    }
    if let Some(min_inputs) = node.input_contract.min_inputs {
        if selected.len() < min_inputs {
            return Err(VmError::Runtime(format!(
                "workflow stage {node_id} requires at least {min_inputs} input artifacts"
            )));
        }
    }
    if let Some(max_inputs) = node.input_contract.max_inputs {
        if selected.len() > max_inputs {
            return Err(VmError::Runtime(format!(
                "workflow stage {node_id} accepts at most {max_inputs} input artifacts"
            )));
        }
    }
    // Prompt = rendered artifact context (if any) + labeled task text.
    let prompt = if rendered_context.is_empty() {
        task.to_string()
    } else {
        format!(
            "{rendered_context}\n\n{}:\n{task}",
            node.task_label
                .clone()
                .unwrap_or_else(|| "Task".to_string())
        )
    };

    // Tool-calling wire format is host-configurable; blank values fall back
    // to "text".
    let tool_format = std::env::var("HARN_AGENT_TOOL_FORMAT")
        .ok()
        .filter(|value| !value.trim().is_empty())
        .unwrap_or_else(|| "text".to_string());
    let mut llm_result = if node.kind == "verify" {
        // Verify nodes with a non-empty "command" run it through the platform
        // shell instead of calling the LLM.
        if let Some(command) = node
            .verify
            .as_ref()
            .and_then(|verify| verify.as_object())
            .and_then(|verify| verify.get("command"))
            .and_then(|value| value.as_str())
            .map(str::trim)
            .filter(|value| !value.is_empty())
        {
            let mut process = if cfg!(target_os = "windows") {
                let mut cmd = tokio::process::Command::new("cmd");
                cmd.arg("/C").arg(command);
                cmd
            } else {
                let mut cmd = tokio::process::Command::new("/bin/sh");
                cmd.arg("-lc").arg(command);
                cmd
            };
            // No interactive stdin: the command must run unattended.
            process.stdin(std::process::Stdio::null());
            // Inherit cwd/env from the VM's current execution context, when set.
            if let Some(context) = crate::stdlib::process::current_execution_context() {
                if let Some(cwd) = context.cwd.filter(|cwd| !cwd.is_empty()) {
                    process.current_dir(cwd);
                }
                if !context.env.is_empty() {
                    process.envs(context.env);
                }
            }
            let output = process
                .output()
                .await
                .map_err(|e| VmError::Runtime(format!("workflow verify exec failed: {e}")))?;
            let stdout = String::from_utf8_lossy(&output.stdout).to_string();
            let stderr = String::from_utf8_lossy(&output.stderr).to_string();
            // "text" carries both streams; empty streams are omitted from the join.
            let combined = if stderr.is_empty() {
                stdout.clone()
            } else if stdout.is_empty() {
                stderr.clone()
            } else {
                format!("{stdout}\n{stderr}")
            };
            serde_json::json!({
                "status": "completed",
                "text": combined,
                "visible_text": combined,
                "command": command,
                "stdout": stdout,
                "stderr": stderr,
                "exit_status": output.status.code().unwrap_or(-1),
                "success": output.status.success(),
            })
        } else {
            // Verify node without a command: trivially completed, empty output.
            serde_json::json!({
                "status": "completed",
                "text": "",
                "visible_text": "",
            })
        }
    } else {
        // Non-verify nodes call the LLM; assemble call options from the
        // node's model policy (only the fields that are actually set).
        let mut options = BTreeMap::new();
        if let Some(provider) = &node.model_policy.provider {
            options.insert(
                "provider".to_string(),
                VmValue::String(Rc::from(provider.clone())),
            );
        }
        if let Some(model) = &node.model_policy.model {
            options.insert(
                "model".to_string(),
                VmValue::String(Rc::from(model.clone())),
            );
        }
        if let Some(model_tier) = &node.model_policy.model_tier {
            options.insert(
                "model_tier".to_string(),
                VmValue::String(Rc::from(model_tier.clone())),
            );
        }
        if let Some(temperature) = node.model_policy.temperature {
            options.insert("temperature".to_string(), VmValue::Float(temperature));
        }
        if let Some(max_tokens) = node.model_policy.max_tokens {
            options.insert("max_tokens".to_string(), VmValue::Int(max_tokens));
        }
        // Tools are forwarded only when the node declares a non-null tool
        // surface that actually names at least one tool.
        let tool_names = workflow_tool_names(&node.tools);
        if !matches!(node.tools, serde_json::Value::Null) && !tool_names.is_empty() {
            options.insert(
                "tools".to_string(),
                crate::stdlib::json_to_vm_value(&node.tools),
            );
        }
        if let Some(transcript) = transcript.clone() {
            options.insert("transcript".to_string(), transcript);
        }

        // Positional args: prompt, optional system prompt, options dict.
        let args = vec![
            VmValue::String(Rc::from(prompt.clone())),
            node.system
                .clone()
                .map(|s| VmValue::String(Rc::from(s)))
                .unwrap_or(VmValue::Nil),
            VmValue::Dict(Rc::new(options)),
        ];
        let mut opts = extract_llm_options(&args)?;

        // Agent loop when the node is in agent mode or exposes tools;
        // otherwise a single LLM round-trip.
        if node.mode.as_deref() == Some("agent") || !tool_names.is_empty() {
            crate::llm::run_agent_loop_internal(
                &mut opts,
                crate::llm::AgentLoopConfig {
                    persistent: true,
                    max_iterations: 12,
                    max_nudges: 3,
                    nudge: None,
                    done_sentinel: node.done_sentinel.clone(),
                    tool_retries: 0,
                    tool_backoff_ms: 1000,
                    tool_format: tool_format.clone(),
                    auto_compact: None,
                    policy: None,
                    daemon: false,
                    llm_retries: 2,
                    llm_backoff_ms: 2000,
                },
            )
            .await?
        } else {
            let result = vm_call_llm_full(&opts).await?;
            crate::llm::agent_loop_result_from_llm(&result, opts)
        }
    };
    // Annotate the result payload with provenance for auditing/replay:
    // prompts, rendered context, selected artifact ids/titles, tool format.
    if let Some(payload) = llm_result.as_object_mut() {
        payload.insert("prompt".to_string(), serde_json::json!(prompt));
        payload.insert(
            "system_prompt".to_string(),
            serde_json::json!(node.system.clone().unwrap_or_default()),
        );
        payload.insert(
            "rendered_context".to_string(),
            serde_json::json!(rendered_context),
        );
        payload.insert(
            "selected_artifact_ids".to_string(),
            serde_json::json!(selected
                .iter()
                .map(|artifact| artifact.id.clone())
                .collect::<Vec<_>>()),
        );
        payload.insert(
            "selected_artifact_titles".to_string(),
            serde_json::json!(selected
                .iter()
                .map(|artifact| artifact.title.clone())
                .collect::<Vec<_>>()),
        );
        payload.insert(
            "tool_calling_mode".to_string(),
            serde_json::json!(tool_format.clone()),
        );
    }

    let visible_text = llm_result["text"].as_str().unwrap_or_default().to_string();
    // The output transcript (if the LLM produced one) passes through the
    // node's output transcript policy before being returned.
    let transcript = llm_result
        .get("transcript")
        .cloned()
        .map(|value| crate::stdlib::json_to_vm_value(&value));
    let transcript = apply_output_transcript_policy(transcript, &node.transcript_policy);
    // Output kind: explicit contract kind wins, else "verification_result"
    // for verify nodes and plain "artifact" otherwise.
    let output_kind = node
        .output_contract
        .output_kinds
        .first()
        .cloned()
        .unwrap_or_else(|| {
            if node.kind == "verify" {
                "verification_result".to_string()
            } else {
                "artifact".to_string()
            }
        });
    let mut metadata = BTreeMap::new();
    metadata.insert(
        "input_artifact_ids".to_string(),
        serde_json::json!(selected
            .iter()
            .map(|artifact| artifact.id.clone())
            .collect::<Vec<_>>()),
    );
    metadata.insert("node_kind".to_string(), serde_json::json!(node.kind));
    // Wrap the stage result in a single artifact; lineage records the
    // selected input artifacts so downstream stages can trace provenance.
    let artifact = ArtifactRecord {
        type_name: "artifact".to_string(),
        id: new_id("artifact"),
        kind: output_kind,
        title: Some(format!("stage {node_id} output")),
        text: Some(visible_text),
        data: Some(llm_result.clone()),
        source: Some(node_id.to_string()),
        created_at: now_rfc3339(),
        freshness: Some("fresh".to_string()),
        priority: None,
        lineage: selected
            .iter()
            .map(|artifact| artifact.id.clone())
            .collect(),
        relevance: Some(1.0),
        estimated_tokens: None,
        stage: Some(node_id.to_string()),
        metadata,
    }
    .normalize();

    Ok((llm_result, vec![artifact], transcript))
}
2988
2989pub fn next_nodes_for(
2990    graph: &WorkflowGraph,
2991    current: &str,
2992    branch: Option<&str>,
2993) -> Vec<WorkflowEdge> {
2994    let mut matching: Vec<WorkflowEdge> = graph
2995        .edges
2996        .iter()
2997        .filter(|edge| edge.from == current && edge.branch.as_deref() == branch)
2998        .cloned()
2999        .collect();
3000    if matching.is_empty() {
3001        matching = graph
3002            .edges
3003            .iter()
3004            .filter(|edge| edge.from == current && edge.branch.is_none())
3005            .cloned()
3006            .collect();
3007    }
3008    matching
3009}
3010
3011pub fn next_node_for(graph: &WorkflowGraph, current: &str, branch: &str) -> Option<String> {
3012    next_nodes_for(graph, current, Some(branch))
3013        .into_iter()
3014        .next()
3015        .map(|edge| edge.to)
3016}
3017
3018pub fn append_audit_entry(
3019    graph: &mut WorkflowGraph,
3020    op: &str,
3021    node_id: Option<String>,
3022    reason: Option<String>,
3023    metadata: BTreeMap<String, serde_json::Value>,
3024) {
3025    graph.audit_log.push(WorkflowAuditEntry {
3026        id: new_id("audit"),
3027        op: op.to_string(),
3028        node_id,
3029        timestamp: now_rfc3339(),
3030        reason,
3031        metadata,
3032    });
3033}
3034
3035pub fn builtin_ceiling() -> CapabilityPolicy {
3036    CapabilityPolicy {
3037        // Runtime-owned ceiling is capability-based, not product-tool-based.
3038        // Integrators define concrete tool surfaces in workflow graphs / registries.
3039        tools: Vec::new(),
3040        capabilities: BTreeMap::from([
3041            (
3042                "workspace".to_string(),
3043                vec![
3044                    "read_text".to_string(),
3045                    "write_text".to_string(),
3046                    "apply_edit".to_string(),
3047                    "delete".to_string(),
3048                    "exists".to_string(),
3049                    "list".to_string(),
3050                ],
3051            ),
3052            ("process".to_string(), vec!["exec".to_string()]),
3053        ]),
3054        workspace_roots: Vec::new(),
3055        side_effect_level: Some("network".to_string()),
3056        recursion_limit: Some(8),
3057        tool_arg_constraints: Vec::new(),
3058        tool_metadata: BTreeMap::new(),
3059    }
3060}
3061
3062#[cfg(test)]
3063mod tests {
3064    use super::*;
3065
    // intersect() must fail when the requested policy asks for a tool ("edit")
    // beyond the host ceiling's tool list.
    #[test]
    fn capability_intersection_rejects_privilege_expansion() {
        let ceiling = CapabilityPolicy {
            tools: vec!["read".to_string()],
            side_effect_level: Some("read_only".to_string()),
            recursion_limit: Some(2),
            ..Default::default()
        };
        let requested = CapabilityPolicy {
            tools: vec!["read".to_string(), "edit".to_string()],
            ..Default::default()
        };
        let error = ceiling.intersect(&requested).unwrap_err();
        assert!(error.contains("host ceiling"));
    }
3081
    // normalize() on an empty record must fill the generated session id and
    // the scope/approval defaults declared in MutationSessionRecord::normalize.
    #[test]
    fn mutation_session_normalize_fills_defaults() {
        let normalized = MutationSessionRecord::default().normalize();
        assert!(normalized.session_id.starts_with("session_"));
        assert_eq!(normalized.mutation_scope, "read_only");
        assert_eq!(normalized.approval_mode, "host_enforced");
    }
3089
    // Installing a session makes it observable via current_mutation_session();
    // installing None clears it again.
    #[test]
    fn install_current_mutation_session_round_trips() {
        install_current_mutation_session(Some(MutationSessionRecord {
            session_id: "session_test".to_string(),
            mutation_scope: "apply_workspace".to_string(),
            approval_mode: "explicit".to_string(),
            ..Default::default()
        }));
        let current = current_mutation_session().expect("session installed");
        assert_eq!(current.session_id, "session_test");
        assert_eq!(current.mutation_scope, "apply_workspace");
        assert_eq!(current.approval_mode, "explicit");

        install_current_mutation_session(None);
        assert!(current_mutation_session().is_none());
    }
3106
    // With any policy on the stack, a bridged builtin with no declared
    // capability surface is rejected with the ToolRejected error category.
    #[test]
    fn active_execution_policy_rejects_unknown_bridge_builtin() {
        push_execution_policy(CapabilityPolicy {
            tools: vec!["read".to_string()],
            capabilities: BTreeMap::from([(
                "workspace".to_string(),
                vec!["read_text".to_string()],
            )]),
            side_effect_level: Some("read_only".to_string()),
            recursion_limit: Some(1),
            ..Default::default()
        });
        let error = enforce_current_policy_for_bridge_builtin("custom_host_builtin").unwrap_err();
        // Pop before asserting so the thread-local stack is clean even on failure paths above.
        pop_execution_policy();
        assert!(matches!(
            error,
            VmError::CategorizedError {
                category: crate::value::ErrorCategory::ToolRejected,
                ..
            }
        ));
    }
3129
    // MCP builtins require process.exec; a read-only policy must reject
    // mcp_connect with the ToolRejected error category.
    #[test]
    fn active_execution_policy_rejects_mcp_escape_hatch() {
        push_execution_policy(CapabilityPolicy {
            tools: vec!["read".to_string()],
            capabilities: BTreeMap::from([(
                "workspace".to_string(),
                vec!["read_text".to_string()],
            )]),
            side_effect_level: Some("read_only".to_string()),
            recursion_limit: Some(1),
            ..Default::default()
        });
        let error = enforce_current_policy_for_builtin("mcp_connect", &[]).unwrap_err();
        pop_execution_policy();
        assert!(matches!(
            error,
            VmError::CategorizedError {
                category: crate::value::ErrorCategory::ToolRejected,
                ..
            }
        ));
    }
3152
    // A legacy act/verify/repair dict must normalize into a workflow_graph
    // containing all three nodes with "act" as the entry point.
    #[test]
    fn workflow_normalization_upgrades_legacy_act_verify_repair_shape() {
        let value = crate::stdlib::json_to_vm_value(&serde_json::json!({
            "name": "legacy",
            "act": {"mode": "llm"},
            "verify": {"kind": "verify"},
            "repair": {"mode": "agent"},
        }));
        let graph = normalize_workflow_value(&value).unwrap();
        assert_eq!(graph.type_name, "workflow_graph");
        assert!(graph.nodes.contains_key("act"));
        assert!(graph.nodes.contains_key("verify"));
        assert!(graph.nodes.contains_key("repair"));
        assert_eq!(graph.entry, "act");
    }
3168
    // A node may declare its tool surface as a tool_registry object;
    // workflow_tool_names must extract the registry's tool names.
    #[test]
    fn workflow_normalization_accepts_tool_registry_nodes() {
        let value = crate::stdlib::json_to_vm_value(&serde_json::json!({
            "name": "registry_tools",
            "entry": "implement",
            "nodes": {
                "implement": {
                    "kind": "stage",
                    "mode": "agent",
                    "tools": {
                        "_type": "tool_registry",
                        "tools": [
                            {"name": "read", "description": "Read files"},
                            {"name": "run", "description": "Run commands"}
                        ]
                    }
                }
            },
            "edges": []
        }));
        let graph = normalize_workflow_value(&value).unwrap();
        let node = graph.nodes.get("implement").unwrap();
        assert_eq!(workflow_tool_names(&node.tools), vec!["read", "run"]);
    }
3193
    // With max_artifacts=2 and a tight token budget, selection must keep
    // exactly two of the three candidates (the oversized body is trimmed out
    // by the budget) while respecting the configured preferences.
    #[test]
    fn artifact_selection_honors_budget_and_priority() {
        let policy = ContextPolicy {
            max_artifacts: Some(2),
            max_tokens: Some(30),
            prefer_recent: true,
            prefer_fresh: true,
            prioritize_kinds: vec!["verification_result".to_string()],
            ..Default::default()
        };
        let artifacts = vec![
            ArtifactRecord {
                type_name: "artifact".to_string(),
                id: "a".to_string(),
                kind: "summary".to_string(),
                text: Some("short".to_string()),
                relevance: Some(0.9),
                created_at: now_rfc3339(),
                ..Default::default()
            }
            .normalize(),
            ArtifactRecord {
                type_name: "artifact".to_string(),
                id: "b".to_string(),
                kind: "summary".to_string(),
                text: Some("this is a much larger artifact body".to_string()),
                relevance: Some(1.0),
                created_at: now_rfc3339(),
                ..Default::default()
            }
            .normalize(),
            ArtifactRecord {
                type_name: "artifact".to_string(),
                id: "c".to_string(),
                kind: "summary".to_string(),
                text: Some("tiny".to_string()),
                relevance: Some(0.5),
                created_at: now_rfc3339(),
                ..Default::default()
            }
            .normalize(),
        ];
        let selected = select_artifacts(artifacts, &policy);
        assert_eq!(selected.len(), 2);
        assert!(selected.iter().all(|artifact| artifact.kind == "summary"));
    }
3240
    // A condition node with only a "true" branch edge must fail validation,
    // and the error message should mention both required branches.
    #[test]
    fn workflow_validation_rejects_condition_without_true_false_edges() {
        let graph = WorkflowGraph {
            entry: "gate".to_string(),
            nodes: BTreeMap::from([(
                "gate".to_string(),
                WorkflowNode {
                    id: Some("gate".to_string()),
                    kind: "condition".to_string(),
                    ..Default::default()
                },
            )]),
            edges: vec![WorkflowEdge {
                from: "gate".to_string(),
                to: "next".to_string(),
                branch: Some("true".to_string()),
                label: None,
            }],
            ..Default::default()
        };
        let report = validate_workflow(&graph, None);
        assert!(!report.valid);
        assert!(report
            .errors
            .iter()
            .any(|error| error.contains("true") && error.contains("false")));
    }
3268
    // A fixture derived from a run must evaluate cleanly against that same
    // run (round trip: no failures reported).
    #[test]
    fn replay_fixture_round_trip_passes() {
        let run = RunRecord {
            type_name: "run_record".to_string(),
            id: "run_1".to_string(),
            workflow_id: "wf".to_string(),
            workflow_name: Some("demo".to_string()),
            task: "demo".to_string(),
            status: "completed".to_string(),
            started_at: "1".to_string(),
            finished_at: Some("2".to_string()),
            parent_run_id: None,
            root_run_id: Some("run_1".to_string()),
            stages: vec![RunStageRecord {
                id: "stage_1".to_string(),
                node_id: "act".to_string(),
                kind: "stage".to_string(),
                status: "completed".to_string(),
                outcome: "success".to_string(),
                branch: Some("success".to_string()),
                started_at: "1".to_string(),
                finished_at: Some("2".to_string()),
                visible_text: Some("done".to_string()),
                private_reasoning: None,
                transcript: None,
                verification: None,
                usage: None,
                artifacts: vec![ArtifactRecord {
                    type_name: "artifact".to_string(),
                    id: "a1".to_string(),
                    kind: "summary".to_string(),
                    text: Some("done".to_string()),
                    created_at: "1".to_string(),
                    ..Default::default()
                }
                .normalize()],
                consumed_artifact_ids: vec![],
                produced_artifact_ids: vec!["a1".to_string()],
                attempts: vec![],
                metadata: BTreeMap::new(),
            }],
            transitions: vec![],
            checkpoints: vec![],
            pending_nodes: vec![],
            completed_nodes: vec!["act".to_string()],
            child_runs: vec![],
            artifacts: vec![],
            policy: CapabilityPolicy::default(),
            execution: None,
            transcript: None,
            usage: None,
            replay_fixture: None,
            metadata: BTreeMap::new(),
            persisted_path: None,
        };
        let fixture = replay_fixture_from_run(&run);
        let report = evaluate_run_against_fixture(&run, &fixture);
        assert!(report.pass);
        assert!(report.failures.is_empty());
    }
3329
    // Evaluating a good run and a failed run against the good run's fixture
    // must report one failed case and mark the suite as not passing.
    #[test]
    fn replay_eval_suite_reports_failed_case() {
        let good = RunRecord {
            id: "run_good".to_string(),
            workflow_id: "wf".to_string(),
            status: "completed".to_string(),
            stages: vec![RunStageRecord {
                node_id: "act".to_string(),
                status: "completed".to_string(),
                outcome: "success".to_string(),
                ..Default::default()
            }],
            ..Default::default()
        };
        let bad = RunRecord {
            id: "run_bad".to_string(),
            workflow_id: "wf".to_string(),
            status: "failed".to_string(),
            stages: vec![RunStageRecord {
                node_id: "act".to_string(),
                status: "failed".to_string(),
                outcome: "error".to_string(),
                ..Default::default()
            }],
            ..Default::default()
        };
        // Both cases are evaluated against the *good* run's fixture on purpose.
        let suite = evaluate_run_suite(vec![
            (
                good.clone(),
                replay_fixture_from_run(&good),
                Some("good.json".to_string()),
            ),
            (
                bad.clone(),
                replay_fixture_from_run(&good),
                Some("bad.json".to_string()),
            ),
        ]);
        assert!(!suite.pass);
        assert_eq!(suite.total, 2);
        assert_eq!(suite.failed, 1);
        assert!(suite.cases.iter().any(|case| !case.pass));
    }
3373
3374    #[test]
3375    fn run_diff_reports_changed_stage() {
3376        let left = RunRecord {
3377            id: "left".to_string(),
3378            workflow_id: "wf".to_string(),
3379            status: "completed".to_string(),
3380            stages: vec![RunStageRecord {
3381                node_id: "act".to_string(),
3382                status: "completed".to_string(),
3383                outcome: "success".to_string(),
3384                ..Default::default()
3385            }],
3386            ..Default::default()
3387        };
3388        let right = RunRecord {
3389            id: "right".to_string(),
3390            workflow_id: "wf".to_string(),
3391            status: "failed".to_string(),
3392            stages: vec![RunStageRecord {
3393                node_id: "act".to_string(),
3394                status: "failed".to_string(),
3395                outcome: "error".to_string(),
3396                ..Default::default()
3397            }],
3398            ..Default::default()
3399        };
3400        let diff = diff_run_records(&left, &right);
3401        assert!(diff.status_changed);
3402        assert!(!diff.identical);
3403        assert_eq!(diff.stage_diffs.len(), 1);
3404    }
3405
3406    #[test]
3407    fn eval_suite_manifest_can_fail_on_baseline_diff() {
3408        let temp_dir =
3409            std::env::temp_dir().join(format!("harn-eval-suite-{}", uuid::Uuid::now_v7()));
3410        std::fs::create_dir_all(&temp_dir).unwrap();
3411        let baseline_path = temp_dir.join("baseline.json");
3412        let candidate_path = temp_dir.join("candidate.json");
3413
3414        let baseline = RunRecord {
3415            id: "baseline".to_string(),
3416            workflow_id: "wf".to_string(),
3417            status: "completed".to_string(),
3418            stages: vec![RunStageRecord {
3419                node_id: "act".to_string(),
3420                status: "completed".to_string(),
3421                outcome: "success".to_string(),
3422                ..Default::default()
3423            }],
3424            ..Default::default()
3425        };
3426        let candidate = RunRecord {
3427            id: "candidate".to_string(),
3428            workflow_id: "wf".to_string(),
3429            status: "failed".to_string(),
3430            stages: vec![RunStageRecord {
3431                node_id: "act".to_string(),
3432                status: "failed".to_string(),
3433                outcome: "error".to_string(),
3434                ..Default::default()
3435            }],
3436            ..Default::default()
3437        };
3438
3439        save_run_record(&baseline, Some(baseline_path.to_str().unwrap())).unwrap();
3440        save_run_record(&candidate, Some(candidate_path.to_str().unwrap())).unwrap();
3441
3442        let manifest = EvalSuiteManifest {
3443            base_dir: Some(temp_dir.display().to_string()),
3444            cases: vec![EvalSuiteCase {
3445                label: Some("candidate".to_string()),
3446                run_path: "candidate.json".to_string(),
3447                fixture_path: None,
3448                compare_to: Some("baseline.json".to_string()),
3449            }],
3450            ..Default::default()
3451        };
3452        let suite = evaluate_run_suite_manifest(&manifest).unwrap();
3453        assert!(!suite.pass);
3454        assert_eq!(suite.failed, 1);
3455        assert!(suite.cases[0].comparison.is_some());
3456        assert!(suite.cases[0]
3457            .failures
3458            .iter()
3459            .any(|failure| failure.contains("baseline")));
3460    }
3461
3462    #[test]
3463    fn render_unified_diff_marks_removed_and_added_lines() {
3464        let diff = render_unified_diff(Some("src/main.rs"), "old\nsame", "new\nsame");
3465        assert!(diff.contains("--- a/src/main.rs"));
3466        assert!(diff.contains("+++ b/src/main.rs"));
3467        assert!(diff.contains("-old"));
3468        assert!(diff.contains("+new"));
3469        assert!(diff.contains(" same"));
3470    }
3471
3472    #[test]
3473    fn execution_policy_rejects_process_exec_when_read_only() {
3474        push_execution_policy(CapabilityPolicy {
3475            side_effect_level: Some("read_only".to_string()),
3476            capabilities: BTreeMap::from([("process".to_string(), vec!["exec".to_string()])]),
3477            ..Default::default()
3478        });
3479        let result = enforce_current_policy_for_builtin("exec", &[]);
3480        pop_execution_policy();
3481        assert!(result.is_err());
3482    }
3483
3484    #[test]
3485    fn execution_policy_rejects_unlisted_tool() {
3486        push_execution_policy(CapabilityPolicy {
3487            tools: vec!["read".to_string()],
3488            ..Default::default()
3489        });
3490        let result = enforce_current_policy_for_tool("edit");
3491        pop_execution_policy();
3492        assert!(result.is_err());
3493    }
3494
3495    // ── Tool hook tests ──────────────────────────────────────────────
3496
3497    #[test]
3498    fn pre_tool_hook_deny_blocks_execution() {
3499        clear_tool_hooks();
3500        register_tool_hook(ToolHook {
3501            pattern: "dangerous_*".to_string(),
3502            pre: Some(Rc::new(|_name, _args| {
3503                PreToolAction::Deny("blocked by policy".to_string())
3504            })),
3505            post: None,
3506        });
3507        let result = run_pre_tool_hooks("dangerous_delete", &serde_json::json!({}));
3508        clear_tool_hooks();
3509        assert!(matches!(result, PreToolAction::Deny(_)));
3510    }
3511
3512    #[test]
3513    fn pre_tool_hook_allow_passes_through() {
3514        clear_tool_hooks();
3515        register_tool_hook(ToolHook {
3516            pattern: "safe_*".to_string(),
3517            pre: Some(Rc::new(|_name, _args| PreToolAction::Allow)),
3518            post: None,
3519        });
3520        let result = run_pre_tool_hooks("safe_read", &serde_json::json!({}));
3521        clear_tool_hooks();
3522        assert!(matches!(result, PreToolAction::Allow));
3523    }
3524
3525    #[test]
3526    fn pre_tool_hook_modify_rewrites_args() {
3527        clear_tool_hooks();
3528        register_tool_hook(ToolHook {
3529            pattern: "*".to_string(),
3530            pre: Some(Rc::new(|_name, _args| {
3531                PreToolAction::Modify(serde_json::json!({"path": "/sanitized"}))
3532            })),
3533            post: None,
3534        });
3535        let result = run_pre_tool_hooks("read_file", &serde_json::json!({"path": "/etc/passwd"}));
3536        clear_tool_hooks();
3537        match result {
3538            PreToolAction::Modify(args) => assert_eq!(args["path"], "/sanitized"),
3539            _ => panic!("expected Modify"),
3540        }
3541    }
3542
3543    #[test]
3544    fn post_tool_hook_modifies_result() {
3545        clear_tool_hooks();
3546        register_tool_hook(ToolHook {
3547            pattern: "exec".to_string(),
3548            pre: None,
3549            post: Some(Rc::new(|_name, result| {
3550                if result.contains("SECRET") {
3551                    PostToolAction::Modify("[REDACTED]".to_string())
3552                } else {
3553                    PostToolAction::Pass
3554                }
3555            })),
3556        });
3557        let result = run_post_tool_hooks("exec", "output with SECRET data");
3558        let clean = run_post_tool_hooks("exec", "clean output");
3559        clear_tool_hooks();
3560        assert_eq!(result, "[REDACTED]");
3561        assert_eq!(clean, "clean output");
3562    }
3563
3564    #[test]
3565    fn unmatched_hook_pattern_does_not_fire() {
3566        clear_tool_hooks();
3567        register_tool_hook(ToolHook {
3568            pattern: "exec".to_string(),
3569            pre: Some(Rc::new(|_name, _args| {
3570                PreToolAction::Deny("should not match".to_string())
3571            })),
3572            post: None,
3573        });
3574        let result = run_pre_tool_hooks("read_file", &serde_json::json!({}));
3575        clear_tool_hooks();
3576        assert!(matches!(result, PreToolAction::Allow));
3577    }
3578
3579    #[test]
3580    fn glob_match_patterns() {
3581        assert!(glob_match("*", "anything"));
3582        assert!(glob_match("exec*", "exec_at"));
3583        assert!(glob_match("*_file", "read_file"));
3584        assert!(!glob_match("exec*", "read_file"));
3585        assert!(glob_match("read_file", "read_file"));
3586        assert!(!glob_match("read_file", "write_file"));
3587    }
3588
3589    // ── Auto-compaction tests ────────────────────────────────────────
3590
3591    #[test]
3592    fn microcompact_snips_large_output() {
3593        let large = "x".repeat(50_000);
3594        let result = microcompact_tool_output(&large, 10_000);
3595        assert!(result.len() < 15_000);
3596        assert!(result.contains("snipped"));
3597    }
3598
3599    #[test]
3600    fn microcompact_preserves_small_output() {
3601        let small = "hello world";
3602        let result = microcompact_tool_output(small, 10_000);
3603        assert_eq!(result, small);
3604    }
3605
3606    #[test]
3607    fn auto_compact_messages_reduces_count() {
3608        let mut messages: Vec<serde_json::Value> = (0..20)
3609            .map(|i| serde_json::json!({"role": "user", "content": format!("message {i}")}))
3610            .collect();
3611        let runtime = tokio::runtime::Builder::new_current_thread()
3612            .enable_all()
3613            .build()
3614            .unwrap();
3615        let compacted = runtime.block_on(auto_compact_messages(
3616            &mut messages,
3617            &AutoCompactConfig {
3618                compact_strategy: CompactStrategy::Truncate,
3619                keep_last: 6,
3620                ..Default::default()
3621            },
3622            None,
3623        ));
3624        assert!(compacted.unwrap());
3625        assert!(messages.len() <= 7); // 6 kept + 1 summary
3626        assert!(messages[0]["content"]
3627            .as_str()
3628            .unwrap()
3629            .contains("auto-compacted"));
3630    }
3631
3632    #[test]
3633    fn auto_compact_noop_when_under_threshold() {
3634        let mut messages: Vec<serde_json::Value> = (0..4)
3635            .map(|i| serde_json::json!({"role": "user", "content": format!("msg {i}")}))
3636            .collect();
3637        let runtime = tokio::runtime::Builder::new_current_thread()
3638            .enable_all()
3639            .build()
3640            .unwrap();
3641        let compacted = runtime.block_on(auto_compact_messages(
3642            &mut messages,
3643            &AutoCompactConfig {
3644                compact_strategy: CompactStrategy::Truncate,
3645                keep_last: 6,
3646                ..Default::default()
3647            },
3648            None,
3649        ));
3650        assert!(!compacted.unwrap());
3651        assert_eq!(messages.len(), 4);
3652    }
3653
3654    #[test]
3655    fn estimate_message_tokens_basic() {
3656        let messages = vec![
3657            serde_json::json!({"role": "user", "content": "a".repeat(400)}),
3658            serde_json::json!({"role": "assistant", "content": "b".repeat(400)}),
3659        ];
3660        let tokens = estimate_message_tokens(&messages);
3661        assert_eq!(tokens, 200); // 800 chars / 4
3662    }
3663
3664    // ── Artifact dedup and microcompaction tests ─────────────────────
3665
3666    #[test]
3667    fn dedup_artifacts_removes_duplicates() {
3668        let mut artifacts = vec![
3669            ArtifactRecord {
3670                id: "a1".to_string(),
3671                kind: "test".to_string(),
3672                text: Some("duplicate content".to_string()),
3673                ..Default::default()
3674            },
3675            ArtifactRecord {
3676                id: "a2".to_string(),
3677                kind: "test".to_string(),
3678                text: Some("duplicate content".to_string()),
3679                ..Default::default()
3680            },
3681            ArtifactRecord {
3682                id: "a3".to_string(),
3683                kind: "test".to_string(),
3684                text: Some("unique content".to_string()),
3685                ..Default::default()
3686            },
3687        ];
3688        dedup_artifacts(&mut artifacts);
3689        assert_eq!(artifacts.len(), 2);
3690    }
3691
3692    #[test]
3693    fn microcompact_artifact_snips_oversized() {
3694        let mut artifact = ArtifactRecord {
3695            id: "a1".to_string(),
3696            kind: "test".to_string(),
3697            text: Some("x".repeat(10_000)),
3698            estimated_tokens: Some(2_500),
3699            ..Default::default()
3700        };
3701        microcompact_artifact(&mut artifact, 500);
3702        assert!(artifact.text.as_ref().unwrap().len() < 5_000);
3703        assert_eq!(artifact.estimated_tokens, Some(500));
3704    }
3705
3706    // ── Tool argument constraint tests ───────────────────────────────
3707
3708    #[test]
3709    fn arg_constraint_allows_matching_pattern() {
3710        let policy = CapabilityPolicy {
3711            tool_arg_constraints: vec![ToolArgConstraint {
3712                tool: "exec".to_string(),
3713                arg_patterns: vec!["cargo *".to_string()],
3714            }],
3715            ..Default::default()
3716        };
3717        let result = enforce_tool_arg_constraints(
3718            &policy,
3719            "exec",
3720            &serde_json::json!({"command": "cargo test"}),
3721        );
3722        assert!(result.is_ok());
3723    }
3724
3725    #[test]
3726    fn arg_constraint_rejects_non_matching_pattern() {
3727        let policy = CapabilityPolicy {
3728            tool_arg_constraints: vec![ToolArgConstraint {
3729                tool: "exec".to_string(),
3730                arg_patterns: vec!["cargo *".to_string()],
3731            }],
3732            ..Default::default()
3733        };
3734        let result = enforce_tool_arg_constraints(
3735            &policy,
3736            "exec",
3737            &serde_json::json!({"command": "rm -rf /"}),
3738        );
3739        assert!(result.is_err());
3740    }
3741
3742    #[test]
3743    fn arg_constraint_ignores_unmatched_tool() {
3744        let policy = CapabilityPolicy {
3745            tool_arg_constraints: vec![ToolArgConstraint {
3746                tool: "exec".to_string(),
3747                arg_patterns: vec!["cargo *".to_string()],
3748            }],
3749            ..Default::default()
3750        };
3751        let result = enforce_tool_arg_constraints(
3752            &policy,
3753            "read_file",
3754            &serde_json::json!({"path": "/etc/passwd"}),
3755        );
3756        assert!(result.is_ok());
3757    }
3758}