// harn_vm/orchestration.rs — execution policies, tool lifecycle hooks,
// and transcript compaction for the Harn VM.

1use std::collections::{BTreeMap, BTreeSet};
2use std::path::{Path, PathBuf};
3use std::rc::Rc;
4use std::{cell::RefCell, thread_local};
5
6use serde::{Deserialize, Serialize};
7
8use crate::llm::{extract_llm_options, vm_call_llm_full, vm_value_to_json};
9use crate::value::{VmError, VmValue};
10
/// Current time as an RFC 3339 UTC string (e.g. "2024-05-01T12:34:56Z").
///
/// A clock before the unix epoch degrades to the epoch itself
/// (`unwrap_or_default`) rather than panicking.
fn now_rfc3339() -> String {
    use std::time::{SystemTime, UNIX_EPOCH};
    let secs = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .unwrap_or_default()
        .as_secs();
    format_rfc3339_utc(secs)
}

/// Format a unix timestamp (seconds since the epoch) as RFC 3339 UTC.
///
/// Uses the civil-from-days algorithm (Howard Hinnant) so no external
/// date crate is needed; valid for all non-negative timestamps.
fn format_rfc3339_utc(unix_secs: u64) -> String {
    let days = unix_secs / 86_400;
    let rem = unix_secs % 86_400;
    let (hour, minute, second) = (rem / 3600, (rem % 3600) / 60, rem % 60);
    // Shift epoch from 1970-01-01 to 0000-03-01 so leap days fall at the
    // end of the 400-year era.
    let z = days as i64 + 719_468;
    let era = z / 146_097;
    let doe = z - era * 146_097;
    let yoe = (doe - doe / 1460 + doe / 36_524 - doe / 146_096) / 365;
    let y = yoe + era * 400;
    let doy = doe - (365 * yoe + yoe / 4 - yoe / 100);
    let mp = (5 * doy + 2) / 153;
    let d = doy - (153 * mp + 2) / 5 + 1;
    let m = if mp < 10 { mp + 3 } else { mp - 9 };
    let y = if m <= 2 { y + 1 } else { y };
    format!("{y:04}-{m:02}-{d:02}T{hour:02}:{minute:02}:{second:02}Z")
}
19
20fn new_id(prefix: &str) -> String {
21    format!("{prefix}_{}", uuid::Uuid::now_v7())
22}
23
/// Directory for run artifacts: `$HARN_RUN_DIR` if set, else `.harn-runs`.
fn default_run_dir() -> PathBuf {
    match std::env::var("HARN_RUN_DIR") {
        Ok(dir) => PathBuf::from(dir),
        Err(_) => PathBuf::from(".harn-runs"),
    }
}
29
// Thread-local orchestration state; each thread gets independent copies.
thread_local! {
    /// Stack of capability policies for nested executions.
    /// NOTE(review): the code that pushes/pops this stack is outside this chunk.
    static EXECUTION_POLICY_STACK: RefCell<Vec<CapabilityPolicy>> = const { RefCell::new(Vec::new()) };
    /// Tool lifecycle hooks; appended by `register_tool_hook`, run in order.
    static TOOL_HOOKS: RefCell<Vec<ToolHook>> = const { RefCell::new(Vec::new()) };
    /// Mutation session attributed to work on this thread, if any.
    static CURRENT_MUTATION_SESSION: RefCell<Option<MutationSessionRecord>> = const { RefCell::new(None) };
}
35
/// Identity and policy context for a mutation session.
///
/// `#[serde(default)]` lets partial records deserialize; `normalize()`
/// then fills the required string fields with defaults.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct MutationSessionRecord {
    /// Unique id; `normalize()` generates `session_<uuidv7>` if empty.
    pub session_id: String,
    /// Id of the spawning session, if any.
    pub parent_session_id: Option<String>,
    /// Run this session belongs to, if known.
    pub run_id: Option<String>,
    /// Worker executing the session, if any.
    pub worker_id: Option<String>,
    /// Kind of execution (free-form label; concrete values not visible here).
    pub execution_kind: Option<String>,
    /// Mutation scope; `normalize()` defaults empty to "read_only".
    pub mutation_scope: String,
    /// Approval mode; `normalize()` defaults empty to "host_enforced".
    pub approval_mode: String,
}
47
48impl MutationSessionRecord {
49    pub fn normalize(mut self) -> Self {
50        if self.session_id.is_empty() {
51            self.session_id = new_id("session");
52        }
53        if self.mutation_scope.is_empty() {
54            self.mutation_scope = "read_only".to_string();
55        }
56        if self.approval_mode.is_empty() {
57            self.approval_mode = "host_enforced".to_string();
58        }
59        self
60    }
61}
62
63pub fn install_current_mutation_session(session: Option<MutationSessionRecord>) {
64    CURRENT_MUTATION_SESSION.with(|slot| {
65        *slot.borrow_mut() = session.map(MutationSessionRecord::normalize);
66    });
67}
68
69pub fn current_mutation_session() -> Option<MutationSessionRecord> {
70    CURRENT_MUTATION_SESSION.with(|slot| slot.borrow().clone())
71}
72
73// ── Tool lifecycle hooks ──────────────────────────────────────────────
74
/// Action returned by a PreToolUse hook.
#[derive(Clone, Debug)]
pub enum PreToolAction {
    /// Allow the tool call to proceed unchanged.
    Allow,
    /// Deny the tool call with an explanation.
    Deny(String),
    /// Allow but replace the arguments. Later hooks see the replaced
    /// arguments (see `run_pre_tool_hooks`).
    Modify(serde_json::Value),
}
85
/// Action returned by a PostToolUse hook.
#[derive(Clone, Debug)]
pub enum PostToolAction {
    /// Pass the result through unchanged.
    Pass,
    /// Replace the result text. Later hooks see the replaced text
    /// (see `run_post_tool_hooks`).
    Modify(String),
}
94
/// Callback types for tool lifecycle hooks.
/// `Rc` (not `Arc`) because hooks live in thread-local storage and are
/// never shared across threads. Pre hook: (tool_name, args) -> action.
pub type PreToolHookFn = Rc<dyn Fn(&str, &serde_json::Value) -> PreToolAction>;
/// Post hook: (tool_name, result_text) -> action.
pub type PostToolHookFn = Rc<dyn Fn(&str, &str) -> PostToolAction>;
98
/// A registered tool hook with a name pattern and callbacks.
/// Either callback may be absent; a hook with neither is inert.
#[derive(Clone)]
pub struct ToolHook {
    /// Glob-style pattern matched against tool names (e.g. `"*"`, `"exec*"`, `"read_file"`).
    pub pattern: String,
    /// Called before tool execution. Return `Deny` to reject, `Modify` to rewrite args.
    pub pre: Option<PreToolHookFn>,
    /// Called after tool execution with the result text. Return `Modify` to rewrite.
    pub post: Option<PostToolHookFn>,
}
109
110impl std::fmt::Debug for ToolHook {
111    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
112        f.debug_struct("ToolHook")
113            .field("pattern", &self.pattern)
114            .field("has_pre", &self.pre.is_some())
115            .field("has_post", &self.post.is_some())
116            .finish()
117    }
118}
119
/// Match `name` against a simple glob `pattern`.
///
/// Supported forms: `"*"` (everything), `"prefix*"`, `"*suffix"`,
/// `"*sub*"` (containment), and exact literals. Previously a `"*sub*"`
/// pattern fell through to the prefix branch and was compared as the
/// literal prefix `"*sub"`, which essentially never matched; containment
/// is a strict superset of that behavior, so no existing match is lost.
fn glob_match(pattern: &str, name: &str) -> bool {
    if pattern == "*" {
        return true;
    }
    // "*sub*" — substring containment.
    if pattern.len() >= 2 && pattern.starts_with('*') && pattern.ends_with('*') {
        return name.contains(&pattern[1..pattern.len() - 1]);
    }
    if let Some(prefix) = pattern.strip_suffix('*') {
        return name.starts_with(prefix);
    }
    if let Some(suffix) = pattern.strip_prefix('*') {
        return name.ends_with(suffix);
    }
    pattern == name
}
132
133pub fn register_tool_hook(hook: ToolHook) {
134    TOOL_HOOKS.with(|hooks| hooks.borrow_mut().push(hook));
135}
136
137pub fn clear_tool_hooks() {
138    TOOL_HOOKS.with(|hooks| hooks.borrow_mut().clear());
139}
140
141/// Run all matching PreToolUse hooks. Returns the final action.
142pub fn run_pre_tool_hooks(tool_name: &str, args: &serde_json::Value) -> PreToolAction {
143    TOOL_HOOKS.with(|hooks| {
144        let hooks = hooks.borrow();
145        let mut current_args = args.clone();
146        for hook in hooks.iter() {
147            if !glob_match(&hook.pattern, tool_name) {
148                continue;
149            }
150            if let Some(ref pre) = hook.pre {
151                match pre(tool_name, &current_args) {
152                    PreToolAction::Allow => {}
153                    PreToolAction::Deny(reason) => return PreToolAction::Deny(reason),
154                    PreToolAction::Modify(new_args) => {
155                        current_args = new_args;
156                    }
157                }
158            }
159        }
160        if current_args != *args {
161            PreToolAction::Modify(current_args)
162        } else {
163            PreToolAction::Allow
164        }
165    })
166}
167
168/// Run all matching PostToolUse hooks. Returns the (possibly modified) result.
169pub fn run_post_tool_hooks(tool_name: &str, result: &str) -> String {
170    TOOL_HOOKS.with(|hooks| {
171        let hooks = hooks.borrow();
172        let mut current = result.to_string();
173        for hook in hooks.iter() {
174            if !glob_match(&hook.pattern, tool_name) {
175                continue;
176            }
177            if let Some(ref post) = hook.post {
178                match post(tool_name, &current) {
179                    PostToolAction::Pass => {}
180                    PostToolAction::Modify(new_result) => {
181                        current = new_result;
182                    }
183                }
184            }
185        }
186        current
187    })
188}
189
190// ── Auto-compaction ───────────────────────────────────────────────────
191
/// How archived transcript messages are summarized during auto-compaction.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum CompactStrategy {
    /// Summarize via an LLM call (`llm_compaction_summary`).
    Llm,
    /// Keep truncated snippets of the archived messages; no LLM involved.
    Truncate,
    /// Delegate to a user-supplied Harn closure (`custom_compactor`).
    Custom,
}
198
199pub fn parse_compact_strategy(value: &str) -> Result<CompactStrategy, VmError> {
200    match value {
201        "llm" => Ok(CompactStrategy::Llm),
202        "truncate" => Ok(CompactStrategy::Truncate),
203        "custom" => Ok(CompactStrategy::Custom),
204        other => Err(VmError::Runtime(format!(
205            "unknown compact_strategy '{other}' (expected 'llm', 'truncate', or 'custom')"
206        ))),
207    }
208}
209
/// Configuration for automatic transcript compaction in agent loops.
#[derive(Clone, Debug)]
pub struct AutoCompactConfig {
    /// Maximum estimated tokens before triggering auto-compaction
    /// (estimated with the chars/4 heuristic of `estimate_message_tokens`).
    pub token_threshold: usize,
    /// Maximum character length for a single tool result before microcompaction.
    pub tool_output_max_chars: usize,
    /// Number of recent messages to keep during auto-compaction.
    pub keep_last: usize,
    /// Strategy used to summarize archived messages.
    pub compact_strategy: CompactStrategy,
    /// Optional Harn callback used when `compact_strategy` is `custom`.
    /// Must hold a `VmValue::Closure`; anything else fails at compaction time.
    pub custom_compactor: Option<VmValue>,
}
224
225impl Default for AutoCompactConfig {
226    fn default() -> Self {
227        Self {
228            token_threshold: 80_000,
229            tool_output_max_chars: 20_000,
230            keep_last: 8,
231            compact_strategy: CompactStrategy::Llm,
232            custom_compactor: None,
233        }
234    }
235}
236
237/// Estimate token count from a list of JSON messages (chars / 4 heuristic).
238pub fn estimate_message_tokens(messages: &[serde_json::Value]) -> usize {
239    messages
240        .iter()
241        .map(|m| {
242            m.get("content")
243                .and_then(|c| c.as_str())
244                .map(|s| s.len())
245                .unwrap_or(0)
246        })
247        .sum::<usize>()
248        / 4
249}
250
/// Microcompact a tool result: if it exceeds `max_chars`, keep the first and
/// last portions with a snip marker in between.
///
/// Lines that look like compiler/test diagnostics (file:line prefixes plus
/// error keywords, or standalone error markers) are preserved verbatim so
/// actionable detail survives truncation. Output at or under `max_chars` —
/// or any call with `max_chars < 200` — is returned unchanged.
pub fn microcompact_tool_output(output: &str, max_chars: usize) -> String {
    if output.len() <= max_chars || max_chars < 200 {
        return output.to_string();
    }
    // Pass 1: harvest up to 32 lines that look like diagnostics.
    let diagnostic_lines = output
        .lines()
        .filter(|line| {
            let trimmed = line.trim();
            let lower = trimmed.to_lowercase();
            // file:line pattern (e.g. "src/main.rs:42:" or "foo.go:10:5:"):
            // the first ':' must be immediately followed by an ASCII digit.
            let has_file_line = {
                let bytes = trimmed.as_bytes();
                let mut i = 0;
                let mut found_colon = false;
                while i < bytes.len() {
                    if bytes[i] == b':' {
                        found_colon = true;
                        break;
                    }
                    i += 1;
                }
                found_colon && i + 1 < bytes.len() && bytes[i + 1].is_ascii_digit()
            };
            // keyword-based diagnostics; the first six checks are
            // case-sensitive on the raw line, the rest use `lower`
            let has_keyword = trimmed.contains("error")
                || trimmed.contains("FAIL")
                || trimmed.contains("panic")
                || trimmed.contains("undefined")
                || trimmed.contains("expected")
                || trimmed.contains("got")
                || lower.contains("cannot find")
                || lower.contains("not found")
                || lower.contains("no such")
                || lower.contains("unresolved")
                || lower.contains("missing")
                || lower.contains("declared but not used")
                || lower.contains("unused")
                || lower.contains("mismatch");
            // standalone error markers that need no file:line prefix
            let positional = lower.contains(" error ")
                || lower.starts_with("error:")
                || lower.starts_with("fail")
                || lower.contains("panic:");
            (has_file_line && has_keyword) || positional
        })
        .take(32)
        .collect::<Vec<_>>();
    // Pass 2: if diagnostics were found, emit head + diagnostics + tail,
    // splitting whatever budget remains after the diagnostics block.
    if !diagnostic_lines.is_empty() {
        let diagnostics = diagnostic_lines.join("\n");
        let budget = max_chars.saturating_sub(diagnostics.len() + 64);
        let keep = budget / 2;
        // Only worth it when both head and tail keep a meaningful slice.
        if keep >= 80 && output.len() > keep * 2 {
            // floor/ceil_char_boundary keep the cuts on UTF-8 boundaries.
            let head_end = output.floor_char_boundary(keep);
            let tail_start = output.ceil_char_boundary(output.len() - keep);
            let head = &output[..head_end];
            let tail = &output[tail_start..];
            return format!(
                "{head}\n\n[diagnostic lines preserved]\n{diagnostics}\n\n[... output compacted ...]\n\n{tail}"
            );
        }
    }
    // Fallback: plain head/tail snip around the middle.
    let keep = max_chars / 2;
    let head_end = output.floor_char_boundary(keep);
    let tail_start = output.ceil_char_boundary(output.len() - keep);
    let head = &output[..head_end];
    let tail = &output[tail_start..];
    // NOTE(review): `snipped` is the nominal overflow (len - max_chars),
    // not the exact count of characters removed after boundary rounding.
    let snipped = output.len() - max_chars;
    format!("{head}\n\n[... {snipped} characters snipped ...]\n\n{tail}")
}
321
322fn format_compaction_messages(messages: &[serde_json::Value]) -> String {
323    messages
324        .iter()
325        .map(|msg| {
326            let role = msg
327                .get("role")
328                .and_then(|v| v.as_str())
329                .unwrap_or("user")
330                .to_uppercase();
331            let content = msg
332                .get("content")
333                .and_then(|v| v.as_str())
334                .unwrap_or_default();
335            format!("{role}: {content}")
336        })
337        .collect::<Vec<_>>()
338        .join("\n")
339}
340
341fn truncate_compaction_summary(
342    old_messages: &[serde_json::Value],
343    archived_count: usize,
344) -> String {
345    let summary_parts: Vec<String> = old_messages
346        .iter()
347        .filter_map(|m| {
348            let role = m.get("role")?.as_str()?;
349            let content = m.get("content")?.as_str()?;
350            if content.is_empty() {
351                return None;
352            }
353            let truncated = if content.len() > 500 {
354                format!("{}...", &content[..content.floor_char_boundary(500)])
355            } else {
356                content.to_string()
357            };
358            Some(format!("[{role}] {truncated}"))
359        })
360        .take(15)
361        .collect();
362    format!(
363        "[auto-compacted {archived_count} older messages]\n{}{}",
364        summary_parts.join("\n"),
365        if archived_count > 15 {
366            format!("\n... and {} more", archived_count - 15)
367        } else {
368            String::new()
369        }
370    )
371}
372
373fn compact_summary_text_from_value(value: &VmValue) -> Result<String, VmError> {
374    if let Some(map) = value.as_dict() {
375        if let Some(summary) = map.get("summary").or_else(|| map.get("text")) {
376            return Ok(summary.display());
377        }
378    }
379    match value {
380        VmValue::String(text) => Ok(text.to_string()),
381        VmValue::Nil => Ok(String::new()),
382        _ => serde_json::to_string_pretty(&vm_value_to_json(value))
383            .map_err(|e| VmError::Runtime(format!("custom compactor encode error: {e}"))),
384    }
385}
386
/// Summarize archived messages with a one-shot LLM call.
///
/// Clones the active call options but strips everything that would tie the
/// summarization call to the outer conversation (system prompt, transcript
/// state, tools, response formatting), then sends a single user message.
/// Falls back to `truncate_compaction_summary` if the model returns an
/// empty summary.
async fn llm_compaction_summary(
    old_messages: &[serde_json::Value],
    archived_count: usize,
    llm_opts: &crate::llm::api::LlmCallOptions,
) -> Result<String, VmError> {
    let mut compact_opts = llm_opts.clone();
    let formatted = format_compaction_messages(old_messages);
    // Detach from the outer call: no system prompt, no transcript linkage.
    compact_opts.system = None;
    compact_opts.transcript_id = None;
    compact_opts.transcript_summary = None;
    compact_opts.transcript_metadata = None;
    // No tools or structured-output constraints for a plain-text summary.
    compact_opts.native_tools = None;
    compact_opts.tool_choice = None;
    compact_opts.response_format = None;
    compact_opts.json_schema = None;
    compact_opts.messages = vec![serde_json::json!({
        "role": "user",
        "content": format!(
            "Summarize these archived conversation messages for a follow-on coding agent. Preserve goals, constraints, decisions, completed tool work, unresolved issues, and next actions. Output only the summary text.\n\nArchived message count: {archived_count}\n\nConversation:\n{formatted}"
        ),
    })];
    let result = vm_call_llm_full(&compact_opts).await?;
    let summary = result.text.trim();
    if summary.is_empty() {
        // Model produced nothing usable — degrade to truncation.
        Ok(truncate_compaction_summary(old_messages, archived_count))
    } else {
        Ok(format!(
            "[auto-compacted {archived_count} older messages]\n{summary}"
        ))
    }
}
418
419async fn custom_compaction_summary(
420    old_messages: &[serde_json::Value],
421    archived_count: usize,
422    callback: &VmValue,
423) -> Result<String, VmError> {
424    let Some(VmValue::Closure(closure)) = Some(callback.clone()) else {
425        return Err(VmError::Runtime(
426            "compact_callback must be a closure when compact_strategy is 'custom'".to_string(),
427        ));
428    };
429    let mut vm = crate::vm::take_async_builtin_child_vm().ok_or_else(|| {
430        VmError::Runtime(
431            "custom transcript compaction requires an async builtin VM context".to_string(),
432        )
433    })?;
434    let messages_vm = VmValue::List(Rc::new(
435        old_messages
436            .iter()
437            .map(crate::stdlib::json_to_vm_value)
438            .collect(),
439    ));
440    let result = vm.call_closure_pub(&closure, &[messages_vm], &[]).await;
441    crate::vm::restore_async_builtin_child_vm(vm);
442    let summary = compact_summary_text_from_value(&result?)?;
443    if summary.trim().is_empty() {
444        Ok(truncate_compaction_summary(old_messages, archived_count))
445    } else {
446        Ok(format!(
447            "[auto-compacted {archived_count} older messages]\n{summary}"
448        ))
449    }
450}
451
/// Auto-compact a message list in place: summarize older messages into a
/// note and keep the most recent `keep_last` messages.
///
/// Returns `Ok(None)` when the list already fits; otherwise `Ok(Some(summary))`
/// with the summary note that was inserted at the front of the list.
///
/// NOTE(review): the old messages are drained before the summary is
/// produced, so if summarization errors the drained messages are lost from
/// `messages` — confirm callers treat a compaction error as fatal.
pub(crate) async fn auto_compact_messages(
    messages: &mut Vec<serde_json::Value>,
    config: &AutoCompactConfig,
    llm_opts: Option<&crate::llm::api::LlmCallOptions>,
) -> Result<Option<String>, VmError> {
    if messages.len() <= config.keep_last {
        return Ok(None);
    }
    // Everything before `split_at` is archived; the tail stays live.
    let split_at = messages.len().saturating_sub(config.keep_last);
    let old_messages: Vec<_> = messages.drain(..split_at).collect();
    let archived_count = old_messages.len();
    let summary = match config.compact_strategy {
        CompactStrategy::Truncate => truncate_compaction_summary(&old_messages, archived_count),
        CompactStrategy::Llm => {
            llm_compaction_summary(
                &old_messages,
                archived_count,
                llm_opts.ok_or_else(|| {
                    VmError::Runtime(
                        "LLM transcript compaction requires active LLM call options".to_string(),
                    )
                })?,
            )
            .await?
        }
        CompactStrategy::Custom => {
            custom_compaction_summary(
                &old_messages,
                archived_count,
                config.custom_compactor.as_ref().ok_or_else(|| {
                    VmError::Runtime(
                        "compact_callback is required when compact_strategy is 'custom'"
                            .to_string(),
                    )
                })?,
            )
            .await?
        }
    };
    // The summary becomes the new first message so the retained tail keeps
    // its relative order after it.
    messages.insert(
        0,
        serde_json::json!({
            "role": "user",
            "content": summary,
        }),
    );
    Ok(Some(summary))
}
502
503// ── Adaptive context assembly ─────────────────────────────────────────
504
505/// Snip an artifact's text to fit within a token budget.
506pub fn microcompact_artifact(artifact: &mut ArtifactRecord, max_tokens: usize) {
507    let max_chars = max_tokens * 4;
508    if let Some(ref text) = artifact.text {
509        if text.len() > max_chars && max_chars >= 200 {
510            artifact.text = Some(microcompact_tool_output(text, max_chars));
511            artifact.estimated_tokens = Some(max_tokens);
512        }
513    }
514}
515
/// Deduplicate artifacts by removing those with identical text content.
/// The first occurrence (in input order) is kept; later duplicates are
/// dropped. Artifacts with empty or absent text are never removed.
// NOTE(review): an earlier doc claimed the higher-priority duplicate is
// kept; the implementation actually keeps whichever appears first.
pub fn dedup_artifacts(artifacts: &mut Vec<ArtifactRecord>) {
    let mut seen_hashes: BTreeSet<u64> = BTreeSet::new();
    artifacts.retain(|artifact| {
        let text = artifact.text.as_deref().unwrap_or("");
        if text.is_empty() {
            return true;
        }
        // Hash the text instead of storing it — cheap set membership.
        // (DefaultHasher is not collision-proof; a collision would drop a
        // non-duplicate, which is acceptable for context assembly.)
        let hash = {
            use std::hash::{Hash, Hasher};
            let mut hasher = std::collections::hash_map::DefaultHasher::new();
            text.hash(&mut hasher);
            hasher.finish()
        };
        // `insert` returns false when the hash was already seen → drop.
        seen_hashes.insert(hash)
    });
}
535
536/// Enhanced artifact selection: dedup, microcompact oversized artifacts,
537/// then delegate to the standard `select_artifacts`.
538pub fn select_artifacts_adaptive(
539    mut artifacts: Vec<ArtifactRecord>,
540    policy: &ContextPolicy,
541) -> Vec<ArtifactRecord> {
542    // Phase 1: deduplicate
543    dedup_artifacts(&mut artifacts);
544
545    // Phase 2: microcompact oversized artifacts relative to budget.
546    // Cap individual artifacts to a fraction of the total budget, but don't
547    // let the per-artifact cap exceed the total budget (avoid overrun).
548    if let Some(max_tokens) = policy.max_tokens {
549        let count = artifacts.len().max(1);
550        let per_artifact_budget = max_tokens / count;
551        // Floor of 500 tokens, but never more than total budget
552        let cap = per_artifact_budget.max(500).min(max_tokens);
553        for artifact in &mut artifacts {
554            let est = artifact.estimated_tokens.unwrap_or(0);
555            if est > cap * 2 {
556                microcompact_artifact(artifact, cap);
557            }
558        }
559    }
560
561    // Phase 3: standard selection with budget
562    select_artifacts(artifacts, policy)
563}
564
565// ── Per-agent policy with argument patterns ───────────────────────────
566
/// Extended policy that supports argument-level constraints.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ToolArgConstraint {
    /// Tool name to constrain. Matched with `glob_match`, so `"*"`,
    /// prefix/suffix globs, and exact names all work.
    pub tool: String,
    /// Glob patterns that the first string argument must match.
    /// If empty, no argument constraint is applied.
    pub arg_patterns: Vec<String>,
}
577
578/// Check if a tool call satisfies argument constraints in the policy.
579pub fn enforce_tool_arg_constraints(
580    policy: &CapabilityPolicy,
581    tool_name: &str,
582    args: &serde_json::Value,
583) -> Result<(), VmError> {
584    for constraint in &policy.tool_arg_constraints {
585        if !glob_match(&constraint.tool, tool_name) {
586            continue;
587        }
588        if constraint.arg_patterns.is_empty() {
589            continue;
590        }
591        // Extract the first string-like argument for pattern matching
592        let first_arg = args
593            .as_object()
594            .and_then(|o| o.values().next())
595            .and_then(|v| v.as_str())
596            .or_else(|| args.as_str())
597            .unwrap_or("");
598        let matches = constraint
599            .arg_patterns
600            .iter()
601            .any(|pattern| glob_match(pattern, first_arg));
602        if !matches {
603            return reject_policy(format!(
604                "tool '{tool_name}' argument '{first_arg}' does not match allowed patterns: {:?}",
605                constraint.arg_patterns
606            ));
607        }
608    }
609    Ok(())
610}
611
/// Canonicalize an artifact kind label: known kinds pass through, legacy
/// aliases are mapped, blank input becomes "artifact", and anything else
/// is returned verbatim.
fn normalize_artifact_kind(kind: &str) -> String {
    const CANONICAL: [&str; 23] = [
        "resource",
        "workspace_file",
        "editor_selection",
        "workspace_snapshot",
        "transcript_summary",
        "summary",
        "plan",
        "diff",
        "git_diff",
        "patch",
        "patch_set",
        "patch_proposal",
        "diff_review",
        "review_decision",
        "verification_bundle",
        "apply_intent",
        "verification_result",
        "test_result",
        "command_result",
        "provider_payload",
        "worker_result",
        "worker_notification",
        "artifact",
    ];
    if CANONICAL.contains(&kind) {
        return kind.to_string();
    }
    match kind {
        // Legacy short aliases.
        "file" => "workspace_file".to_string(),
        "transcript" => "transcript_summary".to_string(),
        "verification" => "verification_result".to_string(),
        "test" => "test_result".to_string(),
        blank if blank.trim().is_empty() => "artifact".to_string(),
        other => other.to_string(),
    }
}
645
/// Default selection priority by artifact kind — verification results rank
/// highest, unknown kinds lowest (40).
fn default_artifact_priority(kind: &str) -> i64 {
    if matches!(kind, "verification_result" | "test_result") {
        100
    } else if kind == "verification_bundle" {
        95
    } else if matches!(
        kind,
        "diff" | "git_diff" | "patch" | "patch_set" | "patch_proposal" | "diff_review"
            | "review_decision" | "apply_intent"
    ) {
        90
    } else if kind == "plan" {
        80
    } else if matches!(
        kind,
        "workspace_file" | "workspace_snapshot" | "editor_selection" | "resource"
    ) {
        70
    } else if matches!(kind, "summary" | "transcript_summary") {
        60
    } else if kind == "command_result" {
        50
    } else {
        40
    }
}
659
/// Rank a freshness label: "fresh"/"live" = 3, "recent" = 2, "stale" = 0,
/// and anything else (including `None`) = 1.
fn freshness_rank(value: Option<&str>) -> i64 {
    let label = value.unwrap_or("");
    if label == "fresh" || label == "live" {
        3
    } else if label == "recent" {
        2
    } else if label == "stale" {
        0
    } else {
        1
    }
}
668
/// Runtime policy metadata attached to a single tool.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ToolRuntimePolicyMetadata {
    /// Capability → allowed operations for this tool.
    pub capabilities: BTreeMap<String, Vec<String>>,
    /// Side-effect label (same vocabulary as `min_side_effect`: "none",
    /// "read_only", "workspace_write", "process_exec", "network").
    pub side_effect_level: Option<String>,
    /// Names of tool parameters that hold filesystem paths.
    pub path_params: Vec<String>,
    /// Mutation classification label (values not visible in this chunk).
    pub mutation_classification: Option<String>,
}
677
/// Capability ceiling/policy for an execution. Empty collections mean
/// "unrestricted" for that dimension (see `intersect`).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct CapabilityPolicy {
    /// Allowed tool names (empty = all tools).
    pub tools: Vec<String>,
    /// Capability → allowed operations (empty = no capability ceiling).
    pub capabilities: BTreeMap<String, Vec<String>>,
    /// Allowed workspace roots (compared as literal strings in `intersect`).
    pub workspace_roots: Vec<String>,
    /// Side-effect ceiling; `min_side_effect` defines the ordering.
    pub side_effect_level: Option<String>,
    /// Maximum nested-execution depth.
    pub recursion_limit: Option<usize>,
    /// Argument-level constraints for specific tools.
    #[serde(default)]
    pub tool_arg_constraints: Vec<ToolArgConstraint>,
    /// Per-tool runtime metadata, keyed by tool name.
    #[serde(default)]
    pub tool_metadata: BTreeMap<String, ToolRuntimePolicyMetadata>,
}
692
impl CapabilityPolicy {
    /// Intersect this (host/ceiling) policy with a `requested` policy.
    ///
    /// Errors when the request exceeds a non-empty ceiling (tools or
    /// capability operations not allowed here); otherwise returns the
    /// narrowed policy. For each dimension, an empty collection on either
    /// side means "unrestricted" and the other side wins.
    pub fn intersect(&self, requested: &CapabilityPolicy) -> Result<CapabilityPolicy, String> {
        // Side-effect ceiling: take the less-privileged of the two labels.
        let side_effect_level = match (&self.side_effect_level, &requested.side_effect_level) {
            (Some(a), Some(b)) => Some(min_side_effect(a, b).to_string()),
            (Some(a), None) => Some(a.clone()),
            (None, Some(b)) => Some(b.clone()),
            (None, None) => None,
        };

        // Reject outright any requested tool outside a non-empty ceiling.
        if !self.tools.is_empty() {
            let denied: Vec<String> = requested
                .tools
                .iter()
                .filter(|tool| !self.tools.contains(*tool))
                .cloned()
                .collect();
            if !denied.is_empty() {
                return Err(format!(
                    "requested tools exceed host ceiling: {}",
                    denied.join(", ")
                ));
            }
        }

        // Same for capability operations; an empty host capability map
        // means "no capability ceiling at all".
        for (capability, requested_ops) in &requested.capabilities {
            if let Some(allowed_ops) = self.capabilities.get(capability) {
                let denied: Vec<String> = requested_ops
                    .iter()
                    .filter(|op| !allowed_ops.contains(*op))
                    .cloned()
                    .collect();
                if !denied.is_empty() {
                    return Err(format!(
                        "requested capability operations exceed host ceiling: {}.{}",
                        capability,
                        denied.join(",")
                    ));
                }
            } else if !self.capabilities.is_empty() {
                return Err(format!(
                    "requested capability exceeds host ceiling: {capability}"
                ));
            }
        }

        // Tools: empty side = unrestricted; otherwise set intersection.
        let tools = if self.tools.is_empty() {
            requested.tools.clone()
        } else if requested.tools.is_empty() {
            self.tools.clone()
        } else {
            requested
                .tools
                .iter()
                .filter(|tool| self.tools.contains(*tool))
                .cloned()
                .collect()
        };

        // Capabilities: per-key intersection of the operation lists.
        let capabilities = if self.capabilities.is_empty() {
            requested.capabilities.clone()
        } else if requested.capabilities.is_empty() {
            self.capabilities.clone()
        } else {
            requested
                .capabilities
                .iter()
                .filter_map(|(capability, requested_ops)| {
                    self.capabilities.get(capability).map(|allowed_ops| {
                        (
                            capability.clone(),
                            requested_ops
                                .iter()
                                .filter(|op| allowed_ops.contains(*op))
                                .cloned()
                                .collect::<Vec<_>>(),
                        )
                    })
                })
                .collect()
        };

        // Workspace roots: literal string intersection (no path semantics
        // such as prefix containment — roots must match exactly).
        let workspace_roots = if self.workspace_roots.is_empty() {
            requested.workspace_roots.clone()
        } else if requested.workspace_roots.is_empty() {
            self.workspace_roots.clone()
        } else {
            requested
                .workspace_roots
                .iter()
                .filter(|root| self.workspace_roots.contains(*root))
                .cloned()
                .collect()
        };

        // Recursion limit: the stricter (smaller) bound wins.
        let recursion_limit = match (self.recursion_limit, requested.recursion_limit) {
            (Some(a), Some(b)) => Some(a.min(b)),
            (Some(a), None) => Some(a),
            (None, Some(b)) => Some(b),
            (None, None) => None,
        };

        // Merge arg constraints from both sides (union: each constraint
        // only ever narrows what a tool may be called with).
        let mut tool_arg_constraints = self.tool_arg_constraints.clone();
        tool_arg_constraints.extend(requested.tool_arg_constraints.clone());

        // Per-tool metadata for the surviving tools; requested metadata
        // takes precedence over the host's when both exist.
        let tool_metadata = tools
            .iter()
            .filter_map(|tool| {
                requested
                    .tool_metadata
                    .get(tool)
                    .or_else(|| self.tool_metadata.get(tool))
                    .cloned()
                    .map(|metadata| (tool.clone(), metadata))
            })
            .collect();

        Ok(CapabilityPolicy {
            tools,
            capabilities,
            workspace_roots,
            side_effect_level,
            recursion_limit,
            tool_arg_constraints,
            tool_metadata,
        })
    }
}
821
/// Return the less-privileged of two side-effect labels.
/// Ordering: none < read_only < workspace_write < process_exec < network,
/// with unrecognized labels ranked most privileged. Ties return `a`.
fn min_side_effect<'a>(a: &'a str, b: &'a str) -> &'a str {
    let rank = |label: &str| -> usize {
        match label {
            "none" => 0,
            "read_only" => 1,
            "workspace_write" => 2,
            "process_exec" => 3,
            "network" => 4,
            _ => 5,
        }
    };
    if rank(b) < rank(a) { b } else { a }
}
839
/// Model-selection policy; all fields optional so partial overrides merge.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct ModelPolicy {
    /// Provider identifier.
    pub provider: Option<String>,
    /// Concrete model name.
    pub model: Option<String>,
    /// Abstract tier label (resolved to a model elsewhere — not visible here).
    pub model_tier: Option<String>,
    /// Sampling temperature (the `f64` is why this struct isn't `Eq`).
    pub temperature: Option<f64>,
    /// Completion token limit.
    pub max_tokens: Option<i64>,
}
849
/// Transcript handling policy for a stage or agent.
// NOTE(review): field semantics are enforced by consumers outside this chunk.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct TranscriptPolicy {
    /// Transcript mode label (concrete values not visible here).
    pub mode: Option<String>,
    /// Transcript visibility label.
    pub visibility: Option<String>,
    /// Whether to summarize the transcript.
    pub summarize: bool,
    /// Whether to compact the transcript.
    pub compact: bool,
    /// How many trailing messages to retain when compacting.
    pub keep_last: Option<usize>,
}
859
/// Selection policy for assembling artifact context.
// NOTE(review): most field semantics live in `select_artifacts`, which is
// outside this chunk; only `max_tokens` is consumed here
// (`select_artifacts_adaptive`).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ContextPolicy {
    /// Cap on the number of artifacts selected.
    pub max_artifacts: Option<usize>,
    /// Token budget; used by `select_artifacts_adaptive` to size
    /// per-artifact compaction caps.
    pub max_tokens: Option<usize>,
    /// Tokens to hold back from the budget for other content.
    pub reserve_tokens: Option<usize>,
    /// Only include artifacts of these kinds (empty = all).
    pub include_kinds: Vec<String>,
    /// Exclude artifacts of these kinds.
    pub exclude_kinds: Vec<String>,
    /// Kinds to rank ahead of the default priority order.
    pub prioritize_kinds: Vec<String>,
    /// Artifact ids that must always be included.
    pub pinned_ids: Vec<String>,
    /// Restrict to artifacts produced by these stages.
    pub include_stages: Vec<String>,
    /// Prefer more recently produced artifacts.
    pub prefer_recent: bool,
    /// Prefer fresher artifacts (see `freshness_rank` for the labels).
    pub prefer_fresh: bool,
    /// Rendering mode label for the assembled context.
    pub render: Option<String>,
}
875
/// Stage retry settings. A `max_attempts` of 0 is normalized to 1 by
/// `normalize_workflow_value`, so consumers can assume at least one attempt.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RetryPolicy {
    pub max_attempts: usize,
    pub verify: bool,
    pub repair: bool,
}
883
/// Declares what a stage consumes and produces: artifact kinds, optional
/// input-count bounds, whether a transcript is required, and an optional
/// JSON schema. `validate_workflow` rejects contracts with
/// `min_inputs > max_inputs`.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct StageContract {
    pub input_kinds: Vec<String>,
    pub output_kinds: Vec<String>,
    pub min_inputs: Option<usize>,
    pub max_inputs: Option<usize>,
    pub require_transcript: bool,
    pub schema: Option<serde_json::Value>,
}
894
/// Optional mapping from stage outcomes to branch labels used on outgoing
/// edges (success/failure, verify pass/fail, condition true/false, loop
/// continue/exit, escalation). How these labels are applied is decided by
/// the run engine, outside this chunk.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct BranchSemantics {
    pub success: Option<String>,
    pub failure: Option<String>,
    pub verify_pass: Option<String>,
    pub verify_fail: Option<String>,
    pub condition_true: Option<String>,
    pub condition_false: Option<String>,
    pub loop_continue: Option<String>,
    pub loop_exit: Option<String>,
    pub escalation: Option<String>,
}
908
/// Fan-out settings for `map` nodes. `validate_workflow` requires at least
/// one of `items`, `item_artifact_kind`, or `input_contract.input_kinds`.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct MapPolicy {
    pub items: Vec<serde_json::Value>,
    pub item_artifact_kind: Option<String>,
    pub output_kind: Option<String>,
    pub max_items: Option<usize>,
}
917
/// Settings for `join` nodes. An empty `strategy` is normalized to "all"
/// by `normalize_workflow_value`.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct JoinPolicy {
    pub strategy: String,
    pub require_all_inputs: bool,
    pub min_completed: Option<usize>,
}
925
/// Settings for `reduce` nodes. An empty `strategy` is normalized to
/// "concat"; `output_kind` becomes the node's default output contract kind.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReducePolicy {
    pub strategy: String,
    pub separator: Option<String>,
    pub output_kind: Option<String>,
}
933
/// Settings attached to escalation nodes/outcomes: severity level, target
/// queue, and a human-readable reason. All optional.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct EscalationPolicy {
    pub level: Option<String>,
    pub queue: Option<String>,
    pub reason: Option<String>,
}
941
/// A unit of data flowing between workflow stages. Serializes `type_name`
/// under the `_type` key. `ArtifactRecord::normalize` backfills the
/// identity fields (`id`, `created_at`, `kind`), estimates tokens from
/// `text`, and assigns a kind-based default `priority`.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct ArtifactRecord {
    #[serde(rename = "_type")]
    pub type_name: String,
    pub id: String,
    pub kind: String,
    pub title: Option<String>,
    pub text: Option<String>,
    pub data: Option<serde_json::Value>,
    pub source: Option<String>,
    pub created_at: String,
    pub freshness: Option<String>,
    pub priority: Option<i64>,
    /// Ids of artifacts this one was derived from.
    pub lineage: Vec<String>,
    pub relevance: Option<f64>,
    pub estimated_tokens: Option<usize>,
    pub stage: Option<String>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
962
963impl ArtifactRecord {
964    pub fn normalize(mut self) -> Self {
965        if self.type_name.is_empty() {
966            self.type_name = "artifact".to_string();
967        }
968        if self.id.is_empty() {
969            self.id = new_id("artifact");
970        }
971        if self.created_at.is_empty() {
972            self.created_at = now_rfc3339();
973        }
974        if self.kind.is_empty() {
975            self.kind = "artifact".to_string();
976        }
977        self.kind = normalize_artifact_kind(&self.kind);
978        if self.estimated_tokens.is_none() {
979            self.estimated_tokens = self
980                .text
981                .as_ref()
982                .map(|text| ((text.len() as f64) / 4.0).ceil() as usize);
983        }
984        if self.priority.is_none() {
985            self.priority = Some(default_artifact_priority(&self.kind));
986        }
987        self
988    }
989}
990
/// One node of a workflow graph. `kind` selects the node's behavior;
/// `validate_workflow` and `normalize_workflow_value` in this file
/// recognize at least "stage", "verify", "condition", "fork", "join",
/// "map", "reduce", and "escalation". The `tools` value is a free-form
/// JSON shape decoded by `workflow_tool_names`/`workflow_tool_metadata`.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct WorkflowNode {
    pub id: Option<String>,
    pub kind: String,
    pub mode: Option<String>,
    pub prompt: Option<String>,
    pub system: Option<String>,
    pub task_label: Option<String>,
    pub done_sentinel: Option<String>,
    pub tools: serde_json::Value,
    pub model_policy: ModelPolicy,
    pub transcript_policy: TranscriptPolicy,
    pub context_policy: ContextPolicy,
    pub retry_policy: RetryPolicy,
    pub capability_policy: CapabilityPolicy,
    pub input_contract: StageContract,
    pub output_contract: StageContract,
    pub branch_semantics: BranchSemantics,
    pub map_policy: MapPolicy,
    pub join_policy: JoinPolicy,
    pub reduce_policy: ReducePolicy,
    pub escalation_policy: EscalationPolicy,
    pub verify: Option<serde_json::Value>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
1017
1018pub fn workflow_tool_names(value: &serde_json::Value) -> Vec<String> {
1019    match value {
1020        serde_json::Value::Null => Vec::new(),
1021        serde_json::Value::Array(items) => items
1022            .iter()
1023            .filter_map(|item| match item {
1024                serde_json::Value::Object(map) => map
1025                    .get("name")
1026                    .and_then(|value| value.as_str())
1027                    .filter(|name| !name.is_empty())
1028                    .map(|name| name.to_string()),
1029                _ => None,
1030            })
1031            .collect(),
1032        serde_json::Value::Object(map) => {
1033            if map.get("_type").and_then(|value| value.as_str()) == Some("tool_registry") {
1034                return map
1035                    .get("tools")
1036                    .map(workflow_tool_names)
1037                    .unwrap_or_default();
1038            }
1039            map.get("name")
1040                .and_then(|value| value.as_str())
1041                .filter(|name| !name.is_empty())
1042                .map(|name| vec![name.to_string()])
1043                .unwrap_or_default()
1044        }
1045        _ => Vec::new(),
1046    }
1047}
1048
/// Returns the most severe side-effect level yielded by `levels`, or `None`
/// when the iterator is empty. Severity order: "none" < "read_only" <
/// "workspace_write" < "process_exec" < "network" < any unknown string.
/// On equal severity the later item wins, matching `Iterator::max_by_key`.
fn max_side_effect_level(levels: impl Iterator<Item = String>) -> Option<String> {
    let severity = |level: &str| -> usize {
        match level {
            "none" => 0,
            "read_only" => 1,
            "workspace_write" => 2,
            "process_exec" => 3,
            "network" => 4,
            _ => 5,
        }
    };
    levels.fold(None, |best: Option<String>, candidate| match best {
        // Keep the current best only when strictly more severe, so ties
        // resolve to the later element.
        Some(current) if severity(&current) > severity(&candidate) => Some(current),
        _ => Some(candidate),
    })
}
1062
1063fn parse_tool_runtime_policy(
1064    map: &serde_json::Map<String, serde_json::Value>,
1065) -> ToolRuntimePolicyMetadata {
1066    let Some(policy) = map.get("policy").and_then(|value| value.as_object()) else {
1067        return ToolRuntimePolicyMetadata::default();
1068    };
1069
1070    let capabilities = policy
1071        .get("capabilities")
1072        .and_then(|value| value.as_object())
1073        .map(|caps| {
1074            caps.iter()
1075                .map(|(capability, ops)| {
1076                    let values = ops
1077                        .as_array()
1078                        .map(|items| {
1079                            items
1080                                .iter()
1081                                .filter_map(|item| item.as_str().map(|s| s.to_string()))
1082                                .collect::<Vec<_>>()
1083                        })
1084                        .unwrap_or_default();
1085                    (capability.clone(), values)
1086                })
1087                .collect::<BTreeMap<_, _>>()
1088        })
1089        .unwrap_or_default();
1090
1091    let path_params = policy
1092        .get("path_params")
1093        .and_then(|value| value.as_array())
1094        .map(|items| {
1095            items
1096                .iter()
1097                .filter_map(|item| item.as_str().map(|s| s.to_string()))
1098                .collect::<Vec<_>>()
1099        })
1100        .unwrap_or_default();
1101
1102    ToolRuntimePolicyMetadata {
1103        capabilities,
1104        side_effect_level: policy
1105            .get("side_effect_level")
1106            .and_then(|value| value.as_str())
1107            .map(|s| s.to_string()),
1108        path_params,
1109        mutation_classification: policy
1110            .get("mutation_classification")
1111            .and_then(|value| value.as_str())
1112            .map(|s| s.to_string()),
1113    }
1114}
1115
1116pub fn workflow_tool_metadata(
1117    value: &serde_json::Value,
1118) -> BTreeMap<String, ToolRuntimePolicyMetadata> {
1119    match value {
1120        serde_json::Value::Null => BTreeMap::new(),
1121        serde_json::Value::Array(items) => items
1122            .iter()
1123            .filter_map(|item| match item {
1124                serde_json::Value::Object(map) => map
1125                    .get("name")
1126                    .and_then(|value| value.as_str())
1127                    .filter(|name| !name.is_empty())
1128                    .map(|name| (name.to_string(), parse_tool_runtime_policy(map))),
1129                _ => None,
1130            })
1131            .collect(),
1132        serde_json::Value::Object(map) => {
1133            if map.get("_type").and_then(|value| value.as_str()) == Some("tool_registry") {
1134                return map
1135                    .get("tools")
1136                    .map(workflow_tool_metadata)
1137                    .unwrap_or_default();
1138            }
1139            map.get("name")
1140                .and_then(|value| value.as_str())
1141                .filter(|name| !name.is_empty())
1142                .map(|name| {
1143                    let mut metadata = BTreeMap::new();
1144                    metadata.insert(name.to_string(), parse_tool_runtime_policy(map));
1145                    metadata
1146                })
1147                .unwrap_or_default()
1148        }
1149        _ => BTreeMap::new(),
1150    }
1151}
1152
1153pub fn workflow_tool_policy_from_tools(value: &serde_json::Value) -> CapabilityPolicy {
1154    let tools = workflow_tool_names(value);
1155    let tool_metadata = workflow_tool_metadata(value);
1156    let mut capabilities: BTreeMap<String, Vec<String>> = BTreeMap::new();
1157    for metadata in tool_metadata.values() {
1158        for (capability, ops) in &metadata.capabilities {
1159            let entry = capabilities.entry(capability.clone()).or_default();
1160            for op in ops {
1161                if !entry.contains(op) {
1162                    entry.push(op.clone());
1163                }
1164            }
1165            entry.sort();
1166        }
1167    }
1168    let side_effect_level = max_side_effect_level(
1169        tool_metadata
1170            .values()
1171            .filter_map(|metadata| metadata.side_effect_level.clone()),
1172    );
1173    CapabilityPolicy {
1174        tools,
1175        capabilities,
1176        workspace_roots: Vec::new(),
1177        side_effect_level,
1178        recursion_limit: None,
1179        tool_arg_constraints: Vec::new(),
1180        tool_metadata,
1181    }
1182}
1183
/// Directed edge between workflow nodes. When `branch` is set, the edge is
/// taken only for that named outcome branch (e.g. condition nodes require
/// "true"/"false" branch edges per `validate_workflow`).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct WorkflowEdge {
    pub from: String,
    pub to: String,
    pub branch: Option<String>,
    pub label: Option<String>,
}
1192
/// Complete workflow definition: entry node, node map, edges, an overall
/// capability ceiling, and an audit log. `normalize_workflow_value`
/// guarantees a non-empty `type_name` ("workflow_graph"), `id`, `entry`,
/// and a `version` of at least 1. Serializes `type_name` under `_type`.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct WorkflowGraph {
    #[serde(rename = "_type")]
    pub type_name: String,
    pub id: String,
    pub name: Option<String>,
    pub version: usize,
    pub entry: String,
    pub nodes: BTreeMap<String, WorkflowNode>,
    pub edges: Vec<WorkflowEdge>,
    pub capability_policy: CapabilityPolicy,
    pub metadata: BTreeMap<String, serde_json::Value>,
    pub audit_log: Vec<WorkflowAuditEntry>,
}
1208
/// One entry of a workflow's audit log: the operation performed, the node
/// it affected (if any), a timestamp, and free-form reason/metadata.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct WorkflowAuditEntry {
    pub id: String,
    pub op: String,
    pub node_id: Option<String>,
    pub timestamp: String,
    pub reason: Option<String>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
1219
/// Aggregated LLM usage counters (tokens, wall time, call count, cost) and
/// the list of models involved, attached to runs and stages.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct LlmUsageRecord {
    pub input_tokens: i64,
    pub output_tokens: i64,
    pub total_duration_ms: i64,
    pub call_count: i64,
    pub total_cost: f64,
    pub models: Vec<String>,
}
1230
/// Execution record for one visit to a workflow node within a run:
/// status/outcome/branch, timing, visible output vs. private reasoning,
/// optional transcript/verification payloads, usage, the artifacts it
/// produced, and the ids it consumed/produced, plus per-attempt history.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct RunStageRecord {
    pub id: String,
    pub node_id: String,
    pub kind: String,
    pub status: String,
    pub outcome: String,
    pub branch: Option<String>,
    pub started_at: String,
    pub finished_at: Option<String>,
    pub visible_text: Option<String>,
    pub private_reasoning: Option<String>,
    pub transcript: Option<serde_json::Value>,
    pub verification: Option<serde_json::Value>,
    pub usage: Option<LlmUsageRecord>,
    pub artifacts: Vec<ArtifactRecord>,
    pub consumed_artifact_ids: Vec<String>,
    pub produced_artifact_ids: Vec<String>,
    pub attempts: Vec<RunStageAttemptRecord>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
1253
/// One attempt within a stage's retry loop: attempt number, result,
/// optional error and verification payload, and timing.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct RunStageAttemptRecord {
    pub attempt: usize,
    pub status: String,
    pub outcome: String,
    pub branch: Option<String>,
    pub error: Option<String>,
    pub verification: Option<serde_json::Value>,
    pub started_at: String,
    pub finished_at: Option<String>,
}
1266
/// Record of one edge traversal during a run: the source stage/node (absent
/// for the entry transition), the destination node, the branch taken, and
/// the artifact ids that moved across the transition.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunTransitionRecord {
    pub id: String,
    pub from_stage_id: Option<String>,
    pub from_node_id: Option<String>,
    pub to_node_id: String,
    pub branch: Option<String>,
    pub timestamp: String,
    pub consumed_artifact_ids: Vec<String>,
    pub produced_artifact_ids: Vec<String>,
}
1279
/// Persisted resume point for a run: which nodes are ready vs. completed,
/// the last stage executed, when it was written, and why.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunCheckpointRecord {
    pub id: String,
    pub ready_nodes: Vec<String>,
    pub completed_nodes: Vec<String>,
    pub last_stage_id: Option<String>,
    pub persisted_at: String,
    pub reason: String,
}
1290
/// Expected-results fixture captured from a source run, used to assert that
/// a replay reproduces the run's status and per-stage outcomes.
/// Serializes `type_name` under `_type`.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayFixture {
    #[serde(rename = "_type")]
    pub type_name: String,
    pub id: String,
    pub source_run_id: String,
    pub workflow_id: String,
    pub workflow_name: Option<String>,
    pub created_at: String,
    pub expected_status: String,
    pub stage_assertions: Vec<ReplayStageAssertion>,
}
1304
/// Per-node expectations inside a `ReplayFixture`: expected status/outcome/
/// branch, artifact kinds that must be produced, and an optional substring
/// the visible text must contain.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayStageAssertion {
    pub node_id: String,
    pub expected_status: String,
    pub expected_outcome: String,
    pub expected_branch: Option<String>,
    pub required_artifact_kinds: Vec<String>,
    pub visible_text_contains: Option<String>,
}
1315
/// Outcome of evaluating one replay against its fixture: overall pass flag,
/// the failure messages, and how many stages were checked.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayEvalReport {
    pub pass: bool,
    pub failures: Vec<String>,
    pub stage_count: usize,
}
1323
/// Per-case result within a replay evaluation suite, including the source
/// run/workflow ids, failures, and an optional run-to-run diff.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayEvalCaseReport {
    pub run_id: String,
    pub workflow_id: String,
    pub label: Option<String>,
    pub pass: bool,
    pub failures: Vec<String>,
    pub stage_count: usize,
    pub source_path: Option<String>,
    pub comparison: Option<RunDiffReport>,
}
1336
/// Aggregate result of a replay evaluation suite: counts plus the
/// individual case reports.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayEvalSuiteReport {
    pub pass: bool,
    pub total: usize,
    pub passed: usize,
    pub failed: usize,
    pub cases: Vec<ReplayEvalCaseReport>,
}
1346
/// One stage-level difference between two runs: the node, the kind of
/// change, and human-readable detail lines.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunStageDiffRecord {
    pub node_id: String,
    pub change: String,
    pub details: Vec<String>,
}
1354
/// Structured comparison of two runs: status change, per-stage diffs, and
/// signed deltas (right minus left, presumably — confirm in the diff code)
/// for transition/artifact/checkpoint counts.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunDiffReport {
    pub left_run_id: String,
    pub right_run_id: String,
    pub identical: bool,
    pub status_changed: bool,
    pub left_status: String,
    pub right_status: String,
    pub stage_diffs: Vec<RunStageDiffRecord>,
    pub transition_count_delta: isize,
    pub artifact_count_delta: isize,
    pub checkpoint_count_delta: isize,
}
1369
/// On-disk manifest listing replay evaluation cases; `base_dir` is
/// presumably the root that relative case paths resolve against — confirm
/// against the suite loader. Serializes `type_name` under `_type`.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct EvalSuiteManifest {
    #[serde(rename = "_type")]
    pub type_name: String,
    pub id: String,
    pub name: Option<String>,
    pub base_dir: Option<String>,
    pub cases: Vec<EvalSuiteCase>,
}
1380
/// One case in an eval suite manifest: the run to replay, an optional
/// fixture to assert against, and an optional baseline run to diff with.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct EvalSuiteCase {
    pub label: Option<String>,
    pub run_path: String,
    pub fixture_path: Option<String>,
    pub compare_to: Option<String>,
}
1389
/// Top-level persisted record of a workflow run: identity and lineage
/// (parent/root run ids), all stage/transition/checkpoint history, pending
/// vs. completed nodes, child runs, artifacts, the effective capability
/// policy, execution environment, usage, optional replay fixture, trace
/// spans, and where the record was persisted. Serializes `type_name`
/// under `_type`.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct RunRecord {
    #[serde(rename = "_type")]
    pub type_name: String,
    pub id: String,
    pub workflow_id: String,
    pub workflow_name: Option<String>,
    pub task: String,
    pub status: String,
    pub started_at: String,
    pub finished_at: Option<String>,
    pub parent_run_id: Option<String>,
    pub root_run_id: Option<String>,
    pub stages: Vec<RunStageRecord>,
    pub transitions: Vec<RunTransitionRecord>,
    pub checkpoints: Vec<RunCheckpointRecord>,
    pub pending_nodes: Vec<String>,
    pub completed_nodes: Vec<String>,
    pub child_runs: Vec<RunChildRecord>,
    pub artifacts: Vec<ArtifactRecord>,
    pub policy: CapabilityPolicy,
    pub execution: Option<RunExecutionRecord>,
    pub transcript: Option<serde_json::Value>,
    pub usage: Option<LlmUsageRecord>,
    pub replay_fixture: Option<ReplayFixture>,
    pub trace_spans: Vec<RunTraceSpanRecord>,
    pub metadata: BTreeMap<String, serde_json::Value>,
    pub persisted_path: Option<String>,
}
1420
/// Timing span captured during a run: span/parent ids, a kind and name,
/// start offset and duration in milliseconds, plus free-form metadata.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunTraceSpanRecord {
    pub span_id: u64,
    pub parent_id: Option<u64>,
    pub kind: String,
    pub name: String,
    pub start_ms: u64,
    pub duration_ms: u64,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
1432
/// Summary of a child (worker) run spawned from a parent stage: worker
/// identity, mutation-session linkage (session ids, scope, approval mode),
/// the delegated task, status/timing, and where the child's run record and
/// snapshot were written.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunChildRecord {
    pub worker_id: String,
    pub worker_name: String,
    pub parent_stage_id: Option<String>,
    pub session_id: Option<String>,
    pub parent_session_id: Option<String>,
    pub mutation_scope: Option<String>,
    pub approval_mode: Option<String>,
    pub task: String,
    pub status: String,
    pub started_at: String,
    pub finished_at: Option<String>,
    pub run_id: Option<String>,
    pub run_path: Option<String>,
    pub snapshot_path: Option<String>,
    pub execution: Option<RunExecutionRecord>,
}
1452
/// Host execution environment for a run: working/source directories,
/// environment variables, the execution adapter, git repo/worktree details
/// (paths, branch, base ref), and a cleanup mode string.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunExecutionRecord {
    pub cwd: Option<String>,
    pub source_dir: Option<String>,
    pub env: BTreeMap<String, String>,
    pub adapter: Option<String>,
    pub repo_path: Option<String>,
    pub worktree_path: Option<String>,
    pub branch: Option<String>,
    pub base_ref: Option<String>,
    pub cleanup: Option<String>,
}
1466
/// Result of `validate_workflow`: hard errors (graph is unusable),
/// warnings (suspicious but runnable), and the ids of nodes reachable
/// from the entry.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct WorkflowValidationReport {
    pub valid: bool,
    pub errors: Vec<String>,
    pub warnings: Vec<String>,
    pub reachable_nodes: Vec<String>,
}
1475
1476fn parse_json_payload<T: for<'de> Deserialize<'de>>(
1477    json: serde_json::Value,
1478    label: &str,
1479) -> Result<T, VmError> {
1480    let payload = json.to_string();
1481    let mut deserializer = serde_json::Deserializer::from_str(&payload);
1482    let mut tracker = serde_path_to_error::Track::new();
1483    let path_deserializer = serde_path_to_error::Deserializer::new(&mut deserializer, &mut tracker);
1484    T::deserialize(path_deserializer).map_err(|error| {
1485        let snippet = if payload.len() > 600 {
1486            format!("{}...", &payload[..600])
1487        } else {
1488            payload.clone()
1489        };
1490        VmError::Runtime(format!(
1491            "{label} parse error at {}: {} | payload={}",
1492            tracker.path(),
1493            error,
1494            snippet
1495        ))
1496    })
1497}
1498
1499fn parse_json_value<T: for<'de> Deserialize<'de>>(value: &VmValue) -> Result<T, VmError> {
1500    parse_json_payload(vm_value_to_json(value), "orchestration")
1501}
1502
1503pub fn parse_workflow_node_value(value: &VmValue, label: &str) -> Result<WorkflowNode, VmError> {
1504    parse_json_payload(vm_value_to_json(value), label)
1505}
1506
1507pub fn parse_workflow_node_json(
1508    json: serde_json::Value,
1509    label: &str,
1510) -> Result<WorkflowNode, VmError> {
1511    parse_json_payload(json, label)
1512}
1513
1514pub fn parse_workflow_edge_json(
1515    json: serde_json::Value,
1516    label: &str,
1517) -> Result<WorkflowEdge, VmError> {
1518    parse_json_payload(json, label)
1519}
1520
/// Parses a `VmValue` into a `WorkflowGraph` and fills in all defaults.
///
/// Two input shapes are accepted:
/// - a full graph (explicit `nodes`/`edges`), which is only default-filled;
/// - a shorthand dict with top-level `act` / `verify` / `repair` entries
///   plus shared model settings, which is expanded into an
///   act → verify → (repair ⇄ verify) graph.
///
/// Returns an error only when JSON deserialization of the graph or a node
/// fails (propagated from `parse_json_value`).
pub fn normalize_workflow_value(value: &VmValue) -> Result<WorkflowGraph, VmError> {
    let mut graph: WorkflowGraph = parse_json_value(value)?;
    let as_dict = value.as_dict().cloned().unwrap_or_default();

    // Shorthand expansion: only when the payload declared no nodes at all.
    if graph.nodes.is_empty() {
        for key in ["act", "verify", "repair"] {
            if let Some(node_value) = as_dict.get(key) {
                let mut node: WorkflowNode = parse_json_value(node_value)?;
                let raw_node = node_value.as_dict().cloned().unwrap_or_default();
                node.id = Some(key.to_string());
                if node.kind.is_empty() {
                    node.kind = if key == "verify" {
                        "verify".to_string()
                    } else {
                        "stage".to_string()
                    };
                }
                // Inherit model settings from the top-level dict whenever the
                // node did not set its own (empty display strings count as unset).
                if node.model_policy.provider.is_none() {
                    node.model_policy.provider = as_dict
                        .get("provider")
                        .map(|value| value.display())
                        .filter(|value| !value.is_empty());
                }
                if node.model_policy.model.is_none() {
                    node.model_policy.model = as_dict
                        .get("model")
                        .map(|value| value.display())
                        .filter(|value| !value.is_empty());
                }
                if node.model_policy.model_tier.is_none() {
                    // `tier` is accepted as an alias for `model_tier`.
                    node.model_policy.model_tier = as_dict
                        .get("model_tier")
                        .or_else(|| as_dict.get("tier"))
                        .map(|value| value.display())
                        .filter(|value| !value.is_empty());
                }
                if node.model_policy.temperature.is_none() {
                    // Accept either a float or an int temperature.
                    node.model_policy.temperature = as_dict.get("temperature").and_then(|value| {
                        if let VmValue::Float(number) = value {
                            Some(*number)
                        } else {
                            value.as_int().map(|number| number as f64)
                        }
                    });
                }
                if node.model_policy.max_tokens.is_none() {
                    node.model_policy.max_tokens =
                        as_dict.get("max_tokens").and_then(|value| value.as_int());
                }
                if node.mode.is_none() {
                    node.mode = as_dict
                        .get("mode")
                        .map(|value| value.display())
                        .filter(|value| !value.is_empty());
                }
                if node.done_sentinel.is_none() {
                    node.done_sentinel = as_dict
                        .get("done_sentinel")
                        .map(|value| value.display())
                        .filter(|value| !value.is_empty());
                }
                // Shorthand verify nodes may carry their assertion fields
                // inline; lift them into the structured `verify` payload.
                if key == "verify"
                    && node.verify.is_none()
                    && (raw_node.contains_key("assert_text")
                        || raw_node.contains_key("command")
                        || raw_node.contains_key("expect_status")
                        || raw_node.contains_key("expect_text"))
                {
                    node.verify = Some(serde_json::json!({
                        "assert_text": raw_node.get("assert_text").map(vm_value_to_json),
                        "command": raw_node.get("command").map(vm_value_to_json),
                        "expect_status": raw_node.get("expect_status").map(vm_value_to_json),
                        "expect_text": raw_node.get("expect_text").map(vm_value_to_json),
                    }));
                }
                graph.nodes.insert(key.to_string(), node);
            }
        }
        if graph.entry.is_empty() && graph.nodes.contains_key("act") {
            graph.entry = "act".to_string();
        }
        // Wire the default act → verify edge and, when present, the
        // verify(failed) → repair(retry) → verify repair loop.
        if graph.edges.is_empty() && graph.nodes.contains_key("act") {
            if graph.nodes.contains_key("verify") {
                graph.edges.push(WorkflowEdge {
                    from: "act".to_string(),
                    to: "verify".to_string(),
                    branch: None,
                    label: None,
                });
            }
            if graph.nodes.contains_key("repair") {
                graph.edges.push(WorkflowEdge {
                    from: "verify".to_string(),
                    to: "repair".to_string(),
                    branch: Some("failed".to_string()),
                    label: None,
                });
                graph.edges.push(WorkflowEdge {
                    from: "repair".to_string(),
                    to: "verify".to_string(),
                    branch: Some("retry".to_string()),
                    label: None,
                });
            }
        }
    }

    // Graph-level defaults: applied to both input shapes.
    if graph.type_name.is_empty() {
        graph.type_name = "workflow_graph".to_string();
    }
    if graph.id.is_empty() {
        graph.id = new_id("workflow");
    }
    if graph.version == 0 {
        graph.version = 1;
    }
    if graph.entry.is_empty() {
        // Fall back to the first node id (BTreeMap order) or "act".
        graph.entry = graph
            .nodes
            .keys()
            .next()
            .cloned()
            .unwrap_or_else(|| "act".to_string());
    }
    // Node-level defaults: id, kind, join/reduce strategies, output
    // contract kinds derived from the node kind, and >=1 retry attempt.
    for (node_id, node) in &mut graph.nodes {
        if node.id.is_none() {
            node.id = Some(node_id.clone());
        }
        if node.kind.is_empty() {
            node.kind = "stage".to_string();
        }
        if node.join_policy.strategy.is_empty() {
            node.join_policy.strategy = "all".to_string();
        }
        if node.reduce_policy.strategy.is_empty() {
            node.reduce_policy.strategy = "concat".to_string();
        }
        if node.output_contract.output_kinds.is_empty() {
            node.output_contract.output_kinds = vec![match node.kind.as_str() {
                "verify" => "verification_result".to_string(),
                "reduce" => node
                    .reduce_policy
                    .output_kind
                    .clone()
                    .unwrap_or_else(|| "summary".to_string()),
                "map" => node
                    .map_policy
                    .output_kind
                    .clone()
                    .unwrap_or_else(|| "artifact".to_string()),
                "escalation" => "plan".to_string(),
                _ => "artifact".to_string(),
            }];
        }
        if node.retry_policy.max_attempts == 0 {
            node.retry_policy.max_attempts = 1;
        }
    }
    Ok(graph)
}
1681
1682pub fn validate_workflow(
1683    graph: &WorkflowGraph,
1684    ceiling: Option<&CapabilityPolicy>,
1685) -> WorkflowValidationReport {
1686    let mut errors = Vec::new();
1687    let mut warnings = Vec::new();
1688
1689    if !graph.nodes.contains_key(&graph.entry) {
1690        errors.push(format!("entry node does not exist: {}", graph.entry));
1691    }
1692
1693    let node_ids: BTreeSet<String> = graph.nodes.keys().cloned().collect();
1694    for edge in &graph.edges {
1695        if !node_ids.contains(&edge.from) {
1696            errors.push(format!("edge.from references unknown node: {}", edge.from));
1697        }
1698        if !node_ids.contains(&edge.to) {
1699            errors.push(format!("edge.to references unknown node: {}", edge.to));
1700        }
1701    }
1702
1703    let reachable_nodes = reachable_nodes(graph);
1704    for node_id in &node_ids {
1705        if !reachable_nodes.contains(node_id) {
1706            warnings.push(format!("node is unreachable: {node_id}"));
1707        }
1708    }
1709
1710    for (node_id, node) in &graph.nodes {
1711        let incoming = graph
1712            .edges
1713            .iter()
1714            .filter(|edge| edge.to == *node_id)
1715            .count();
1716        let outgoing: Vec<&WorkflowEdge> = graph
1717            .edges
1718            .iter()
1719            .filter(|edge| edge.from == *node_id)
1720            .collect();
1721        if let Some(min_inputs) = node.input_contract.min_inputs {
1722            if let Some(max_inputs) = node.input_contract.max_inputs {
1723                if min_inputs > max_inputs {
1724                    errors.push(format!(
1725                        "node {node_id}: input contract min_inputs exceeds max_inputs"
1726                    ));
1727                }
1728            }
1729        }
1730        match node.kind.as_str() {
1731            "condition" => {
1732                let has_true = outgoing
1733                    .iter()
1734                    .any(|edge| edge.branch.as_deref() == Some("true"));
1735                let has_false = outgoing
1736                    .iter()
1737                    .any(|edge| edge.branch.as_deref() == Some("false"));
1738                if !has_true || !has_false {
1739                    errors.push(format!(
1740                        "node {node_id}: condition nodes require both 'true' and 'false' branch edges"
1741                    ));
1742                }
1743            }
1744            "fork" => {
1745                if outgoing.len() < 2 {
1746                    errors.push(format!(
1747                        "node {node_id}: fork nodes require at least two outgoing edges"
1748                    ));
1749                }
1750            }
1751            "join" => {
1752                if incoming < 2 {
1753                    warnings.push(format!(
1754                        "node {node_id}: join node has fewer than two incoming edges"
1755                    ));
1756                }
1757            }
1758            "map" => {
1759                if node.map_policy.items.is_empty()
1760                    && node.map_policy.item_artifact_kind.is_none()
1761                    && node.input_contract.input_kinds.is_empty()
1762                {
1763                    errors.push(format!(
1764                        "node {node_id}: map nodes require items, item_artifact_kind, or input_contract.input_kinds"
1765                    ));
1766                }
1767            }
1768            "reduce" => {
1769                if node.input_contract.input_kinds.is_empty() {
1770                    warnings.push(format!(
1771                        "node {node_id}: reduce node has no input_contract.input_kinds; it will consume all available artifacts"
1772                    ));
1773                }
1774            }
1775            _ => {}
1776        }
1777    }
1778
1779    if let Some(ceiling) = ceiling {
1780        if let Err(error) = ceiling.intersect(&graph.capability_policy) {
1781            errors.push(error);
1782        }
1783        for (node_id, node) in &graph.nodes {
1784            if let Err(error) = ceiling.intersect(&node.capability_policy) {
1785                errors.push(format!("node {node_id}: {error}"));
1786            }
1787        }
1788    }
1789
1790    WorkflowValidationReport {
1791        valid: errors.is_empty(),
1792        errors,
1793        warnings,
1794        reachable_nodes: reachable_nodes.into_iter().collect(),
1795    }
1796}
1797
1798fn reachable_nodes(graph: &WorkflowGraph) -> BTreeSet<String> {
1799    let mut seen = BTreeSet::new();
1800    let mut stack = vec![graph.entry.clone()];
1801    while let Some(node_id) = stack.pop() {
1802        if !seen.insert(node_id.clone()) {
1803            continue;
1804        }
1805        for edge in graph.edges.iter().filter(|edge| edge.from == node_id) {
1806            stack.push(edge.to.clone());
1807        }
1808    }
1809    seen
1810}
1811
/// Filters and ranks artifacts per `policy`, then greedily packs them into
/// the policy's artifact-count and token budgets.
///
/// Ranking, highest first: pinned ids, prioritized kinds, explicit
/// priority, freshness (when `prefer_fresh`), recency (when
/// `prefer_recent`), relevance, and finally smaller estimated token cost
/// as the tie-breaker.
pub fn select_artifacts(
    mut artifacts: Vec<ArtifactRecord>,
    policy: &ContextPolicy,
) -> Vec<ArtifactRecord> {
    // Kind/stage filtering. An empty include list means "include all";
    // exclude_kinds always wins over include_kinds.
    artifacts.retain(|artifact| {
        (policy.include_kinds.is_empty() || policy.include_kinds.contains(&artifact.kind))
            && !policy.exclude_kinds.contains(&artifact.kind)
            && (policy.include_stages.is_empty()
                || artifact
                    .stage
                    .as_ref()
                    .is_some_and(|stage| policy.include_stages.contains(stage)))
    });
    // Comparator clauses are ordered by precedence; `b.cmp(&a)` puts the
    // "better" artifact first (descending sort on the criterion).
    artifacts.sort_by(|a, b| {
        let b_pinned = policy.pinned_ids.contains(&b.id);
        let a_pinned = policy.pinned_ids.contains(&a.id);
        b_pinned
            .cmp(&a_pinned)
            .then_with(|| {
                let b_prio_kind = policy.prioritize_kinds.contains(&b.kind);
                let a_prio_kind = policy.prioritize_kinds.contains(&a.kind);
                b_prio_kind.cmp(&a_prio_kind)
            })
            .then_with(|| {
                // Missing priority counts as the default (lowest) value.
                b.priority
                    .unwrap_or_default()
                    .cmp(&a.priority.unwrap_or_default())
            })
            .then_with(|| {
                if policy.prefer_fresh {
                    // freshness_rank is defined elsewhere in this module;
                    // higher rank appears to mean fresher — confirm there.
                    freshness_rank(b.freshness.as_deref())
                        .cmp(&freshness_rank(a.freshness.as_deref()))
                } else {
                    std::cmp::Ordering::Equal
                }
            })
            .then_with(|| {
                if policy.prefer_recent {
                    b.created_at.cmp(&a.created_at)
                } else {
                    std::cmp::Ordering::Equal
                }
            })
            .then_with(|| {
                // Relevance is a float; NaN comparisons fall back to Equal.
                b.relevance
                    .partial_cmp(&a.relevance)
                    .unwrap_or(std::cmp::Ordering::Equal)
            })
            .then_with(|| {
                // Ascending on token cost: unknown cost sorts last.
                a.estimated_tokens
                    .unwrap_or(usize::MAX)
                    .cmp(&b.estimated_tokens.unwrap_or(usize::MAX))
            })
    });

    // Greedy knapsack over the ranked list. reserve_tokens is held back
    // from the token budget for the caller's own use.
    let mut selected = Vec::new();
    let mut used_tokens = 0usize;
    let reserve_tokens = policy.reserve_tokens.unwrap_or(0);
    let effective_max_tokens = policy
        .max_tokens
        .map(|max| max.saturating_sub(reserve_tokens));
    for artifact in artifacts {
        if let Some(max_artifacts) = policy.max_artifacts {
            if selected.len() >= max_artifacts {
                break;
            }
        }
        // Artifacts with no token estimate are treated as free.
        let next_tokens = artifact.estimated_tokens.unwrap_or(0);
        if let Some(max_tokens) = effective_max_tokens {
            if used_tokens + next_tokens > max_tokens {
                // Skip (not break): a smaller artifact later in the ranking
                // may still fit the remaining budget.
                continue;
            }
        }
        used_tokens += next_tokens;
        selected.push(artifact);
    }
    selected
}
1890
1891pub fn render_artifacts_context(artifacts: &[ArtifactRecord], policy: &ContextPolicy) -> String {
1892    let mut parts = Vec::new();
1893    for artifact in artifacts {
1894        let title = artifact
1895            .title
1896            .clone()
1897            .unwrap_or_else(|| format!("{} {}", artifact.kind, artifact.id));
1898        let body = artifact
1899            .text
1900            .clone()
1901            .or_else(|| artifact.data.as_ref().map(|v| v.to_string()))
1902            .unwrap_or_default();
1903        match policy.render.as_deref() {
1904            Some("json") => {
1905                parts.push(
1906                    serde_json::json!({
1907                        "id": artifact.id,
1908                        "kind": artifact.kind,
1909                        "title": title,
1910                        "source": artifact.source,
1911                        "freshness": artifact.freshness,
1912                        "priority": artifact.priority,
1913                        "text": body,
1914                    })
1915                    .to_string(),
1916                );
1917            }
1918            _ => parts.push(format!(
1919                "[{title}] kind={} source={} freshness={} priority={}\n{}",
1920                artifact.kind,
1921                artifact
1922                    .source
1923                    .clone()
1924                    .unwrap_or_else(|| "unknown".to_string()),
1925                artifact
1926                    .freshness
1927                    .clone()
1928                    .unwrap_or_else(|| "normal".to_string()),
1929                artifact.priority.unwrap_or_default(),
1930                body
1931            )),
1932        }
1933    }
1934    parts.join("\n\n")
1935}
1936
1937pub fn normalize_artifact(value: &VmValue) -> Result<ArtifactRecord, VmError> {
1938    let artifact: ArtifactRecord = parse_json_value(value)?;
1939    Ok(artifact.normalize())
1940}
1941
1942pub fn normalize_run_record(value: &VmValue) -> Result<RunRecord, VmError> {
1943    let json = vm_value_to_json(value);
1944    let payload = json.to_string();
1945    let mut deserializer = serde_json::Deserializer::from_str(&payload);
1946    let mut tracker = serde_path_to_error::Track::new();
1947    let path_deserializer = serde_path_to_error::Deserializer::new(&mut deserializer, &mut tracker);
1948    let mut run: RunRecord = RunRecord::deserialize(path_deserializer).map_err(|error| {
1949        let snippet = if payload.len() > 600 {
1950            format!("{}...", &payload[..600])
1951        } else {
1952            payload.clone()
1953        };
1954        VmError::Runtime(format!(
1955            "orchestration parse error at {}: {} | payload={}",
1956            tracker.path(),
1957            error,
1958            snippet
1959        ))
1960    })?;
1961    if run.type_name.is_empty() {
1962        run.type_name = "run_record".to_string();
1963    }
1964    if run.id.is_empty() {
1965        run.id = new_id("run");
1966    }
1967    if run.started_at.is_empty() {
1968        run.started_at = now_rfc3339();
1969    }
1970    if run.status.is_empty() {
1971        run.status = "running".to_string();
1972    }
1973    if run.root_run_id.is_none() {
1974        run.root_run_id = Some(run.id.clone());
1975    }
1976    if run.replay_fixture.is_none() {
1977        run.replay_fixture = Some(replay_fixture_from_run(&run));
1978    }
1979    Ok(run)
1980}
1981
1982pub fn normalize_eval_suite_manifest(value: &VmValue) -> Result<EvalSuiteManifest, VmError> {
1983    let mut manifest: EvalSuiteManifest = parse_json_value(value)?;
1984    if manifest.type_name.is_empty() {
1985        manifest.type_name = "eval_suite_manifest".to_string();
1986    }
1987    if manifest.id.is_empty() {
1988        manifest.id = new_id("eval_suite");
1989    }
1990    Ok(manifest)
1991}
1992
1993fn load_replay_fixture(path: &Path) -> Result<ReplayFixture, VmError> {
1994    let content = std::fs::read_to_string(path)
1995        .map_err(|e| VmError::Runtime(format!("failed to read replay fixture: {e}")))?;
1996    serde_json::from_str(&content)
1997        .map_err(|e| VmError::Runtime(format!("failed to parse replay fixture: {e}")))
1998}
1999
/// Resolves a manifest-relative path: absolute paths pass through
/// unchanged, relative paths are joined onto `base_dir` when one is given.
fn resolve_manifest_path(base_dir: Option<&Path>, path: &str) -> PathBuf {
    let candidate = PathBuf::from(path);
    if candidate.is_absolute() {
        return candidate;
    }
    match base_dir {
        Some(base) => base.join(candidate),
        None => candidate,
    }
}
2010
/// Runs every case in an eval-suite manifest: loads the recorded run,
/// evaluates it against its replay fixture, optionally diffs it against a
/// baseline run, and aggregates the per-case reports into a suite report.
///
/// Relative paths in the manifest are resolved against `manifest.base_dir`.
/// Returns an error if any referenced run or fixture file fails to load.
pub fn evaluate_run_suite_manifest(
    manifest: &EvalSuiteManifest,
) -> Result<ReplayEvalSuiteReport, VmError> {
    let base_dir = manifest.base_dir.as_deref().map(Path::new);
    let mut reports = Vec::new();
    for case in &manifest.cases {
        let run_path = resolve_manifest_path(base_dir, &case.run_path);
        let run = load_run_record(&run_path)?;
        // Fixture precedence: explicit fixture file, then the fixture
        // embedded in the run record, then one synthesized from the run.
        let fixture = match &case.fixture_path {
            Some(path) => load_replay_fixture(&resolve_manifest_path(base_dir, path))?,
            None => run
                .replay_fixture
                .clone()
                .unwrap_or_else(|| replay_fixture_from_run(&run)),
        };
        let eval = evaluate_run_against_fixture(&run, &fixture);
        let mut pass = eval.pass;
        let mut failures = eval.failures;
        // Optional regression gate: any stage-level difference from the
        // baseline run fails the case.
        let comparison = match &case.compare_to {
            Some(path) => {
                let baseline_path = resolve_manifest_path(base_dir, path);
                let baseline = load_run_record(&baseline_path)?;
                let diff = diff_run_records(&baseline, &run);
                if !diff.identical {
                    pass = false;
                    failures.push(format!(
                        "run differs from baseline {} with {} stage changes",
                        baseline_path.display(),
                        diff.stage_diffs.len()
                    ));
                }
                Some(diff)
            }
            None => None,
        };
        reports.push(ReplayEvalCaseReport {
            run_id: run.id.clone(),
            workflow_id: run.workflow_id.clone(),
            label: case.label.clone(),
            pass,
            failures,
            stage_count: eval.stage_count,
            source_path: Some(run_path.display().to_string()),
            comparison,
        });
    }
    // Suite passes only when every case passed.
    let total = reports.len();
    let passed = reports.iter().filter(|report| report.pass).count();
    let failed = total.saturating_sub(passed);
    Ok(ReplayEvalSuiteReport {
        pass: failed == 0,
        total,
        passed,
        failed,
        cases: reports,
    })
}
2068
/// Produces a unified-diff-style rendering (header lines plus `-`/`+`/` `
/// prefixed body lines, without hunk headers) of `before` vs `after`,
/// labelling both sides with `path` (defaulting to "artifact").
///
/// Line alignment is decided by a dynamic-programming longest-common-
/// subsequence table computed back-to-front over both line lists.
pub fn render_unified_diff(path: Option<&str>, before: &str, after: &str) -> String {
    let old_lines: Vec<&str> = before.lines().collect();
    let new_lines: Vec<&str> = after.lines().collect();

    // lcs[i][j] = length of the LCS of old_lines[i..] and new_lines[j..].
    let mut lcs = vec![vec![0usize; new_lines.len() + 1]; old_lines.len() + 1];
    for i in (0..old_lines.len()).rev() {
        for j in (0..new_lines.len()).rev() {
            lcs[i][j] = if old_lines[i] == new_lines[j] {
                lcs[i + 1][j + 1] + 1
            } else {
                lcs[i + 1][j].max(lcs[i][j + 1])
            };
        }
    }

    let label = path.unwrap_or("artifact");
    let mut out = format!("--- a/{label}\n+++ b/{label}\n");
    let mut emit = |prefix: char, line: &str| {
        out.push(prefix);
        out.push_str(line);
        out.push('\n');
    };

    // Walk both sides, preferring deletions when the LCS lengths tie.
    let (mut i, mut j) = (0usize, 0usize);
    while i < old_lines.len() && j < new_lines.len() {
        if old_lines[i] == new_lines[j] {
            emit(' ', old_lines[i]);
            i += 1;
            j += 1;
        } else if lcs[i + 1][j] >= lcs[i][j + 1] {
            emit('-', old_lines[i]);
            i += 1;
        } else {
            emit('+', new_lines[j]);
            j += 1;
        }
    }
    // Flush whichever side has lines remaining.
    for line in &old_lines[i..] {
        emit('-', line);
    }
    for line in &new_lines[j..] {
        emit('+', line);
    }
    out
}
2111
2112pub fn save_run_record(run: &RunRecord, path: Option<&str>) -> Result<String, VmError> {
2113    let path = path
2114        .map(PathBuf::from)
2115        .unwrap_or_else(|| default_run_dir().join(format!("{}.json", run.id)));
2116    if let Some(parent) = path.parent() {
2117        std::fs::create_dir_all(parent)
2118            .map_err(|e| VmError::Runtime(format!("failed to create run directory: {e}")))?;
2119    }
2120    let json = serde_json::to_string_pretty(run)
2121        .map_err(|e| VmError::Runtime(format!("failed to encode run record: {e}")))?;
2122    // Atomic write: write to .tmp then rename to prevent corruption on kill.
2123    let tmp_path = path.with_extension("json.tmp");
2124    std::fs::write(&tmp_path, &json)
2125        .map_err(|e| VmError::Runtime(format!("failed to persist run record: {e}")))?;
2126    std::fs::rename(&tmp_path, &path).map_err(|e| {
2127        // Fallback: if rename fails (cross-device), write directly.
2128        let _ = std::fs::write(&path, &json);
2129        VmError::Runtime(format!("failed to finalize run record: {e}"))
2130    })?;
2131    Ok(path.to_string_lossy().to_string())
2132}
2133
2134pub fn load_run_record(path: &Path) -> Result<RunRecord, VmError> {
2135    let content = std::fs::read_to_string(path)
2136        .map_err(|e| VmError::Runtime(format!("failed to read run record: {e}")))?;
2137    serde_json::from_str(&content)
2138        .map_err(|e| VmError::Runtime(format!("failed to parse run record: {e}")))
2139}
2140
2141pub fn replay_fixture_from_run(run: &RunRecord) -> ReplayFixture {
2142    ReplayFixture {
2143        type_name: "replay_fixture".to_string(),
2144        id: new_id("fixture"),
2145        source_run_id: run.id.clone(),
2146        workflow_id: run.workflow_id.clone(),
2147        workflow_name: run.workflow_name.clone(),
2148        created_at: now_rfc3339(),
2149        expected_status: run.status.clone(),
2150        stage_assertions: run
2151            .stages
2152            .iter()
2153            .map(|stage| ReplayStageAssertion {
2154                node_id: stage.node_id.clone(),
2155                expected_status: stage.status.clone(),
2156                expected_outcome: stage.outcome.clone(),
2157                expected_branch: stage.branch.clone(),
2158                required_artifact_kinds: stage
2159                    .artifacts
2160                    .iter()
2161                    .map(|artifact| artifact.kind.clone())
2162                    .collect(),
2163                visible_text_contains: stage
2164                    .visible_text
2165                    .as_ref()
2166                    .filter(|text| !text.is_empty())
2167                    .map(|text| text.chars().take(80).collect()),
2168            })
2169            .collect(),
2170    }
2171}
2172
/// Checks a recorded run against a replay fixture and returns a report
/// listing every mismatch found.
///
/// Checks, in order: overall run status; then per stage assertion (matched
/// by node id) the stage's existence, status, outcome, branch, presence of
/// every required artifact kind, and containment of the expected
/// visible-text snippet. `pass` is true only when no failure was recorded.
pub fn evaluate_run_against_fixture(run: &RunRecord, fixture: &ReplayFixture) -> ReplayEvalReport {
    let mut failures = Vec::new();
    if run.status != fixture.expected_status {
        failures.push(format!(
            "run status mismatch: expected {}, got {}",
            fixture.expected_status, run.status
        ));
    }
    for assertion in &fixture.stage_assertions {
        // A missing stage fails this assertion wholesale; the remaining
        // per-field checks are skipped for it.
        let Some(stage) = run
            .stages
            .iter()
            .find(|stage| stage.node_id == assertion.node_id)
        else {
            failures.push(format!("missing stage {}", assertion.node_id));
            continue;
        };
        if stage.status != assertion.expected_status {
            failures.push(format!(
                "stage {} status mismatch: expected {}, got {}",
                assertion.node_id, assertion.expected_status, stage.status
            ));
        }
        if stage.outcome != assertion.expected_outcome {
            failures.push(format!(
                "stage {} outcome mismatch: expected {}, got {}",
                assertion.node_id, assertion.expected_outcome, stage.outcome
            ));
        }
        if stage.branch != assertion.expected_branch {
            failures.push(format!(
                "stage {} branch mismatch: expected {:?}, got {:?}",
                assertion.node_id, assertion.expected_branch, stage.branch
            ));
        }
        for required_kind in &assertion.required_artifact_kinds {
            if !stage
                .artifacts
                .iter()
                .any(|artifact| &artifact.kind == required_kind)
            {
                failures.push(format!(
                    "stage {} missing artifact kind {}",
                    assertion.node_id, required_kind
                ));
            }
        }
        if let Some(snippet) = &assertion.visible_text_contains {
            // Absent visible text is treated as empty, so any non-empty
            // snippet expectation fails.
            let actual = stage.visible_text.clone().unwrap_or_default();
            if !actual.contains(snippet) {
                failures.push(format!(
                    "stage {} visible text does not contain expected snippet {:?}",
                    assertion.node_id, snippet
                ));
            }
        }
    }

    ReplayEvalReport {
        pass: failures.is_empty(),
        failures,
        stage_count: run.stages.len(),
    }
}
2237
2238pub fn evaluate_run_suite(
2239    cases: Vec<(RunRecord, ReplayFixture, Option<String>)>,
2240) -> ReplayEvalSuiteReport {
2241    let mut reports = Vec::new();
2242    for (run, fixture, source_path) in cases {
2243        let report = evaluate_run_against_fixture(&run, &fixture);
2244        reports.push(ReplayEvalCaseReport {
2245            run_id: run.id.clone(),
2246            workflow_id: run.workflow_id.clone(),
2247            label: None,
2248            pass: report.pass,
2249            failures: report.failures,
2250            stage_count: report.stage_count,
2251            source_path,
2252            comparison: None,
2253        });
2254    }
2255    let total = reports.len();
2256    let passed = reports.iter().filter(|report| report.pass).count();
2257    let failed = total.saturating_sub(passed);
2258    ReplayEvalSuiteReport {
2259        pass: failed == 0,
2260        total,
2261        passed,
2262        failed,
2263        cases: reports,
2264    }
2265}
2266
/// Compares two run records stage-by-stage and summarizes the differences.
///
/// Stages are matched across both runs by node id. A stage present in only
/// one run is reported as "removed" (left-only) or "added" (right-only);
/// a matched pair is reported as "changed" when status, outcome, branch,
/// or artifact counts differ. `identical` additionally requires equal run
/// status and equal transition/artifact/checkpoint counts.
pub fn diff_run_records(left: &RunRecord, right: &RunRecord) -> RunDiffReport {
    let mut stage_diffs = Vec::new();
    // Union of node ids from both runs, in sorted order.
    let mut all_node_ids = BTreeSet::new();
    all_node_ids.extend(left.stages.iter().map(|stage| stage.node_id.clone()));
    all_node_ids.extend(right.stages.iter().map(|stage| stage.node_id.clone()));

    for node_id in all_node_ids {
        let left_stage = left.stages.iter().find(|stage| stage.node_id == node_id);
        let right_stage = right.stages.iter().find(|stage| stage.node_id == node_id);
        match (left_stage, right_stage) {
            (Some(_), None) => stage_diffs.push(RunStageDiffRecord {
                node_id,
                change: "removed".to_string(),
                details: vec!["stage missing from right run".to_string()],
            }),
            (None, Some(_)) => stage_diffs.push(RunStageDiffRecord {
                node_id,
                change: "added".to_string(),
                details: vec!["stage missing from left run".to_string()],
            }),
            (Some(left_stage), Some(right_stage)) => {
                // Collect one detail line per differing field; only emit a
                // diff record when at least one field changed.
                let mut details = Vec::new();
                if left_stage.status != right_stage.status {
                    details.push(format!(
                        "status: {} -> {}",
                        left_stage.status, right_stage.status
                    ));
                }
                if left_stage.outcome != right_stage.outcome {
                    details.push(format!(
                        "outcome: {} -> {}",
                        left_stage.outcome, right_stage.outcome
                    ));
                }
                if left_stage.branch != right_stage.branch {
                    details.push(format!(
                        "branch: {:?} -> {:?}",
                        left_stage.branch, right_stage.branch
                    ));
                }
                // Artifact comparisons are count-based only; equal counts
                // with different contents are not reported here.
                if left_stage.produced_artifact_ids.len() != right_stage.produced_artifact_ids.len()
                {
                    details.push(format!(
                        "produced_artifacts: {} -> {}",
                        left_stage.produced_artifact_ids.len(),
                        right_stage.produced_artifact_ids.len()
                    ));
                }
                if left_stage.artifacts.len() != right_stage.artifacts.len() {
                    details.push(format!(
                        "artifact_records: {} -> {}",
                        left_stage.artifacts.len(),
                        right_stage.artifacts.len()
                    ));
                }
                if !details.is_empty() {
                    stage_diffs.push(RunStageDiffRecord {
                        node_id,
                        change: "changed".to_string(),
                        details,
                    });
                }
            }
            // Unreachable: every id in the union exists on at least one side.
            (None, None) => {}
        }
    }

    let status_changed = left.status != right.status;
    let identical = !status_changed
        && stage_diffs.is_empty()
        && left.transitions.len() == right.transitions.len()
        && left.artifacts.len() == right.artifacts.len()
        && left.checkpoints.len() == right.checkpoints.len();

    RunDiffReport {
        left_run_id: left.id.clone(),
        right_run_id: right.id.clone(),
        identical,
        status_changed,
        left_status: left.status.clone(),
        right_status: right.status.clone(),
        stage_diffs,
        // Deltas are right minus left (positive means the right run has more).
        transition_count_delta: right.transitions.len() as isize - left.transitions.len() as isize,
        artifact_count_delta: right.artifacts.len() as isize - left.artifacts.len() as isize,
        checkpoint_count_delta: right.checkpoints.len() as isize - left.checkpoints.len() as isize,
    }
}
2354
2355pub fn push_execution_policy(policy: CapabilityPolicy) {
2356    EXECUTION_POLICY_STACK.with(|stack| stack.borrow_mut().push(policy));
2357}
2358
2359pub fn pop_execution_policy() {
2360    EXECUTION_POLICY_STACK.with(|stack| {
2361        stack.borrow_mut().pop();
2362    });
2363}
2364
2365pub fn current_execution_policy() -> Option<CapabilityPolicy> {
2366    EXECUTION_POLICY_STACK.with(|stack| stack.borrow().last().cloned())
2367}
2368
2369pub fn current_tool_metadata(tool: &str) -> Option<ToolRuntimePolicyMetadata> {
2370    current_execution_policy().and_then(|policy| policy.tool_metadata.get(tool).cloned())
2371}
2372
2373fn policy_allows_tool(policy: &CapabilityPolicy, tool: &str) -> bool {
2374    policy.tools.is_empty() || policy.tools.iter().any(|allowed| allowed == tool)
2375}
2376
2377fn policy_allows_capability(policy: &CapabilityPolicy, capability: &str, op: &str) -> bool {
2378    policy.capabilities.is_empty()
2379        || policy
2380            .capabilities
2381            .get(capability)
2382            .is_some_and(|ops| ops.is_empty() || ops.iter().any(|allowed| allowed == op))
2383}
2384
2385fn policy_allows_side_effect(policy: &CapabilityPolicy, requested: &str) -> bool {
2386    fn rank(v: &str) -> usize {
2387        match v {
2388            "none" => 0,
2389            "read_only" => 1,
2390            "workspace_write" => 2,
2391            "process_exec" => 3,
2392            "network" => 4,
2393            _ => 5,
2394        }
2395    }
2396    policy
2397        .side_effect_level
2398        .as_ref()
2399        .map(|allowed| rank(allowed) >= rank(requested))
2400        .unwrap_or(true)
2401}
2402
2403fn reject_policy(reason: String) -> Result<(), VmError> {
2404    Err(VmError::CategorizedError {
2405        message: reason,
2406        category: crate::value::ErrorCategory::ToolRejected,
2407    })
2408}
2409
/// Heuristically classifies a tool's mutation level from its name, used
/// when policy metadata declares no explicit classification.
///
/// Checks run in decreasing specificity: MCP tools, process execution,
/// destructive file operations, workspace writes, then read-only.
fn fallback_mutation_classification(tool_name: &str) -> String {
    let lower = tool_name.to_ascii_lowercase();
    let classification = if lower.starts_with("mcp_") {
        "host_defined"
    } else if matches!(lower.as_str(), "exec" | "shell" | "exec_at" | "shell_at" | "run")
        || lower.starts_with("run_")
    {
        "ambient_side_effect"
    } else if ["delete", "remove", "move", "rename"]
        .iter()
        .any(|prefix| lower.starts_with(prefix))
    {
        "destructive"
    } else if ["write", "edit", "patch", "create", "scaffold"]
        .iter()
        .any(|needle| lower.contains(needle))
        || lower.starts_with("insert")
        || lower.starts_with("replace")
        || lower == "add_import"
    {
        "apply_workspace"
    } else {
        "read_only"
    };
    classification.to_string()
}
2444
2445pub fn current_tool_mutation_classification(tool_name: &str) -> String {
2446    current_tool_metadata(tool_name)
2447        .and_then(|metadata| metadata.mutation_classification)
2448        .unwrap_or_else(|| fallback_mutation_classification(tool_name))
2449}
2450
2451pub fn current_tool_declared_paths(tool_name: &str, args: &serde_json::Value) -> Vec<String> {
2452    let Some(map) = args.as_object() else {
2453        return Vec::new();
2454    };
2455    let path_keys = current_tool_metadata(tool_name)
2456        .map(|metadata| metadata.path_params)
2457        .filter(|keys| !keys.is_empty())
2458        .unwrap_or_else(|| {
2459            vec![
2460                "path".to_string(),
2461                "file".to_string(),
2462                "cwd".to_string(),
2463                "repo".to_string(),
2464                "target".to_string(),
2465                "destination".to_string(),
2466            ]
2467        });
2468    let mut paths = Vec::new();
2469    for key in path_keys {
2470        if let Some(value) = map.get(&key).and_then(|value| value.as_str()) {
2471            if !value.is_empty() {
2472                paths.push(value.to_string());
2473            }
2474        }
2475    }
2476    if let Some(items) = map.get("paths").and_then(|value| value.as_array()) {
2477        for item in items {
2478            if let Some(value) = item.as_str() {
2479                if !value.is_empty() {
2480                    paths.push(value.to_string());
2481                }
2482            }
2483        }
2484    }
2485    paths.sort();
2486    paths.dedup();
2487    paths
2488}
2489
2490pub fn enforce_current_policy_for_builtin(name: &str, args: &[VmValue]) -> Result<(), VmError> {
2491    let Some(policy) = current_execution_policy() else {
2492        return Ok(());
2493    };
2494    match name {
2495        "read" | "read_file" => {
2496            if !policy_allows_tool(&policy, name)
2497                || !policy_allows_capability(&policy, "workspace", "read_text")
2498            {
2499                return reject_policy(format!(
2500                    "builtin '{name}' exceeds workspace.read_text ceiling"
2501                ));
2502            }
2503        }
2504        "search" | "list_dir" => {
2505            if !policy_allows_tool(&policy, name)
2506                || !policy_allows_capability(&policy, "workspace", "list")
2507            {
2508                return reject_policy(format!("builtin '{name}' exceeds workspace.list ceiling"));
2509            }
2510        }
2511        "file_exists" | "stat" => {
2512            if !policy_allows_capability(&policy, "workspace", "exists") {
2513                return reject_policy(format!("builtin '{name}' exceeds workspace.exists ceiling"));
2514            }
2515        }
2516        "edit" | "write_file" | "append_file" | "mkdir" | "copy_file" => {
2517            if !policy_allows_tool(&policy, "edit")
2518                || !policy_allows_capability(&policy, "workspace", "write_text")
2519                || !policy_allows_side_effect(&policy, "workspace_write")
2520            {
2521                return reject_policy(format!("builtin '{name}' exceeds workspace write ceiling"));
2522            }
2523        }
2524        "delete_file" => {
2525            if !policy_allows_capability(&policy, "workspace", "delete")
2526                || !policy_allows_side_effect(&policy, "workspace_write")
2527            {
2528                return reject_policy(
2529                    "builtin 'delete_file' exceeds workspace.delete ceiling".to_string(),
2530                );
2531            }
2532        }
2533        "apply_edit" => {
2534            if !policy_allows_capability(&policy, "workspace", "apply_edit")
2535                || !policy_allows_side_effect(&policy, "workspace_write")
2536            {
2537                return reject_policy(
2538                    "builtin 'apply_edit' exceeds workspace.apply_edit ceiling".to_string(),
2539                );
2540            }
2541        }
2542        "exec" | "exec_at" | "shell" | "shell_at" | "run_command" => {
2543            if !policy_allows_tool(&policy, "run")
2544                || !policy_allows_capability(&policy, "process", "exec")
2545                || !policy_allows_side_effect(&policy, "process_exec")
2546            {
2547                return reject_policy(format!("builtin '{name}' exceeds process.exec ceiling"));
2548            }
2549        }
2550        "http_get" | "http_post" | "http_put" | "http_patch" | "http_delete" | "http_request" => {
2551            if !policy_allows_side_effect(&policy, "network") {
2552                return reject_policy(format!("builtin '{name}' exceeds network ceiling"));
2553            }
2554        }
2555        "mcp_connect"
2556        | "mcp_call"
2557        | "mcp_list_tools"
2558        | "mcp_list_resources"
2559        | "mcp_list_resource_templates"
2560        | "mcp_read_resource"
2561        | "mcp_list_prompts"
2562        | "mcp_get_prompt"
2563        | "mcp_server_info"
2564        | "mcp_disconnect" => {
2565            if !policy_allows_tool(&policy, "run")
2566                || !policy_allows_capability(&policy, "process", "exec")
2567                || !policy_allows_side_effect(&policy, "process_exec")
2568            {
2569                return reject_policy(format!("builtin '{name}' exceeds process.exec ceiling"));
2570            }
2571        }
2572        "host_call" => {
2573            let name = args.first().map(|v| v.display()).unwrap_or_default();
2574            let Some((capability, op)) = name.split_once('.') else {
2575                return reject_policy(format!(
2576                    "host_call '{name}' must use capability.operation naming"
2577                ));
2578            };
2579            if !policy_allows_capability(&policy, capability, op) {
2580                return reject_policy(format!(
2581                    "host_call {capability}.{op} exceeds capability ceiling"
2582                ));
2583            }
2584            let requested_side_effect = match (capability, op) {
2585                ("workspace", "write_text" | "apply_edit" | "delete") => "workspace_write",
2586                ("process", "exec") => "process_exec",
2587                _ => "read_only",
2588            };
2589            if !policy_allows_side_effect(&policy, requested_side_effect) {
2590                return reject_policy(format!(
2591                    "host_call {capability}.{op} exceeds side-effect ceiling"
2592                ));
2593            }
2594        }
2595        _ => {}
2596    }
2597    Ok(())
2598}
2599
2600pub fn enforce_current_policy_for_bridge_builtin(name: &str) -> Result<(), VmError> {
2601    if current_execution_policy().is_some() {
2602        return reject_policy(format!(
2603            "bridged builtin '{name}' exceeds execution policy; declare an explicit capability/tool surface instead"
2604        ));
2605    }
2606    Ok(())
2607}
2608
2609pub fn enforce_current_policy_for_tool(tool_name: &str) -> Result<(), VmError> {
2610    let Some(policy) = current_execution_policy() else {
2611        return Ok(());
2612    };
2613    if !policy_allows_tool(&policy, tool_name) {
2614        return reject_policy(format!("tool '{tool_name}' exceeds tool ceiling"));
2615    }
2616    if let Some(metadata) = policy.tool_metadata.get(tool_name) {
2617        for (capability, ops) in &metadata.capabilities {
2618            for op in ops {
2619                if !policy_allows_capability(&policy, capability, op) {
2620                    return reject_policy(format!(
2621                        "tool '{tool_name}' exceeds capability ceiling: {capability}.{op}"
2622                    ));
2623                }
2624            }
2625        }
2626        if let Some(side_effect_level) = metadata.side_effect_level.as_deref() {
2627            if !policy_allows_side_effect(&policy, side_effect_level) {
2628                return reject_policy(format!(
2629                    "tool '{tool_name}' exceeds side-effect ceiling: {side_effect_level}"
2630                ));
2631            }
2632        }
2633    }
2634    Ok(())
2635}
2636
2637fn compact_transcript(transcript: &VmValue, keep_last: usize) -> Option<VmValue> {
2638    let dict = transcript.as_dict()?;
2639    let messages = match dict.get("messages") {
2640        Some(VmValue::List(list)) => list.iter().cloned().collect::<Vec<_>>(),
2641        _ => Vec::new(),
2642    };
2643    let retained = messages
2644        .into_iter()
2645        .rev()
2646        .take(keep_last)
2647        .collect::<Vec<_>>()
2648        .into_iter()
2649        .rev()
2650        .collect::<Vec<_>>();
2651    let mut compacted = dict.clone();
2652    compacted.insert(
2653        "messages".to_string(),
2654        VmValue::List(Rc::new(retained.clone())),
2655    );
2656    compacted.insert(
2657        "events".to_string(),
2658        VmValue::List(Rc::new(
2659            crate::llm::helpers::transcript_events_from_messages(&retained),
2660        )),
2661    );
2662    Some(VmValue::Dict(Rc::new(compacted)))
2663}
2664
2665fn redact_transcript_visibility(transcript: &VmValue, visibility: Option<&str>) -> Option<VmValue> {
2666    let Some(visibility) = visibility else {
2667        return Some(transcript.clone());
2668    };
2669    if visibility != "public" && visibility != "public_only" {
2670        return Some(transcript.clone());
2671    }
2672    let dict = transcript.as_dict()?;
2673    let public_messages = match dict.get("messages") {
2674        Some(VmValue::List(list)) => list
2675            .iter()
2676            .filter(|message| {
2677                message
2678                    .as_dict()
2679                    .and_then(|d| d.get("role"))
2680                    .map(|v| v.display())
2681                    .map(|role| role != "tool_result")
2682                    .unwrap_or(true)
2683            })
2684            .cloned()
2685            .collect::<Vec<_>>(),
2686        _ => Vec::new(),
2687    };
2688    let public_events = match dict.get("events") {
2689        Some(VmValue::List(list)) => list
2690            .iter()
2691            .filter(|event| {
2692                event
2693                    .as_dict()
2694                    .and_then(|d| d.get("visibility"))
2695                    .map(|v| v.display())
2696                    .map(|value| value == "public")
2697                    .unwrap_or(true)
2698            })
2699            .cloned()
2700            .collect::<Vec<_>>(),
2701        _ => Vec::new(),
2702    };
2703    let mut redacted = dict.clone();
2704    redacted.insert(
2705        "messages".to_string(),
2706        VmValue::List(Rc::new(public_messages)),
2707    );
2708    redacted.insert("events".to_string(), VmValue::List(Rc::new(public_events)));
2709    Some(VmValue::Dict(Rc::new(redacted)))
2710}
2711
2712pub(crate) fn apply_input_transcript_policy(
2713    transcript: Option<VmValue>,
2714    policy: &TranscriptPolicy,
2715) -> Option<VmValue> {
2716    let mut transcript = transcript;
2717    match policy.mode.as_deref() {
2718        Some("reset") => return None,
2719        Some("fork") => {
2720            if let Some(VmValue::Dict(dict)) = transcript.as_ref() {
2721                let mut forked = dict.as_ref().clone();
2722                forked.insert(
2723                    "id".to_string(),
2724                    VmValue::String(Rc::from(new_id("transcript"))),
2725                );
2726                transcript = Some(VmValue::Dict(Rc::new(forked)));
2727            }
2728        }
2729        _ => {}
2730    }
2731    if policy.compact {
2732        let keep_last = policy.keep_last.unwrap_or(6);
2733        transcript = transcript.and_then(|value| compact_transcript(&value, keep_last));
2734    }
2735    transcript
2736}
2737
2738fn apply_output_transcript_policy(
2739    transcript: Option<VmValue>,
2740    policy: &TranscriptPolicy,
2741) -> Option<VmValue> {
2742    let mut transcript = transcript;
2743    if policy.compact {
2744        let keep_last = policy.keep_last.unwrap_or(6);
2745        transcript = transcript.and_then(|value| compact_transcript(&value, keep_last));
2746    }
2747    transcript.and_then(|value| redact_transcript_visibility(&value, policy.visibility.as_deref()))
2748}
2749
/// Executes one workflow node end to end.
///
/// Steps: select input artifacts per the node's context policy, render them
/// into a context block, apply the input transcript policy, enforce the
/// node's input contract, then either run a shell verification command
/// (`kind == "verify"`) or call the LLM / agent loop. The raw result is
/// annotated with prompt/context metadata and wrapped into a new artifact.
///
/// Returns `(raw result payload, produced artifacts, output transcript)`.
///
/// # Errors
/// Returns `VmError::Runtime` when the input contract is violated (missing
/// required transcript, too few / too many selected artifacts) or when the
/// verify command cannot be spawned; LLM errors are propagated as-is.
pub async fn execute_stage_node(
    node_id: &str,
    node: &WorkflowNode,
    task: &str,
    artifacts: &[ArtifactRecord],
    transcript: Option<VmValue>,
) -> Result<(serde_json::Value, Vec<ArtifactRecord>, Option<VmValue>), VmError> {
    // When the context policy names no kinds, fall back to the input
    // contract's kinds so selection still honors the declared inputs.
    let mut selection_policy = node.context_policy.clone();
    if selection_policy.include_kinds.is_empty() && !node.input_contract.input_kinds.is_empty() {
        selection_policy.include_kinds = node.input_contract.input_kinds.clone();
    }
    let selected = select_artifacts_adaptive(artifacts.to_vec(), &selection_policy);
    let rendered_context = render_artifacts_context(&selected, &node.context_policy);
    let transcript = apply_input_transcript_policy(transcript, &node.transcript_policy);
    // Enforce the input contract before doing any expensive work.
    if node.input_contract.require_transcript && transcript.is_none() {
        return Err(VmError::Runtime(format!(
            "workflow stage {node_id} requires transcript input"
        )));
    }
    if let Some(min_inputs) = node.input_contract.min_inputs {
        if selected.len() < min_inputs {
            return Err(VmError::Runtime(format!(
                "workflow stage {node_id} requires at least {min_inputs} input artifacts"
            )));
        }
    }
    if let Some(max_inputs) = node.input_contract.max_inputs {
        if selected.len() > max_inputs {
            return Err(VmError::Runtime(format!(
                "workflow stage {node_id} accepts at most {max_inputs} input artifacts"
            )));
        }
    }
    // Prompt = rendered artifact context (when any) + labeled task text.
    let prompt = if rendered_context.is_empty() {
        task.to_string()
    } else {
        format!(
            "{rendered_context}\n\n{}:\n{task}",
            node.task_label
                .clone()
                .unwrap_or_else(|| "Task".to_string())
        )
    };

    // Tool-calling wire format, overridable via env; defaults to "text".
    let tool_format = std::env::var("HARN_AGENT_TOOL_FORMAT")
        .ok()
        .filter(|value| !value.trim().is_empty())
        .unwrap_or_else(|| "text".to_string());
    let mut llm_result = if node.kind == "verify" {
        // Verify nodes run a host shell command instead of calling the LLM.
        if let Some(command) = node
            .verify
            .as_ref()
            .and_then(|verify| verify.as_object())
            .and_then(|verify| verify.get("command"))
            .and_then(|value| value.as_str())
            .map(str::trim)
            .filter(|value| !value.is_empty())
        {
            let mut process = if cfg!(target_os = "windows") {
                let mut cmd = tokio::process::Command::new("cmd");
                cmd.arg("/C").arg(command);
                cmd
            } else {
                let mut cmd = tokio::process::Command::new("/bin/sh");
                cmd.arg("-lc").arg(command);
                cmd
            };
            // No interactive input for verify commands.
            process.stdin(std::process::Stdio::null());
            // Inherit cwd/env from the VM's execution context when present.
            if let Some(context) = crate::stdlib::process::current_execution_context() {
                if let Some(cwd) = context.cwd.filter(|cwd| !cwd.is_empty()) {
                    process.current_dir(cwd);
                }
                if !context.env.is_empty() {
                    process.envs(context.env);
                }
            }
            let output = process
                .output()
                .await
                .map_err(|e| VmError::Runtime(format!("workflow verify exec failed: {e}")))?;
            let stdout = String::from_utf8_lossy(&output.stdout).to_string();
            let stderr = String::from_utf8_lossy(&output.stderr).to_string();
            // Combined text prefers whichever stream is non-empty; both are
            // joined with a newline when present.
            let combined = if stderr.is_empty() {
                stdout.clone()
            } else if stdout.is_empty() {
                stderr.clone()
            } else {
                format!("{stdout}\n{stderr}")
            };
            serde_json::json!({
                "status": "completed",
                "text": combined,
                "visible_text": combined,
                "command": command,
                "stdout": stdout,
                "stderr": stderr,
                // -1 signals the process terminated without an exit code
                // (e.g. killed by a signal).
                "exit_status": output.status.code().unwrap_or(-1),
                "success": output.status.success(),
            })
        } else {
            // No command configured: report an empty but completed result.
            serde_json::json!({
                "status": "completed",
                "text": "",
                "visible_text": "",
            })
        }
    } else {
        // Non-verify nodes call the LLM; assemble the option dict from the
        // node's model policy, tools, and (optional) transcript.
        let mut options = BTreeMap::new();
        if let Some(provider) = &node.model_policy.provider {
            options.insert(
                "provider".to_string(),
                VmValue::String(Rc::from(provider.clone())),
            );
        }
        if let Some(model) = &node.model_policy.model {
            options.insert(
                "model".to_string(),
                VmValue::String(Rc::from(model.clone())),
            );
        }
        if let Some(model_tier) = &node.model_policy.model_tier {
            options.insert(
                "model_tier".to_string(),
                VmValue::String(Rc::from(model_tier.clone())),
            );
        }
        if let Some(temperature) = node.model_policy.temperature {
            options.insert("temperature".to_string(), VmValue::Float(temperature));
        }
        if let Some(max_tokens) = node.model_policy.max_tokens {
            options.insert("max_tokens".to_string(), VmValue::Int(max_tokens));
        }
        let tool_names = workflow_tool_names(&node.tools);
        // Only forward a tools payload when it is non-null and actually
        // yields at least one tool name.
        if !matches!(node.tools, serde_json::Value::Null) && !tool_names.is_empty() {
            options.insert(
                "tools".to_string(),
                crate::stdlib::json_to_vm_value(&node.tools),
            );
        }
        if let Some(transcript) = transcript.clone() {
            options.insert("transcript".to_string(), transcript);
        }

        // Positional LLM args: prompt, system prompt (Nil when absent), options.
        let args = vec![
            VmValue::String(Rc::from(prompt.clone())),
            node.system
                .clone()
                .map(|s| VmValue::String(Rc::from(s)))
                .unwrap_or(VmValue::Nil),
            VmValue::Dict(Rc::new(options)),
        ];
        let mut opts = extract_llm_options(&args)?;

        // Agent mode (explicit, or implied by having tools) runs the full
        // agent loop; otherwise a single LLM call is made.
        if node.mode.as_deref() == Some("agent") || !tool_names.is_empty() {
            crate::llm::run_agent_loop_internal(
                &mut opts,
                crate::llm::AgentLoopConfig {
                    persistent: true,
                    max_iterations: 12,
                    max_nudges: 3,
                    nudge: None,
                    done_sentinel: node.done_sentinel.clone(),
                    break_unless_phase: None,
                    tool_retries: 0,
                    tool_backoff_ms: 1000,
                    tool_format: tool_format.clone(),
                    auto_compact: None,
                    context_callback: None,
                    policy: None,
                    daemon: false,
                    llm_retries: 2,
                    llm_backoff_ms: 2000,
                },
            )
            .await?
        } else {
            let result = vm_call_llm_full(&opts).await?;
            crate::llm::agent_loop_result_from_llm(&result, opts)
        }
    };
    // Annotate the result payload (when it is a JSON object) with the inputs
    // that produced it, for auditability.
    if let Some(payload) = llm_result.as_object_mut() {
        payload.insert("prompt".to_string(), serde_json::json!(prompt));
        payload.insert(
            "system_prompt".to_string(),
            serde_json::json!(node.system.clone().unwrap_or_default()),
        );
        payload.insert(
            "rendered_context".to_string(),
            serde_json::json!(rendered_context),
        );
        payload.insert(
            "selected_artifact_ids".to_string(),
            serde_json::json!(selected
                .iter()
                .map(|artifact| artifact.id.clone())
                .collect::<Vec<_>>()),
        );
        payload.insert(
            "selected_artifact_titles".to_string(),
            serde_json::json!(selected
                .iter()
                .map(|artifact| artifact.title.clone())
                .collect::<Vec<_>>()),
        );
        payload.insert(
            "tool_calling_mode".to_string(),
            serde_json::json!(tool_format.clone()),
        );
    }

    let visible_text = llm_result["text"].as_str().unwrap_or_default().to_string();
    // The output transcript is shaped by the node's transcript policy
    // (compaction + visibility redaction).
    let transcript = llm_result
        .get("transcript")
        .cloned()
        .map(|value| crate::stdlib::json_to_vm_value(&value));
    let transcript = apply_output_transcript_policy(transcript, &node.transcript_policy);
    // Output kind: first declared kind, else a kind-appropriate default.
    let output_kind = node
        .output_contract
        .output_kinds
        .first()
        .cloned()
        .unwrap_or_else(|| {
            if node.kind == "verify" {
                "verification_result".to_string()
            } else {
                "artifact".to_string()
            }
        });
    let mut metadata = BTreeMap::new();
    metadata.insert(
        "input_artifact_ids".to_string(),
        serde_json::json!(selected
            .iter()
            .map(|artifact| artifact.id.clone())
            .collect::<Vec<_>>()),
    );
    metadata.insert("node_kind".to_string(), serde_json::json!(node.kind));
    // Wrap the stage result into a fresh artifact, with lineage pointing at
    // every consumed input artifact.
    let artifact = ArtifactRecord {
        type_name: "artifact".to_string(),
        id: new_id("artifact"),
        kind: output_kind,
        title: Some(format!("stage {node_id} output")),
        text: Some(visible_text),
        data: Some(llm_result.clone()),
        source: Some(node_id.to_string()),
        created_at: now_rfc3339(),
        freshness: Some("fresh".to_string()),
        priority: None,
        lineage: selected
            .iter()
            .map(|artifact| artifact.id.clone())
            .collect(),
        relevance: Some(1.0),
        estimated_tokens: None,
        stage: Some(node_id.to_string()),
        metadata,
    }
    .normalize();

    Ok((llm_result, vec![artifact], transcript))
}
3011
3012pub fn next_nodes_for(
3013    graph: &WorkflowGraph,
3014    current: &str,
3015    branch: Option<&str>,
3016) -> Vec<WorkflowEdge> {
3017    let mut matching: Vec<WorkflowEdge> = graph
3018        .edges
3019        .iter()
3020        .filter(|edge| edge.from == current && edge.branch.as_deref() == branch)
3021        .cloned()
3022        .collect();
3023    if matching.is_empty() {
3024        matching = graph
3025            .edges
3026            .iter()
3027            .filter(|edge| edge.from == current && edge.branch.is_none())
3028            .cloned()
3029            .collect();
3030    }
3031    matching
3032}
3033
3034pub fn next_node_for(graph: &WorkflowGraph, current: &str, branch: &str) -> Option<String> {
3035    next_nodes_for(graph, current, Some(branch))
3036        .into_iter()
3037        .next()
3038        .map(|edge| edge.to)
3039}
3040
3041pub fn append_audit_entry(
3042    graph: &mut WorkflowGraph,
3043    op: &str,
3044    node_id: Option<String>,
3045    reason: Option<String>,
3046    metadata: BTreeMap<String, serde_json::Value>,
3047) {
3048    graph.audit_log.push(WorkflowAuditEntry {
3049        id: new_id("audit"),
3050        op: op.to_string(),
3051        node_id,
3052        timestamp: now_rfc3339(),
3053        reason,
3054        metadata,
3055    });
3056}
3057
3058pub fn builtin_ceiling() -> CapabilityPolicy {
3059    CapabilityPolicy {
3060        // Runtime-owned ceiling is capability-based, not product-tool-based.
3061        // Integrators define concrete tool surfaces in workflow graphs / registries.
3062        tools: Vec::new(),
3063        capabilities: BTreeMap::from([
3064            (
3065                "workspace".to_string(),
3066                vec![
3067                    "read_text".to_string(),
3068                    "write_text".to_string(),
3069                    "apply_edit".to_string(),
3070                    "delete".to_string(),
3071                    "exists".to_string(),
3072                    "list".to_string(),
3073                ],
3074            ),
3075            ("process".to_string(), vec!["exec".to_string()]),
3076        ]),
3077        workspace_roots: Vec::new(),
3078        side_effect_level: Some("network".to_string()),
3079        recursion_limit: Some(8),
3080        tool_arg_constraints: Vec::new(),
3081        tool_metadata: BTreeMap::new(),
3082    }
3083}
3084
3085#[cfg(test)]
3086mod tests {
3087    use super::*;
3088
3089    #[test]
3090    fn capability_intersection_rejects_privilege_expansion() {
3091        let ceiling = CapabilityPolicy {
3092            tools: vec!["read".to_string()],
3093            side_effect_level: Some("read_only".to_string()),
3094            recursion_limit: Some(2),
3095            ..Default::default()
3096        };
3097        let requested = CapabilityPolicy {
3098            tools: vec!["read".to_string(), "edit".to_string()],
3099            ..Default::default()
3100        };
3101        let error = ceiling.intersect(&requested).unwrap_err();
3102        assert!(error.contains("host ceiling"));
3103    }
3104
3105    #[test]
3106    fn mutation_session_normalize_fills_defaults() {
3107        let normalized = MutationSessionRecord::default().normalize();
3108        assert!(normalized.session_id.starts_with("session_"));
3109        assert_eq!(normalized.mutation_scope, "read_only");
3110        assert_eq!(normalized.approval_mode, "host_enforced");
3111    }
3112
3113    #[test]
3114    fn install_current_mutation_session_round_trips() {
3115        install_current_mutation_session(Some(MutationSessionRecord {
3116            session_id: "session_test".to_string(),
3117            mutation_scope: "apply_workspace".to_string(),
3118            approval_mode: "explicit".to_string(),
3119            ..Default::default()
3120        }));
3121        let current = current_mutation_session().expect("session installed");
3122        assert_eq!(current.session_id, "session_test");
3123        assert_eq!(current.mutation_scope, "apply_workspace");
3124        assert_eq!(current.approval_mode, "explicit");
3125
3126        install_current_mutation_session(None);
3127        assert!(current_mutation_session().is_none());
3128    }
3129
3130    #[test]
3131    fn active_execution_policy_rejects_unknown_bridge_builtin() {
3132        push_execution_policy(CapabilityPolicy {
3133            tools: vec!["read".to_string()],
3134            capabilities: BTreeMap::from([(
3135                "workspace".to_string(),
3136                vec!["read_text".to_string()],
3137            )]),
3138            side_effect_level: Some("read_only".to_string()),
3139            recursion_limit: Some(1),
3140            ..Default::default()
3141        });
3142        let error = enforce_current_policy_for_bridge_builtin("custom_host_builtin").unwrap_err();
3143        pop_execution_policy();
3144        assert!(matches!(
3145            error,
3146            VmError::CategorizedError {
3147                category: crate::value::ErrorCategory::ToolRejected,
3148                ..
3149            }
3150        ));
3151    }
3152
3153    #[test]
3154    fn active_execution_policy_rejects_mcp_escape_hatch() {
3155        push_execution_policy(CapabilityPolicy {
3156            tools: vec!["read".to_string()],
3157            capabilities: BTreeMap::from([(
3158                "workspace".to_string(),
3159                vec!["read_text".to_string()],
3160            )]),
3161            side_effect_level: Some("read_only".to_string()),
3162            recursion_limit: Some(1),
3163            ..Default::default()
3164        });
3165        let error = enforce_current_policy_for_builtin("mcp_connect", &[]).unwrap_err();
3166        pop_execution_policy();
3167        assert!(matches!(
3168            error,
3169            VmError::CategorizedError {
3170                category: crate::value::ErrorCategory::ToolRejected,
3171                ..
3172            }
3173        ));
3174    }
3175
3176    #[test]
3177    fn workflow_normalization_upgrades_legacy_act_verify_repair_shape() {
3178        let value = crate::stdlib::json_to_vm_value(&serde_json::json!({
3179            "name": "legacy",
3180            "act": {"mode": "llm"},
3181            "verify": {"kind": "verify"},
3182            "repair": {"mode": "agent"},
3183        }));
3184        let graph = normalize_workflow_value(&value).unwrap();
3185        assert_eq!(graph.type_name, "workflow_graph");
3186        assert!(graph.nodes.contains_key("act"));
3187        assert!(graph.nodes.contains_key("verify"));
3188        assert!(graph.nodes.contains_key("repair"));
3189        assert_eq!(graph.entry, "act");
3190    }
3191
3192    #[test]
3193    fn workflow_normalization_accepts_tool_registry_nodes() {
3194        let value = crate::stdlib::json_to_vm_value(&serde_json::json!({
3195            "name": "registry_tools",
3196            "entry": "implement",
3197            "nodes": {
3198                "implement": {
3199                    "kind": "stage",
3200                    "mode": "agent",
3201                    "tools": {
3202                        "_type": "tool_registry",
3203                        "tools": [
3204                            {"name": "read", "description": "Read files"},
3205                            {"name": "run", "description": "Run commands"}
3206                        ]
3207                    }
3208                }
3209            },
3210            "edges": []
3211        }));
3212        let graph = normalize_workflow_value(&value).unwrap();
3213        let node = graph.nodes.get("implement").unwrap();
3214        assert_eq!(workflow_tool_names(&node.tools), vec!["read", "run"]);
3215    }
3216
3217    #[test]
3218    fn artifact_selection_honors_budget_and_priority() {
3219        let policy = ContextPolicy {
3220            max_artifacts: Some(2),
3221            max_tokens: Some(30),
3222            prefer_recent: true,
3223            prefer_fresh: true,
3224            prioritize_kinds: vec!["verification_result".to_string()],
3225            ..Default::default()
3226        };
3227        let artifacts = vec![
3228            ArtifactRecord {
3229                type_name: "artifact".to_string(),
3230                id: "a".to_string(),
3231                kind: "summary".to_string(),
3232                text: Some("short".to_string()),
3233                relevance: Some(0.9),
3234                created_at: now_rfc3339(),
3235                ..Default::default()
3236            }
3237            .normalize(),
3238            ArtifactRecord {
3239                type_name: "artifact".to_string(),
3240                id: "b".to_string(),
3241                kind: "summary".to_string(),
3242                text: Some("this is a much larger artifact body".to_string()),
3243                relevance: Some(1.0),
3244                created_at: now_rfc3339(),
3245                ..Default::default()
3246            }
3247            .normalize(),
3248            ArtifactRecord {
3249                type_name: "artifact".to_string(),
3250                id: "c".to_string(),
3251                kind: "summary".to_string(),
3252                text: Some("tiny".to_string()),
3253                relevance: Some(0.5),
3254                created_at: now_rfc3339(),
3255                ..Default::default()
3256            }
3257            .normalize(),
3258        ];
3259        let selected = select_artifacts(artifacts, &policy);
3260        assert_eq!(selected.len(), 2);
3261        assert!(selected.iter().all(|artifact| artifact.kind == "summary"));
3262    }
3263
3264    #[test]
3265    fn workflow_validation_rejects_condition_without_true_false_edges() {
3266        let graph = WorkflowGraph {
3267            entry: "gate".to_string(),
3268            nodes: BTreeMap::from([(
3269                "gate".to_string(),
3270                WorkflowNode {
3271                    id: Some("gate".to_string()),
3272                    kind: "condition".to_string(),
3273                    ..Default::default()
3274                },
3275            )]),
3276            edges: vec![WorkflowEdge {
3277                from: "gate".to_string(),
3278                to: "next".to_string(),
3279                branch: Some("true".to_string()),
3280                label: None,
3281            }],
3282            ..Default::default()
3283        };
3284        let report = validate_workflow(&graph, None);
3285        assert!(!report.valid);
3286        assert!(report
3287            .errors
3288            .iter()
3289            .any(|error| error.contains("true") && error.contains("false")));
3290    }
3291
    // Round-trip: a replay fixture derived from a run must evaluate cleanly
    // against that same run — overall pass, zero failures.
    #[test]
    fn replay_fixture_round_trip_passes() {
        // Minimal completed run: one stage ("act") producing one artifact.
        let run = RunRecord {
            type_name: "run_record".to_string(),
            id: "run_1".to_string(),
            workflow_id: "wf".to_string(),
            workflow_name: Some("demo".to_string()),
            task: "demo".to_string(),
            status: "completed".to_string(),
            started_at: "1".to_string(),
            finished_at: Some("2".to_string()),
            parent_run_id: None,
            root_run_id: Some("run_1".to_string()),
            stages: vec![RunStageRecord {
                id: "stage_1".to_string(),
                node_id: "act".to_string(),
                kind: "stage".to_string(),
                status: "completed".to_string(),
                outcome: "success".to_string(),
                branch: Some("success".to_string()),
                started_at: "1".to_string(),
                finished_at: Some("2".to_string()),
                visible_text: Some("done".to_string()),
                private_reasoning: None,
                transcript: None,
                verification: None,
                usage: None,
                artifacts: vec![ArtifactRecord {
                    type_name: "artifact".to_string(),
                    id: "a1".to_string(),
                    kind: "summary".to_string(),
                    text: Some("done".to_string()),
                    created_at: "1".to_string(),
                    ..Default::default()
                }
                .normalize()],
                consumed_artifact_ids: vec![],
                produced_artifact_ids: vec!["a1".to_string()],
                attempts: vec![],
                metadata: BTreeMap::new(),
            }],
            transitions: vec![],
            checkpoints: vec![],
            pending_nodes: vec![],
            completed_nodes: vec!["act".to_string()],
            child_runs: vec![],
            artifacts: vec![],
            policy: CapabilityPolicy::default(),
            execution: None,
            transcript: None,
            usage: None,
            replay_fixture: None,
            trace_spans: vec![],
            metadata: BTreeMap::new(),
            persisted_path: None,
        };
        let fixture = replay_fixture_from_run(&run);
        let report = evaluate_run_against_fixture(&run, &fixture);
        assert!(report.pass);
        assert!(report.failures.is_empty());
    }
3353
3354    #[test]
3355    fn replay_eval_suite_reports_failed_case() {
3356        let good = RunRecord {
3357            id: "run_good".to_string(),
3358            workflow_id: "wf".to_string(),
3359            status: "completed".to_string(),
3360            stages: vec![RunStageRecord {
3361                node_id: "act".to_string(),
3362                status: "completed".to_string(),
3363                outcome: "success".to_string(),
3364                ..Default::default()
3365            }],
3366            ..Default::default()
3367        };
3368        let bad = RunRecord {
3369            id: "run_bad".to_string(),
3370            workflow_id: "wf".to_string(),
3371            status: "failed".to_string(),
3372            stages: vec![RunStageRecord {
3373                node_id: "act".to_string(),
3374                status: "failed".to_string(),
3375                outcome: "error".to_string(),
3376                ..Default::default()
3377            }],
3378            ..Default::default()
3379        };
3380        let suite = evaluate_run_suite(vec![
3381            (
3382                good.clone(),
3383                replay_fixture_from_run(&good),
3384                Some("good.json".to_string()),
3385            ),
3386            (
3387                bad.clone(),
3388                replay_fixture_from_run(&good),
3389                Some("bad.json".to_string()),
3390            ),
3391        ]);
3392        assert!(!suite.pass);
3393        assert_eq!(suite.total, 2);
3394        assert_eq!(suite.failed, 1);
3395        assert!(suite.cases.iter().any(|case| !case.pass));
3396    }
3397
3398    #[test]
3399    fn run_diff_reports_changed_stage() {
3400        let left = RunRecord {
3401            id: "left".to_string(),
3402            workflow_id: "wf".to_string(),
3403            status: "completed".to_string(),
3404            stages: vec![RunStageRecord {
3405                node_id: "act".to_string(),
3406                status: "completed".to_string(),
3407                outcome: "success".to_string(),
3408                ..Default::default()
3409            }],
3410            ..Default::default()
3411        };
3412        let right = RunRecord {
3413            id: "right".to_string(),
3414            workflow_id: "wf".to_string(),
3415            status: "failed".to_string(),
3416            stages: vec![RunStageRecord {
3417                node_id: "act".to_string(),
3418                status: "failed".to_string(),
3419                outcome: "error".to_string(),
3420                ..Default::default()
3421            }],
3422            ..Default::default()
3423        };
3424        let diff = diff_run_records(&left, &right);
3425        assert!(diff.status_changed);
3426        assert!(!diff.identical);
3427        assert_eq!(diff.stage_diffs.len(), 1);
3428    }
3429
3430    #[test]
3431    fn eval_suite_manifest_can_fail_on_baseline_diff() {
3432        let temp_dir =
3433            std::env::temp_dir().join(format!("harn-eval-suite-{}", uuid::Uuid::now_v7()));
3434        std::fs::create_dir_all(&temp_dir).unwrap();
3435        let baseline_path = temp_dir.join("baseline.json");
3436        let candidate_path = temp_dir.join("candidate.json");
3437
3438        let baseline = RunRecord {
3439            id: "baseline".to_string(),
3440            workflow_id: "wf".to_string(),
3441            status: "completed".to_string(),
3442            stages: vec![RunStageRecord {
3443                node_id: "act".to_string(),
3444                status: "completed".to_string(),
3445                outcome: "success".to_string(),
3446                ..Default::default()
3447            }],
3448            ..Default::default()
3449        };
3450        let candidate = RunRecord {
3451            id: "candidate".to_string(),
3452            workflow_id: "wf".to_string(),
3453            status: "failed".to_string(),
3454            stages: vec![RunStageRecord {
3455                node_id: "act".to_string(),
3456                status: "failed".to_string(),
3457                outcome: "error".to_string(),
3458                ..Default::default()
3459            }],
3460            ..Default::default()
3461        };
3462
3463        save_run_record(&baseline, Some(baseline_path.to_str().unwrap())).unwrap();
3464        save_run_record(&candidate, Some(candidate_path.to_str().unwrap())).unwrap();
3465
3466        let manifest = EvalSuiteManifest {
3467            base_dir: Some(temp_dir.display().to_string()),
3468            cases: vec![EvalSuiteCase {
3469                label: Some("candidate".to_string()),
3470                run_path: "candidate.json".to_string(),
3471                fixture_path: None,
3472                compare_to: Some("baseline.json".to_string()),
3473            }],
3474            ..Default::default()
3475        };
3476        let suite = evaluate_run_suite_manifest(&manifest).unwrap();
3477        assert!(!suite.pass);
3478        assert_eq!(suite.failed, 1);
3479        assert!(suite.cases[0].comparison.is_some());
3480        assert!(suite.cases[0]
3481            .failures
3482            .iter()
3483            .any(|failure| failure.contains("baseline")));
3484    }
3485
3486    #[test]
3487    fn render_unified_diff_marks_removed_and_added_lines() {
3488        let diff = render_unified_diff(Some("src/main.rs"), "old\nsame", "new\nsame");
3489        assert!(diff.contains("--- a/src/main.rs"));
3490        assert!(diff.contains("+++ b/src/main.rs"));
3491        assert!(diff.contains("-old"));
3492        assert!(diff.contains("+new"));
3493        assert!(diff.contains(" same"));
3494    }
3495
3496    #[test]
3497    fn execution_policy_rejects_process_exec_when_read_only() {
3498        push_execution_policy(CapabilityPolicy {
3499            side_effect_level: Some("read_only".to_string()),
3500            capabilities: BTreeMap::from([("process".to_string(), vec!["exec".to_string()])]),
3501            ..Default::default()
3502        });
3503        let result = enforce_current_policy_for_builtin("exec", &[]);
3504        pop_execution_policy();
3505        assert!(result.is_err());
3506    }
3507
3508    #[test]
3509    fn execution_policy_rejects_unlisted_tool() {
3510        push_execution_policy(CapabilityPolicy {
3511            tools: vec!["read".to_string()],
3512            ..Default::default()
3513        });
3514        let result = enforce_current_policy_for_tool("edit");
3515        pop_execution_policy();
3516        assert!(result.is_err());
3517    }
3518
3519    #[test]
3520    fn normalize_run_record_preserves_trace_spans() {
3521        let value = crate::stdlib::json_to_vm_value(&serde_json::json!({
3522            "_type": "run_record",
3523            "id": "run_trace",
3524            "workflow_id": "wf",
3525            "status": "completed",
3526            "started_at": "1",
3527            "trace_spans": [
3528                {
3529                    "span_id": 1,
3530                    "parent_id": null,
3531                    "kind": "pipeline",
3532                    "name": "workflow",
3533                    "start_ms": 0,
3534                    "duration_ms": 42,
3535                    "metadata": {"model": "demo"}
3536                }
3537            ]
3538        }));
3539
3540        let run = normalize_run_record(&value).unwrap();
3541        assert_eq!(run.trace_spans.len(), 1);
3542        assert_eq!(run.trace_spans[0].kind, "pipeline");
3543        assert_eq!(
3544            run.trace_spans[0].metadata["model"],
3545            serde_json::json!("demo")
3546        );
3547    }
3548
3549    // ── Tool hook tests ──────────────────────────────────────────────
3550
3551    #[test]
3552    fn pre_tool_hook_deny_blocks_execution() {
3553        clear_tool_hooks();
3554        register_tool_hook(ToolHook {
3555            pattern: "dangerous_*".to_string(),
3556            pre: Some(Rc::new(|_name, _args| {
3557                PreToolAction::Deny("blocked by policy".to_string())
3558            })),
3559            post: None,
3560        });
3561        let result = run_pre_tool_hooks("dangerous_delete", &serde_json::json!({}));
3562        clear_tool_hooks();
3563        assert!(matches!(result, PreToolAction::Deny(_)));
3564    }
3565
3566    #[test]
3567    fn pre_tool_hook_allow_passes_through() {
3568        clear_tool_hooks();
3569        register_tool_hook(ToolHook {
3570            pattern: "safe_*".to_string(),
3571            pre: Some(Rc::new(|_name, _args| PreToolAction::Allow)),
3572            post: None,
3573        });
3574        let result = run_pre_tool_hooks("safe_read", &serde_json::json!({}));
3575        clear_tool_hooks();
3576        assert!(matches!(result, PreToolAction::Allow));
3577    }
3578
3579    #[test]
3580    fn pre_tool_hook_modify_rewrites_args() {
3581        clear_tool_hooks();
3582        register_tool_hook(ToolHook {
3583            pattern: "*".to_string(),
3584            pre: Some(Rc::new(|_name, _args| {
3585                PreToolAction::Modify(serde_json::json!({"path": "/sanitized"}))
3586            })),
3587            post: None,
3588        });
3589        let result = run_pre_tool_hooks("read_file", &serde_json::json!({"path": "/etc/passwd"}));
3590        clear_tool_hooks();
3591        match result {
3592            PreToolAction::Modify(args) => assert_eq!(args["path"], "/sanitized"),
3593            _ => panic!("expected Modify"),
3594        }
3595    }
3596
3597    #[test]
3598    fn post_tool_hook_modifies_result() {
3599        clear_tool_hooks();
3600        register_tool_hook(ToolHook {
3601            pattern: "exec".to_string(),
3602            pre: None,
3603            post: Some(Rc::new(|_name, result| {
3604                if result.contains("SECRET") {
3605                    PostToolAction::Modify("[REDACTED]".to_string())
3606                } else {
3607                    PostToolAction::Pass
3608                }
3609            })),
3610        });
3611        let result = run_post_tool_hooks("exec", "output with SECRET data");
3612        let clean = run_post_tool_hooks("exec", "clean output");
3613        clear_tool_hooks();
3614        assert_eq!(result, "[REDACTED]");
3615        assert_eq!(clean, "clean output");
3616    }
3617
3618    #[test]
3619    fn unmatched_hook_pattern_does_not_fire() {
3620        clear_tool_hooks();
3621        register_tool_hook(ToolHook {
3622            pattern: "exec".to_string(),
3623            pre: Some(Rc::new(|_name, _args| {
3624                PreToolAction::Deny("should not match".to_string())
3625            })),
3626            post: None,
3627        });
3628        let result = run_pre_tool_hooks("read_file", &serde_json::json!({}));
3629        clear_tool_hooks();
3630        assert!(matches!(result, PreToolAction::Allow));
3631    }
3632
3633    #[test]
3634    fn glob_match_patterns() {
3635        assert!(glob_match("*", "anything"));
3636        assert!(glob_match("exec*", "exec_at"));
3637        assert!(glob_match("*_file", "read_file"));
3638        assert!(!glob_match("exec*", "read_file"));
3639        assert!(glob_match("read_file", "read_file"));
3640        assert!(!glob_match("read_file", "write_file"));
3641    }
3642
3643    // ── Auto-compaction tests ────────────────────────────────────────
3644
3645    #[test]
3646    fn microcompact_snips_large_output() {
3647        let large = "x".repeat(50_000);
3648        let result = microcompact_tool_output(&large, 10_000);
3649        assert!(result.len() < 15_000);
3650        assert!(result.contains("snipped"));
3651    }
3652
3653    #[test]
3654    fn microcompact_preserves_small_output() {
3655        let small = "hello world";
3656        let result = microcompact_tool_output(small, 10_000);
3657        assert_eq!(result, small);
3658    }
3659
3660    #[test]
3661    fn auto_compact_messages_reduces_count() {
3662        let mut messages: Vec<serde_json::Value> = (0..20)
3663            .map(|i| serde_json::json!({"role": "user", "content": format!("message {i}")}))
3664            .collect();
3665        let runtime = tokio::runtime::Builder::new_current_thread()
3666            .enable_all()
3667            .build()
3668            .unwrap();
3669        let compacted = runtime.block_on(auto_compact_messages(
3670            &mut messages,
3671            &AutoCompactConfig {
3672                compact_strategy: CompactStrategy::Truncate,
3673                keep_last: 6,
3674                ..Default::default()
3675            },
3676            None,
3677        ));
3678        let summary = compacted.unwrap();
3679        assert!(summary.is_some());
3680        assert!(messages.len() <= 7); // 6 kept + 1 summary
3681        assert!(messages[0]["content"]
3682            .as_str()
3683            .unwrap()
3684            .contains("auto-compacted"));
3685    }
3686
3687    #[test]
3688    fn auto_compact_noop_when_under_threshold() {
3689        let mut messages: Vec<serde_json::Value> = (0..4)
3690            .map(|i| serde_json::json!({"role": "user", "content": format!("msg {i}")}))
3691            .collect();
3692        let runtime = tokio::runtime::Builder::new_current_thread()
3693            .enable_all()
3694            .build()
3695            .unwrap();
3696        let compacted = runtime.block_on(auto_compact_messages(
3697            &mut messages,
3698            &AutoCompactConfig {
3699                compact_strategy: CompactStrategy::Truncate,
3700                keep_last: 6,
3701                ..Default::default()
3702            },
3703            None,
3704        ));
3705        assert!(compacted.unwrap().is_none());
3706        assert_eq!(messages.len(), 4);
3707    }
3708
3709    #[test]
3710    fn estimate_message_tokens_basic() {
3711        let messages = vec![
3712            serde_json::json!({"role": "user", "content": "a".repeat(400)}),
3713            serde_json::json!({"role": "assistant", "content": "b".repeat(400)}),
3714        ];
3715        let tokens = estimate_message_tokens(&messages);
3716        assert_eq!(tokens, 200); // 800 chars / 4
3717    }
3718
3719    // ── Artifact dedup and microcompaction tests ─────────────────────
3720
3721    #[test]
3722    fn dedup_artifacts_removes_duplicates() {
3723        let mut artifacts = vec![
3724            ArtifactRecord {
3725                id: "a1".to_string(),
3726                kind: "test".to_string(),
3727                text: Some("duplicate content".to_string()),
3728                ..Default::default()
3729            },
3730            ArtifactRecord {
3731                id: "a2".to_string(),
3732                kind: "test".to_string(),
3733                text: Some("duplicate content".to_string()),
3734                ..Default::default()
3735            },
3736            ArtifactRecord {
3737                id: "a3".to_string(),
3738                kind: "test".to_string(),
3739                text: Some("unique content".to_string()),
3740                ..Default::default()
3741            },
3742        ];
3743        dedup_artifacts(&mut artifacts);
3744        assert_eq!(artifacts.len(), 2);
3745    }
3746
3747    #[test]
3748    fn microcompact_artifact_snips_oversized() {
3749        let mut artifact = ArtifactRecord {
3750            id: "a1".to_string(),
3751            kind: "test".to_string(),
3752            text: Some("x".repeat(10_000)),
3753            estimated_tokens: Some(2_500),
3754            ..Default::default()
3755        };
3756        microcompact_artifact(&mut artifact, 500);
3757        assert!(artifact.text.as_ref().unwrap().len() < 5_000);
3758        assert_eq!(artifact.estimated_tokens, Some(500));
3759    }
3760
3761    // ── Tool argument constraint tests ───────────────────────────────
3762
3763    #[test]
3764    fn arg_constraint_allows_matching_pattern() {
3765        let policy = CapabilityPolicy {
3766            tool_arg_constraints: vec![ToolArgConstraint {
3767                tool: "exec".to_string(),
3768                arg_patterns: vec!["cargo *".to_string()],
3769            }],
3770            ..Default::default()
3771        };
3772        let result = enforce_tool_arg_constraints(
3773            &policy,
3774            "exec",
3775            &serde_json::json!({"command": "cargo test"}),
3776        );
3777        assert!(result.is_ok());
3778    }
3779
3780    #[test]
3781    fn arg_constraint_rejects_non_matching_pattern() {
3782        let policy = CapabilityPolicy {
3783            tool_arg_constraints: vec![ToolArgConstraint {
3784                tool: "exec".to_string(),
3785                arg_patterns: vec!["cargo *".to_string()],
3786            }],
3787            ..Default::default()
3788        };
3789        let result = enforce_tool_arg_constraints(
3790            &policy,
3791            "exec",
3792            &serde_json::json!({"command": "rm -rf /"}),
3793        );
3794        assert!(result.is_err());
3795    }
3796
3797    #[test]
3798    fn arg_constraint_ignores_unmatched_tool() {
3799        let policy = CapabilityPolicy {
3800            tool_arg_constraints: vec![ToolArgConstraint {
3801                tool: "exec".to_string(),
3802                arg_patterns: vec!["cargo *".to_string()],
3803            }],
3804            ..Default::default()
3805        };
3806        let result = enforce_tool_arg_constraints(
3807            &policy,
3808            "read_file",
3809            &serde_json::json!({"path": "/etc/passwd"}),
3810        );
3811        assert!(result.is_ok());
3812    }
3813
3814    #[test]
3815    fn microcompact_handles_multibyte_utf8() {
3816        // Emoji are 4 bytes each — slicing at arbitrary byte offsets would panic
3817        let emoji_output = "🔥".repeat(500); // 2000 bytes, 500 chars
3818        let result = microcompact_tool_output(&emoji_output, 400);
3819        // Should not panic and should contain the snip marker
3820        assert!(result.contains("snipped"));
3821
3822        // Mixed ASCII + multi-byte
3823        let mixed = format!("{}{}{}", "a".repeat(300), "é".repeat(500), "b".repeat(300));
3824        let result2 = microcompact_tool_output(&mixed, 400);
3825        assert!(result2.contains("snipped"));
3826
3827        // CJK characters (3 bytes each)
3828        let cjk = "中文".repeat(500);
3829        let result3 = microcompact_tool_output(&cjk, 400);
3830        assert!(result3.contains("snipped"));
3831    }
3832}