//! harn_vm/orchestration.rs — workflow orchestration: tool lifecycle hooks,
//! transcript auto-compaction, capability policies, and run/replay records.
use std::cell::RefCell;
use std::collections::{BTreeMap, BTreeSet};
use std::path::{Path, PathBuf};
use std::rc::Rc;
use std::thread_local;

use serde::{Deserialize, Serialize};

use crate::llm::{extract_llm_options, vm_call_llm_full, vm_value_to_json};
use crate::value::{VmError, VmValue};
10
/// Current wall-clock time as an RFC 3339 UTC timestamp, e.g.
/// `1970-01-01T00:00:00Z`.
///
/// The previous implementation returned raw epoch seconds despite the name;
/// this formats a real RFC 3339 string. Clocks before 1970 fall back to the
/// epoch (`duration_since` error → zero duration).
fn now_rfc3339() -> String {
    use std::time::{SystemTime, UNIX_EPOCH};
    let secs = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .unwrap_or_default()
        .as_secs() as i64;
    unix_secs_to_rfc3339(secs)
}

/// Convert seconds since the Unix epoch to an RFC 3339 UTC string.
/// Date math follows Howard Hinnant's `civil_from_days` algorithm.
fn unix_secs_to_rfc3339(secs: i64) -> String {
    let days = secs.div_euclid(86_400);
    let secs_of_day = secs.rem_euclid(86_400);
    let (hh, mm, ss) = (secs_of_day / 3_600, (secs_of_day % 3_600) / 60, secs_of_day % 60);
    // Shift epoch from 1970-01-01 to 0000-03-01 so leap days land at year end.
    let z = days + 719_468;
    let era = z.div_euclid(146_097);
    let doe = z.rem_euclid(146_097); // day-of-era, [0, 146096]
    let yoe = (doe - doe / 1_460 + doe / 36_524 - doe / 146_096) / 365; // year-of-era
    let doy = doe - (365 * yoe + yoe / 4 - yoe / 100); // day-of-year, March-based
    let mp = (5 * doy + 2) / 153; // month index, March = 0
    let day = doy - (153 * mp + 2) / 5 + 1;
    let month = if mp < 10 { mp + 3 } else { mp - 9 };
    let year = yoe + era * 400 + i64::from(month <= 2);
    format!("{year:04}-{month:02}-{day:02}T{hh:02}:{mm:02}:{ss:02}Z")
}
19
20fn new_id(prefix: &str) -> String {
21    format!("{prefix}_{}", uuid::Uuid::now_v7())
22}
23
/// Directory where run records are persisted: `$HARN_RUN_DIR` when set,
/// otherwise `.harn-runs` relative to the current working directory.
fn default_run_dir() -> PathBuf {
    match std::env::var("HARN_RUN_DIR") {
        Ok(dir) => PathBuf::from(dir),
        Err(_) => PathBuf::from(".harn-runs"),
    }
}
29
thread_local! {
    // Stack of capability policies for the current execution context.
    // NOTE(review): push/pop sites are elsewhere in this module — confirm usage.
    static EXECUTION_POLICY_STACK: RefCell<Vec<CapabilityPolicy>> = const { RefCell::new(Vec::new()) };
    // Tool lifecycle hooks; run_{pre,post}_tool_hooks iterate these in
    // registration order. Thread-local because hooks are `Rc` (not Send).
    static TOOL_HOOKS: RefCell<Vec<ToolHook>> = const { RefCell::new(Vec::new()) };
}
34
35// ── Tool lifecycle hooks ──────────────────────────────────────────────
36
/// Action returned by a PreToolUse hook.
#[derive(Clone, Debug)]
pub enum PreToolAction {
    /// Allow the tool call to proceed unchanged.
    Allow,
    /// Deny the tool call with an explanation.
    Deny(String),
    /// Allow but replace the arguments with the carried JSON value.
    Modify(serde_json::Value),
}
47
/// Action returned by a PostToolUse hook.
#[derive(Clone, Debug)]
pub enum PostToolAction {
    /// Pass the result through unchanged.
    Pass,
    /// Replace the result text with the carried string.
    Modify(String),
}
56
/// Callback types for tool lifecycle hooks.
/// Pre hooks receive `(tool_name, args)`; post hooks receive
/// `(tool_name, result_text)`. `Rc` makes these single-threaded by design.
pub type PreToolHookFn = Rc<dyn Fn(&str, &serde_json::Value) -> PreToolAction>;
pub type PostToolHookFn = Rc<dyn Fn(&str, &str) -> PostToolAction>;
60
/// A registered tool hook with a name pattern and callbacks.
///
/// Callbacks are `Rc`, so a `ToolHook` is not `Send`; the registry lives in
/// the thread-local `TOOL_HOOKS`.
#[derive(Clone)]
pub struct ToolHook {
    /// Glob-style pattern matched against tool names (e.g. `"*"`, `"exec*"`, `"read_file"`).
    pub pattern: String,
    /// Called before tool execution. Return `Deny` to reject, `Modify` to rewrite args.
    pub pre: Option<PreToolHookFn>,
    /// Called after tool execution with the result text. Return `Modify` to rewrite.
    pub post: Option<PostToolHookFn>,
}
71
72impl std::fmt::Debug for ToolHook {
73    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
74        f.debug_struct("ToolHook")
75            .field("pattern", &self.pattern)
76            .field("has_pre", &self.pre.is_some())
77            .field("has_post", &self.post.is_some())
78            .finish()
79    }
80}
81
/// Glob-style match where `*` matches any (possibly empty) substring, at any
/// position in the pattern (e.g. `"*"`, `"exec*"`, `"*file"`, `"*foo*"`,
/// `"read_*_v2"`).
///
/// The previous implementation only supported a single leading OR trailing
/// `*`, so `"*foo*"` was wrongly treated as the prefix `"*foo"`. Matching is
/// done on bytes: literal bytes compare exactly and `*` tries every split
/// offset, which is sound for UTF-8 input.
fn glob_match(pattern: &str, name: &str) -> bool {
    // Simple backtracking matcher; patterns here are short, so the
    // worst-case exponential behavior of nested `*`s is not a concern.
    fn walk(p: &[u8], n: &[u8]) -> bool {
        match p.split_first() {
            None => n.is_empty(),
            // `*` consumes zero or more bytes of the name.
            Some((b'*', rest)) => (0..=n.len()).any(|i| walk(rest, &n[i..])),
            Some((&c, rest)) => match n.split_first() {
                Some((&d, n_rest)) => c == d && walk(rest, n_rest),
                None => false,
            },
        }
    }
    walk(pattern.as_bytes(), name.as_bytes())
}
94
95pub fn register_tool_hook(hook: ToolHook) {
96    TOOL_HOOKS.with(|hooks| hooks.borrow_mut().push(hook));
97}
98
99pub fn clear_tool_hooks() {
100    TOOL_HOOKS.with(|hooks| hooks.borrow_mut().clear());
101}
102
103/// Run all matching PreToolUse hooks. Returns the final action.
104pub fn run_pre_tool_hooks(tool_name: &str, args: &serde_json::Value) -> PreToolAction {
105    TOOL_HOOKS.with(|hooks| {
106        let hooks = hooks.borrow();
107        let mut current_args = args.clone();
108        for hook in hooks.iter() {
109            if !glob_match(&hook.pattern, tool_name) {
110                continue;
111            }
112            if let Some(ref pre) = hook.pre {
113                match pre(tool_name, &current_args) {
114                    PreToolAction::Allow => {}
115                    PreToolAction::Deny(reason) => return PreToolAction::Deny(reason),
116                    PreToolAction::Modify(new_args) => {
117                        current_args = new_args;
118                    }
119                }
120            }
121        }
122        if current_args != *args {
123            PreToolAction::Modify(current_args)
124        } else {
125            PreToolAction::Allow
126        }
127    })
128}
129
130/// Run all matching PostToolUse hooks. Returns the (possibly modified) result.
131pub fn run_post_tool_hooks(tool_name: &str, result: &str) -> String {
132    TOOL_HOOKS.with(|hooks| {
133        let hooks = hooks.borrow();
134        let mut current = result.to_string();
135        for hook in hooks.iter() {
136            if !glob_match(&hook.pattern, tool_name) {
137                continue;
138            }
139            if let Some(ref post) = hook.post {
140                match post(tool_name, &current) {
141                    PostToolAction::Pass => {}
142                    PostToolAction::Modify(new_result) => {
143                        current = new_result;
144                    }
145                }
146            }
147        }
148        current
149    })
150}
151
152// ── Auto-compaction ───────────────────────────────────────────────────
153
/// Configuration for automatic transcript compaction in agent loops.
/// Defaults are provided via the `Default` impl below.
#[derive(Clone, Debug)]
pub struct AutoCompactConfig {
    /// Maximum estimated tokens before triggering auto-compaction.
    pub token_threshold: usize,
    /// Maximum character length for a single tool result before microcompaction.
    pub tool_output_max_chars: usize,
    /// Number of recent messages to keep during auto-compaction.
    pub keep_last: usize,
}
164
165impl Default for AutoCompactConfig {
166    fn default() -> Self {
167        Self {
168            token_threshold: 80_000,
169            tool_output_max_chars: 20_000,
170            keep_last: 8,
171        }
172    }
173}
174
175/// Estimate token count from a list of JSON messages (chars / 4 heuristic).
176pub fn estimate_message_tokens(messages: &[serde_json::Value]) -> usize {
177    messages
178        .iter()
179        .map(|m| {
180            m.get("content")
181                .and_then(|c| c.as_str())
182                .map(|s| s.len())
183                .unwrap_or(0)
184        })
185        .sum::<usize>()
186        / 4
187}
188
/// Microcompact a tool result: if it exceeds `max_chars` (bytes), keep the
/// first and last portions with a snip marker in between. Outputs at or
/// under the limit, or limits below 200, are returned unchanged.
///
/// Fix: the head/tail split points are snapped to `char` boundaries so that
/// slicing can never panic in the middle of a multi-byte UTF-8 sequence,
/// and the reported snipped count is exact.
pub fn microcompact_tool_output(output: &str, max_chars: usize) -> String {
    if output.len() <= max_chars || max_chars < 200 {
        return output.to_string();
    }
    let keep = max_chars / 2;
    // Walk the head end down and the tail start up to the nearest char
    // boundary; since len > max_chars >= 2*keep the two never overlap.
    let mut head_end = keep;
    while !output.is_char_boundary(head_end) {
        head_end -= 1;
    }
    let mut tail_start = output.len() - keep;
    while !output.is_char_boundary(tail_start) {
        tail_start += 1;
    }
    let head = &output[..head_end];
    let tail = &output[tail_start..];
    // Exact number of bytes removed between head and tail.
    let snipped = tail_start - head_end;
    format!("{head}\n\n[... {snipped} characters snipped ...]\n\n{tail}")
}
201
202/// Auto-compact a message list in place: summarize older messages into a
203/// system note and keep the most recent `keep_last` messages.
204pub fn auto_compact_messages(messages: &mut Vec<serde_json::Value>, keep_last: usize) -> bool {
205    if messages.len() <= keep_last {
206        return false;
207    }
208    let split_at = messages.len().saturating_sub(keep_last);
209    let old_messages: Vec<_> = messages.drain(..split_at).collect();
210    let archived_count = old_messages.len();
211    // Build a useful summary: keep tool names, key decisions, and truncate
212    // long content. Use 500 chars per entry (not 200) for better context.
213    let summary_parts: Vec<String> = old_messages
214        .iter()
215        .filter_map(|m| {
216            let role = m.get("role")?.as_str()?;
217            let content = m.get("content")?.as_str()?;
218            if content.is_empty() {
219                return None;
220            }
221            let truncated = if content.len() > 500 {
222                format!("{}...", &content[..500])
223            } else {
224                content.to_string()
225            };
226            Some(format!("[{role}] {truncated}"))
227        })
228        .take(15)
229        .collect();
230    let summary = format!(
231        "[auto-compacted {archived_count} older messages]\n{}{}",
232        summary_parts.join("\n"),
233        if archived_count > 15 {
234            format!("\n... and {} more", archived_count - 15)
235        } else {
236            String::new()
237        }
238    );
239    messages.insert(
240        0,
241        serde_json::json!({
242            "role": "user",
243            "content": summary,
244        }),
245    );
246    true
247}
248
249// ── Adaptive context assembly ─────────────────────────────────────────
250
251/// Snip an artifact's text to fit within a token budget.
252pub fn microcompact_artifact(artifact: &mut ArtifactRecord, max_tokens: usize) {
253    let max_chars = max_tokens * 4;
254    if let Some(ref text) = artifact.text {
255        if text.len() > max_chars && max_chars >= 200 {
256            artifact.text = Some(microcompact_tool_output(text, max_chars));
257            artifact.estimated_tokens = Some(max_tokens);
258        }
259    }
260}
261
262/// Deduplicate artifacts by removing those with identical text content,
263/// keeping the one with higher priority.
264pub fn dedup_artifacts(artifacts: &mut Vec<ArtifactRecord>) {
265    let mut seen_hashes: BTreeSet<u64> = BTreeSet::new();
266    artifacts.retain(|artifact| {
267        let text = artifact.text.as_deref().unwrap_or("");
268        if text.is_empty() {
269            return true;
270        }
271        // Simple hash for dedup
272        let hash = {
273            use std::hash::{Hash, Hasher};
274            let mut hasher = std::collections::hash_map::DefaultHasher::new();
275            text.hash(&mut hasher);
276            hasher.finish()
277        };
278        seen_hashes.insert(hash)
279    });
280}
281
282/// Enhanced artifact selection: dedup, microcompact oversized artifacts,
283/// then delegate to the standard `select_artifacts`.
284pub fn select_artifacts_adaptive(
285    mut artifacts: Vec<ArtifactRecord>,
286    policy: &ContextPolicy,
287) -> Vec<ArtifactRecord> {
288    // Phase 1: deduplicate
289    dedup_artifacts(&mut artifacts);
290
291    // Phase 2: microcompact oversized artifacts relative to budget.
292    // Cap individual artifacts to a fraction of the total budget, but don't
293    // let the per-artifact cap exceed the total budget (avoid overrun).
294    if let Some(max_tokens) = policy.max_tokens {
295        let count = artifacts.len().max(1);
296        let per_artifact_budget = max_tokens / count;
297        // Floor of 500 tokens, but never more than total budget
298        let cap = per_artifact_budget.max(500).min(max_tokens);
299        for artifact in &mut artifacts {
300            let est = artifact.estimated_tokens.unwrap_or(0);
301            if est > cap * 2 {
302                microcompact_artifact(artifact, cap);
303            }
304        }
305    }
306
307    // Phase 3: standard selection with budget
308    select_artifacts(artifacts, policy)
309}
310
311// ── Per-agent policy with argument patterns ───────────────────────────
312
/// Extended policy that supports argument-level constraints.
/// Enforced by `enforce_tool_arg_constraints` against the first string-like
/// argument of a matching tool call.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ToolArgConstraint {
    /// Tool name (or glob pattern) to constrain.
    pub tool: String,
    /// Glob patterns that the first string argument must match.
    /// If empty, no argument constraint is applied.
    pub arg_patterns: Vec<String>,
}
323
324/// Check if a tool call satisfies argument constraints in the policy.
325pub fn enforce_tool_arg_constraints(
326    policy: &CapabilityPolicy,
327    tool_name: &str,
328    args: &serde_json::Value,
329) -> Result<(), VmError> {
330    for constraint in &policy.tool_arg_constraints {
331        if !glob_match(&constraint.tool, tool_name) {
332            continue;
333        }
334        if constraint.arg_patterns.is_empty() {
335            continue;
336        }
337        // Extract the first string-like argument for pattern matching
338        let first_arg = args
339            .as_object()
340            .and_then(|o| o.values().next())
341            .and_then(|v| v.as_str())
342            .or_else(|| args.as_str())
343            .unwrap_or("");
344        let matches = constraint
345            .arg_patterns
346            .iter()
347            .any(|pattern| glob_match(pattern, first_arg));
348        if !matches {
349            return reject_policy(format!(
350                "tool '{tool_name}' argument '{first_arg}' does not match allowed patterns: {:?}",
351                constraint.arg_patterns
352            ));
353        }
354    }
355    Ok(())
356}
357
/// Map loosely-specified artifact kind strings onto the canonical set.
///
/// Canonical kinds pass through unchanged; a few shorthand aliases are
/// expanded; blank/whitespace-only kinds become `"artifact"`; anything else
/// is kept verbatim.
fn normalize_artifact_kind(kind: &str) -> String {
    const CANONICAL: &[&str] = &[
        "resource",
        "workspace_file",
        "editor_selection",
        "workspace_snapshot",
        "transcript_summary",
        "summary",
        "plan",
        "diff",
        "git_diff",
        "patch",
        "patch_set",
        "patch_proposal",
        "diff_review",
        "review_decision",
        "verification_bundle",
        "apply_intent",
        "verification_result",
        "test_result",
        "command_result",
        "provider_payload",
        "worker_result",
        "worker_notification",
        "artifact",
    ];
    if CANONICAL.contains(&kind) {
        return kind.to_string();
    }
    let mapped = match kind {
        "file" => "workspace_file",
        "transcript" => "transcript_summary",
        "verification" => "verification_result",
        "test" => "test_result",
        blank if blank.trim().is_empty() => "artifact",
        other => other,
    };
    mapped.to_string()
}
391
/// Default selection priority for an artifact kind (higher = kept first);
/// unknown kinds get 40.
fn default_artifact_priority(kind: &str) -> i64 {
    const PRIORITIES: &[(&[&str], i64)] = &[
        (&["verification_result", "test_result"], 100),
        (&["verification_bundle"], 95),
        (
            &[
                "diff",
                "git_diff",
                "patch",
                "patch_set",
                "patch_proposal",
                "diff_review",
                "review_decision",
                "apply_intent",
            ],
            90,
        ),
        (&["plan"], 80),
        (
            &["workspace_file", "workspace_snapshot", "editor_selection", "resource"],
            70,
        ),
        (&["summary", "transcript_summary"], 60),
        (&["command_result"], 50),
    ];
    PRIORITIES
        .iter()
        .find(|(kinds, _)| kinds.contains(&kind))
        .map(|&(_, priority)| priority)
        .unwrap_or(40)
}
405
/// Rank a freshness label for sorting: fresh/live (3) > recent (2) >
/// unknown/absent (1) > stale (0).
fn freshness_rank(value: Option<&str>) -> i64 {
    match value {
        Some("fresh") | Some("live") => 3,
        Some("recent") => 2,
        Some("stale") => 0,
        _ => 1,
    }
}
414
/// A capability ceiling/request: which tools, capability operations, and
/// workspace roots an execution may use. Empty lists mean "unrestricted"
/// for that dimension (see `intersect`).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct CapabilityPolicy {
    /// Allowed tool names; empty = no tool restriction.
    pub tools: Vec<String>,
    /// Allowed operations per capability name; empty map = no restriction.
    pub capabilities: BTreeMap<String, Vec<String>>,
    /// Allowed workspace root paths; empty = no restriction.
    pub workspace_roots: Vec<String>,
    /// Side-effect ceiling (see `min_side_effect` for the ordering).
    pub side_effect_level: Option<String>,
    /// Maximum nesting depth for recursive execution, if any.
    pub recursion_limit: Option<usize>,
    /// Argument-level constraints for specific tools.
    #[serde(default)]
    pub tool_arg_constraints: Vec<ToolArgConstraint>,
}
427
impl CapabilityPolicy {
    /// Intersect this policy (treated as the host ceiling) with a
    /// `requested` policy.
    ///
    /// Empty lists on either side mean "unrestricted" for that dimension:
    /// the other side's values are taken verbatim. Non-empty requests are
    /// first validated against the ceiling (exceeding it is an error), then
    /// intersected. Side-effect level and recursion limit take the more
    /// restrictive value; arg constraints from both sides are concatenated.
    ///
    /// Returns `Err` with a human-readable reason when the request exceeds
    /// the host ceiling.
    pub fn intersect(&self, requested: &CapabilityPolicy) -> Result<CapabilityPolicy, String> {
        // Most restrictive side-effect level wins; a lone Some passes through.
        let side_effect_level = match (&self.side_effect_level, &requested.side_effect_level) {
            (Some(a), Some(b)) => Some(min_side_effect(a, b).to_string()),
            (Some(a), None) => Some(a.clone()),
            (None, Some(b)) => Some(b.clone()),
            (None, None) => None,
        };

        // Validate tools: an empty host list imposes no ceiling; otherwise
        // every requested tool must be present in it.
        if !self.tools.is_empty() {
            let denied: Vec<String> = requested
                .tools
                .iter()
                .filter(|tool| !self.tools.contains(*tool))
                .cloned()
                .collect();
            if !denied.is_empty() {
                return Err(format!(
                    "requested tools exceed host ceiling: {}",
                    denied.join(", ")
                ));
            }
        }

        // Validate capabilities: requested ops must be a subset of the
        // ceiling's ops for that capability. A capability unknown to the
        // host is rejected unless the host declared no ceiling at all.
        for (capability, requested_ops) in &requested.capabilities {
            if let Some(allowed_ops) = self.capabilities.get(capability) {
                let denied: Vec<String> = requested_ops
                    .iter()
                    .filter(|op| !allowed_ops.contains(*op))
                    .cloned()
                    .collect();
                if !denied.is_empty() {
                    return Err(format!(
                        "requested capability operations exceed host ceiling: {}.{}",
                        capability,
                        denied.join(",")
                    ));
                }
            } else if !self.capabilities.is_empty() {
                return Err(format!(
                    "requested capability exceeds host ceiling: {capability}"
                ));
            }
        }

        // Intersection with "empty = unrestricted" on each side.
        let tools = if self.tools.is_empty() {
            requested.tools.clone()
        } else if requested.tools.is_empty() {
            self.tools.clone()
        } else {
            requested
                .tools
                .iter()
                .filter(|tool| self.tools.contains(*tool))
                .cloned()
                .collect()
        };

        let capabilities = if self.capabilities.is_empty() {
            requested.capabilities.clone()
        } else if requested.capabilities.is_empty() {
            self.capabilities.clone()
        } else {
            requested
                .capabilities
                .iter()
                .filter_map(|(capability, requested_ops)| {
                    self.capabilities.get(capability).map(|allowed_ops| {
                        (
                            capability.clone(),
                            requested_ops
                                .iter()
                                .filter(|op| allowed_ops.contains(*op))
                                .cloned()
                                .collect::<Vec<_>>(),
                        )
                    })
                })
                .collect()
        };

        let workspace_roots = if self.workspace_roots.is_empty() {
            requested.workspace_roots.clone()
        } else if requested.workspace_roots.is_empty() {
            self.workspace_roots.clone()
        } else {
            requested
                .workspace_roots
                .iter()
                .filter(|root| self.workspace_roots.contains(*root))
                .cloned()
                .collect()
        };

        // Tighter (smaller) recursion limit wins.
        let recursion_limit = match (self.recursion_limit, requested.recursion_limit) {
            (Some(a), Some(b)) => Some(a.min(b)),
            (Some(a), None) => Some(a),
            (None, Some(b)) => Some(b),
            (None, None) => None,
        };

        // Merge arg constraints from both sides (concatenation, not dedup).
        let mut tool_arg_constraints = self.tool_arg_constraints.clone();
        tool_arg_constraints.extend(requested.tool_arg_constraints.clone());

        Ok(CapabilityPolicy {
            tools,
            capabilities,
            workspace_roots,
            side_effect_level,
            recursion_limit,
            tool_arg_constraints,
        })
    }
}
543
/// Return the less privileged of two side-effect levels, ordered
/// none < read_only < workspace_write < process_exec < network < (unknown).
/// On a tie the first argument is returned; unknown labels rank highest.
fn min_side_effect<'a>(a: &'a str, b: &'a str) -> &'a str {
    fn rank(level: &str) -> usize {
        ["none", "read_only", "workspace_write", "process_exec", "network"]
            .iter()
            .position(|&known| known == level)
            .unwrap_or(5)
    }
    if rank(b) < rank(a) { b } else { a }
}
561
/// Per-stage model selection overrides. All fields are optional; unset
/// fields are left for the consumer to default (semantics live in the LLM
/// layer — not visible here).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct ModelPolicy {
    /// Provider identifier.
    pub provider: Option<String>,
    /// Concrete model name.
    pub model: Option<String>,
    /// Abstract model tier (used when no concrete model is pinned — TODO confirm).
    pub model_tier: Option<String>,
    /// Sampling temperature.
    pub temperature: Option<f64>,
    /// Response token cap.
    pub max_tokens: Option<i64>,
}
571
/// Controls how a stage's transcript is retained, summarized, and exposed.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct TranscriptPolicy {
    /// Transcript handling mode (string-typed; values defined by the runner).
    pub mode: Option<String>,
    /// Who may see the transcript (values defined by the runner).
    pub visibility: Option<String>,
    /// Whether to produce a summary of the transcript.
    pub summarize: bool,
    /// Whether to compact the transcript.
    pub compact: bool,
    /// Number of recent messages to keep when compacting.
    pub keep_last: Option<usize>,
}
581
/// Artifact selection and budgeting policy, consumed by
/// `select_artifacts_adaptive` (and the `select_artifacts` it delegates to).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ContextPolicy {
    /// Cap on the number of artifacts selected.
    pub max_artifacts: Option<usize>,
    /// Total token budget; also drives per-artifact microcompaction.
    pub max_tokens: Option<usize>,
    /// Tokens held back from the budget (e.g. for the response — TODO confirm).
    pub reserve_tokens: Option<usize>,
    /// Only include artifacts of these kinds (empty = all).
    pub include_kinds: Vec<String>,
    /// Exclude artifacts of these kinds.
    pub exclude_kinds: Vec<String>,
    /// Kinds to rank ahead of others.
    pub prioritize_kinds: Vec<String>,
    /// Artifact ids that must always be included.
    pub pinned_ids: Vec<String>,
    /// Only include artifacts produced by these stages (empty = all).
    pub include_stages: Vec<String>,
    /// Prefer more recently created artifacts.
    pub prefer_recent: bool,
    /// Prefer artifacts with a fresher `freshness` label.
    pub prefer_fresh: bool,
    /// Rendering mode for assembled context (values defined by the consumer).
    pub render: Option<String>,
}
597
/// Retry behavior for a stage.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RetryPolicy {
    /// Maximum attempts (0 defaults to a single attempt — TODO confirm in runner).
    pub max_attempts: usize,
    /// Run verification between attempts.
    pub verify: bool,
    /// Attempt automatic repair on failure.
    pub repair: bool,
}
605
/// Declares what artifact kinds a stage consumes or produces, with optional
/// cardinality bounds and an output schema.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct StageContract {
    /// Acceptable input artifact kinds.
    pub input_kinds: Vec<String>,
    /// Artifact kinds the stage is expected to emit.
    pub output_kinds: Vec<String>,
    /// Minimum number of inputs required.
    pub min_inputs: Option<usize>,
    /// Maximum number of inputs allowed.
    pub max_inputs: Option<usize>,
    /// Whether a transcript must be present.
    pub require_transcript: bool,
    /// Optional JSON schema for validating outputs — TODO confirm enforcement site.
    pub schema: Option<serde_json::Value>,
}
616
/// Maps stage outcome classes to branch labels used by workflow edges
/// (`WorkflowEdge::branch`). Unset entries fall back to runner defaults —
/// TODO confirm fallback behavior at the routing site.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct BranchSemantics {
    pub success: Option<String>,
    pub failure: Option<String>,
    pub verify_pass: Option<String>,
    pub verify_fail: Option<String>,
    pub condition_true: Option<String>,
    pub condition_false: Option<String>,
    pub loop_continue: Option<String>,
    pub loop_exit: Option<String>,
    pub escalation: Option<String>,
}
630
/// Fan-out configuration for map-style nodes: one unit of work per item.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct MapPolicy {
    /// Items to fan out over.
    pub items: Vec<serde_json::Value>,
    /// Artifact kind to wrap each item in — TODO confirm against the runner.
    pub item_artifact_kind: Option<String>,
    /// Artifact kind for per-item outputs.
    pub output_kind: Option<String>,
    /// Cap on the number of items processed.
    pub max_items: Option<usize>,
}
639
/// Fan-in configuration for join-style nodes.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct JoinPolicy {
    /// Join strategy name (string-typed; values defined by the runner).
    pub strategy: String,
    /// Whether every upstream input must be present before joining.
    pub require_all_inputs: bool,
    /// Minimum number of completed inputs required to proceed.
    pub min_completed: Option<usize>,
}
647
/// Configuration for reduce-style nodes that combine many inputs into one.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReducePolicy {
    /// Reduce strategy name (string-typed; values defined by the runner).
    pub strategy: String,
    /// Separator used when concatenating — TODO confirm strategy-dependence.
    pub separator: Option<String>,
    /// Artifact kind for the reduced output.
    pub output_kind: Option<String>,
}
655
/// Where and why to escalate when a stage cannot proceed on its own.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct EscalationPolicy {
    /// Escalation severity/level label.
    pub level: Option<String>,
    /// Target queue for the escalation.
    pub queue: Option<String>,
    /// Human-readable reason.
    pub reason: Option<String>,
}
663
/// A unit of content flowing between workflow stages (file text, diff,
/// verification result, …). `normalize()` fills in defaults for empty
/// fields; `select_artifacts_adaptive` budgets and dedups collections.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct ArtifactRecord {
    /// Type discriminator, serialized as `_type`; defaults to `"artifact"`.
    #[serde(rename = "_type")]
    pub type_name: String,
    /// Unique id; generated by `normalize()` when empty.
    pub id: String,
    /// Canonical kind (see `normalize_artifact_kind`).
    pub kind: String,
    pub title: Option<String>,
    /// Main textual payload; drives token estimation and microcompaction.
    pub text: Option<String>,
    /// Structured payload, if any.
    pub data: Option<serde_json::Value>,
    pub source: Option<String>,
    /// Creation timestamp; filled by `normalize()` when empty.
    pub created_at: String,
    /// Freshness label ranked by `freshness_rank` ("fresh"/"live"/"recent"/"stale").
    pub freshness: Option<String>,
    /// Selection priority; defaulted per kind by `normalize()`.
    pub priority: Option<i64>,
    /// Ids of ancestor artifacts.
    pub lineage: Vec<String>,
    pub relevance: Option<f64>,
    /// Rough token count (~text chars / 4, set by `normalize()` when absent).
    pub estimated_tokens: Option<usize>,
    /// Producing stage id, if known.
    pub stage: Option<String>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
684
685impl ArtifactRecord {
686    pub fn normalize(mut self) -> Self {
687        if self.type_name.is_empty() {
688            self.type_name = "artifact".to_string();
689        }
690        if self.id.is_empty() {
691            self.id = new_id("artifact");
692        }
693        if self.created_at.is_empty() {
694            self.created_at = now_rfc3339();
695        }
696        if self.kind.is_empty() {
697            self.kind = "artifact".to_string();
698        }
699        self.kind = normalize_artifact_kind(&self.kind);
700        if self.estimated_tokens.is_none() {
701            self.estimated_tokens = self
702                .text
703                .as_ref()
704                .map(|text| ((text.len() as f64) / 4.0).ceil() as usize);
705        }
706        if self.priority.is_none() {
707            self.priority = Some(default_artifact_priority(&self.kind));
708        }
709        self
710    }
711}
712
/// A single node in a workflow graph, bundling its prompt and every policy
/// that governs its execution. All policies default via `#[serde(default)]`.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct WorkflowNode {
    /// Node id; optional here, keyed externally in `WorkflowGraph::nodes`.
    pub id: Option<String>,
    /// Node kind (string-typed; values defined by the runner).
    pub kind: String,
    pub mode: Option<String>,
    /// User prompt text for LLM-backed nodes.
    pub prompt: Option<String>,
    /// System prompt text.
    pub system: Option<String>,
    pub task_label: Option<String>,
    /// Tools requested by this node (validated against capability policy).
    pub tools: Vec<String>,
    pub model_policy: ModelPolicy,
    pub transcript_policy: TranscriptPolicy,
    pub context_policy: ContextPolicy,
    pub retry_policy: RetryPolicy,
    pub capability_policy: CapabilityPolicy,
    pub input_contract: StageContract,
    pub output_contract: StageContract,
    pub branch_semantics: BranchSemantics,
    pub map_policy: MapPolicy,
    pub join_policy: JoinPolicy,
    pub reduce_policy: ReducePolicy,
    pub escalation_policy: EscalationPolicy,
    /// Verification spec — TODO confirm shape against the verifier stage.
    pub verify: Option<serde_json::Value>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
738
/// Directed edge between two workflow nodes, optionally taken only for a
/// specific branch label (see `BranchSemantics`).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct WorkflowEdge {
    /// Source node id.
    pub from: String,
    /// Destination node id.
    pub to: String,
    /// Branch label gating this edge; `None` presumably means unconditional — TODO confirm.
    pub branch: Option<String>,
    /// Display label.
    pub label: Option<String>,
}
747
/// A complete workflow: nodes keyed by id, directed edges, an entry node,
/// a top-level capability ceiling, and an audit trail of mutations.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct WorkflowGraph {
    /// Type discriminator, serialized as `_type`.
    #[serde(rename = "_type")]
    pub type_name: String,
    pub id: String,
    pub name: Option<String>,
    /// Monotonic version counter — TODO confirm bump sites.
    pub version: usize,
    /// Id of the node where execution starts.
    pub entry: String,
    pub nodes: BTreeMap<String, WorkflowNode>,
    pub edges: Vec<WorkflowEdge>,
    /// Graph-wide capability ceiling applied to all nodes.
    pub capability_policy: CapabilityPolicy,
    pub metadata: BTreeMap<String, serde_json::Value>,
    /// Record of edits made to this graph.
    pub audit_log: Vec<WorkflowAuditEntry>,
}
763
/// One entry in a workflow's audit log: what operation touched which node,
/// when, and why.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct WorkflowAuditEntry {
    pub id: String,
    /// Operation name.
    pub op: String,
    /// Node affected, if the operation targeted one.
    pub node_id: Option<String>,
    pub timestamp: String,
    pub reason: Option<String>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
774
/// Execution record for one stage of a run: status/outcome, timing, output
/// text, artifacts consumed and produced, and per-attempt details.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct RunStageRecord {
    pub id: String,
    /// Workflow node this stage executed.
    pub node_id: String,
    pub kind: String,
    /// Lifecycle status (string-typed; values defined by the runner).
    pub status: String,
    /// Outcome classification used for branching.
    pub outcome: String,
    /// Branch label actually taken, if any.
    pub branch: Option<String>,
    pub started_at: String,
    pub finished_at: Option<String>,
    /// Output text shown to the user/next stage.
    pub visible_text: Option<String>,
    /// Reasoning kept out of the visible transcript.
    pub private_reasoning: Option<String>,
    pub transcript: Option<serde_json::Value>,
    pub verification: Option<serde_json::Value>,
    /// Artifacts attached to this stage.
    pub artifacts: Vec<ArtifactRecord>,
    pub consumed_artifact_ids: Vec<String>,
    pub produced_artifact_ids: Vec<String>,
    /// One record per retry attempt.
    pub attempts: Vec<RunStageAttemptRecord>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
796
/// One attempt at executing a stage (see `RetryPolicy::max_attempts`).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct RunStageAttemptRecord {
    /// 0- or 1-based attempt index — TODO confirm convention at the call site.
    pub attempt: usize,
    pub status: String,
    pub outcome: String,
    pub branch: Option<String>,
    /// Error message when the attempt failed.
    pub error: Option<String>,
    pub verification: Option<serde_json::Value>,
    pub started_at: String,
    pub finished_at: Option<String>,
}
809
/// A recorded transition between stages/nodes during a run, including which
/// artifacts crossed the edge.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunTransitionRecord {
    pub id: String,
    /// Stage we transitioned from (`None` for the initial entry).
    pub from_stage_id: Option<String>,
    pub from_node_id: Option<String>,
    pub to_node_id: String,
    /// Branch label that selected this transition, if any.
    pub branch: Option<String>,
    pub timestamp: String,
    pub consumed_artifact_ids: Vec<String>,
    pub produced_artifact_ids: Vec<String>,
}
822
/// A persisted snapshot of run progress, sufficient to resume scheduling
/// (ready vs. completed nodes and the last stage executed).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunCheckpointRecord {
    pub id: String,
    /// Nodes eligible to run next.
    pub ready_nodes: Vec<String>,
    pub completed_nodes: Vec<String>,
    pub last_stage_id: Option<String>,
    pub persisted_at: String,
    /// Why the checkpoint was taken.
    pub reason: String,
}
833
/// Expectations captured from a source run, used to assert that a replayed
/// run behaves the same (see `ReplayStageAssertion`).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayFixture {
    /// Type discriminator, serialized as `_type`.
    #[serde(rename = "_type")]
    pub type_name: String,
    pub id: String,
    /// Run the fixture was captured from.
    pub source_run_id: String,
    pub workflow_id: String,
    pub workflow_name: Option<String>,
    pub created_at: String,
    /// Expected overall run status.
    pub expected_status: String,
    /// Per-stage expectations.
    pub stage_assertions: Vec<ReplayStageAssertion>,
}
847
/// Per-stage expectation within a `ReplayFixture`.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayStageAssertion {
    pub node_id: String,
    pub expected_status: String,
    pub expected_outcome: String,
    pub expected_branch: Option<String>,
    /// Artifact kinds that must appear in the stage's output.
    pub required_artifact_kinds: Vec<String>,
    /// Substring that must occur in the stage's visible text.
    pub visible_text_contains: Option<String>,
}
858
/// Result of evaluating one replay against its fixture.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayEvalReport {
    /// True when every assertion held.
    pub pass: bool,
    /// Human-readable descriptions of each failed assertion.
    pub failures: Vec<String>,
    /// Number of stages examined.
    pub stage_count: usize,
}
866
/// Per-case result within a replay evaluation suite.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayEvalCaseReport {
    pub run_id: String,
    pub workflow_id: String,
    /// Case label from the suite manifest, if any.
    pub label: Option<String>,
    pub pass: bool,
    pub failures: Vec<String>,
    pub stage_count: usize,
    /// Path the case's run was loaded from.
    pub source_path: Option<String>,
    /// Diff against a comparison run, when `compare_to` was configured.
    pub comparison: Option<RunDiffReport>,
}
879
/// Aggregate result for a replay evaluation suite.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayEvalSuiteReport {
    /// True when every case passed.
    pub pass: bool,
    pub total: usize,
    pub passed: usize,
    pub failed: usize,
    pub cases: Vec<ReplayEvalCaseReport>,
}
889
/// One per-stage difference found when comparing two runs.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunStageDiffRecord {
    pub node_id: String,
    /// Kind of change (string-typed; values defined by the differ).
    pub change: String,
    /// Human-readable details of each difference.
    pub details: Vec<String>,
}
897
/// Structural comparison of two runs (left vs. right), with per-stage diffs
/// and signed deltas on the count fields (right minus left — TODO confirm sign).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunDiffReport {
    pub left_run_id: String,
    pub right_run_id: String,
    /// True when no differences were found.
    pub identical: bool,
    pub status_changed: bool,
    pub left_status: String,
    pub right_status: String,
    pub stage_diffs: Vec<RunStageDiffRecord>,
    pub transition_count_delta: isize,
    pub artifact_count_delta: isize,
    pub checkpoint_count_delta: isize,
}
912
/// Manifest describing a suite of replay-evaluation cases.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct EvalSuiteManifest {
    /// Type discriminator, serialized as `_type`.
    #[serde(rename = "_type")]
    pub type_name: String,
    pub id: String,
    pub name: Option<String>,
    /// Directory that relative case paths are resolved against — TODO confirm.
    pub base_dir: Option<String>,
    pub cases: Vec<EvalSuiteCase>,
}
923
/// One case in an evaluation suite: a run to load, an optional fixture to
/// assert against, and an optional second run to diff against.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct EvalSuiteCase {
    pub label: Option<String>,
    /// Path to the persisted run record.
    pub run_path: String,
    /// Path to a `ReplayFixture`, if assertions should be evaluated.
    pub fixture_path: Option<String>,
    /// Path to a comparison run for producing a `RunDiffReport`.
    pub compare_to: Option<String>,
}
932
/// Persistent record of one workflow run: per-stage results, transitions,
/// checkpoints, artifacts, and replay metadata. Saved/loaded as JSON via
/// `save_run_record` / `load_run_record`.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct RunRecord {
    /// Serialized type tag (defaults to "run_record" on normalization).
    #[serde(rename = "_type")]
    pub type_name: String,
    /// Unique run id (generated when absent).
    pub id: String,
    /// Id of the workflow graph this run executed.
    pub workflow_id: String,
    /// Optional workflow display name.
    pub workflow_name: Option<String>,
    /// The task the run was asked to perform.
    pub task: String,
    /// Current status (defaults to "running" on normalization).
    pub status: String,
    /// Start timestamp; filled in on normalization when empty.
    pub started_at: String,
    /// Completion timestamp, once finished.
    pub finished_at: Option<String>,
    /// Id of the parent run, when this run was spawned by another.
    pub parent_run_id: Option<String>,
    /// Id of the root of the run tree; defaults to this run's own id.
    pub root_run_id: Option<String>,
    /// Per-node stage execution records.
    pub stages: Vec<RunStageRecord>,
    /// Recorded node-to-node transitions.
    pub transitions: Vec<RunTransitionRecord>,
    /// Recorded checkpoints.
    pub checkpoints: Vec<RunCheckpointRecord>,
    /// Nodes not yet executed.
    pub pending_nodes: Vec<String>,
    /// Nodes that finished executing.
    pub completed_nodes: Vec<String>,
    /// Child runs spawned by this run.
    pub child_runs: Vec<RunChildRecord>,
    /// Artifacts produced during the run.
    pub artifacts: Vec<ArtifactRecord>,
    /// Capability policy the run executed under.
    pub policy: CapabilityPolicy,
    /// Execution-environment details, when recorded.
    pub execution: Option<RunExecutionRecord>,
    /// Raw transcript payload, when captured.
    pub transcript: Option<serde_json::Value>,
    /// Replay fixture; derived from the run itself when absent.
    pub replay_fixture: Option<ReplayFixture>,
    /// Free-form metadata.
    pub metadata: BTreeMap<String, serde_json::Value>,
    /// Path the record was persisted to, when known.
    pub persisted_path: Option<String>,
}
961
/// Bookkeeping entry for a child run spawned by a worker.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunChildRecord {
    /// Id of the worker that ran the child task.
    pub worker_id: String,
    /// Display name of the worker.
    pub worker_name: String,
    /// Stage of the parent run that spawned this child, when known.
    pub parent_stage_id: Option<String>,
    /// The task delegated to the child.
    pub task: String,
    /// Child run status.
    pub status: String,
    /// Start timestamp.
    pub started_at: String,
    /// Completion timestamp, once finished.
    pub finished_at: Option<String>,
    /// Id of the child's own run record, when one was created.
    pub run_id: Option<String>,
    /// Path to the child's persisted run record, when known.
    pub run_path: Option<String>,
    /// Path to a snapshot of the child's state, when captured.
    pub snapshot_path: Option<String>,
    /// Execution-environment details for the child, when recorded.
    pub execution: Option<RunExecutionRecord>,
}
977
/// Where and how a run executed. The repo/worktree/branch fields suggest a
/// git-worktree-based isolation scheme — NOTE(review): confirm against the
/// adapter implementations, which are outside this file.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunExecutionRecord {
    /// Working directory the run executed in.
    pub cwd: Option<String>,
    /// Source directory the run was materialized from.
    pub source_dir: Option<String>,
    /// Environment variables set for the run.
    pub env: BTreeMap<String, String>,
    /// Name of the execution adapter used.
    pub adapter: Option<String>,
    /// Path to the backing repository.
    pub repo_path: Option<String>,
    /// Path to the worktree used for isolation.
    pub worktree_path: Option<String>,
    /// Branch the run worked on.
    pub branch: Option<String>,
    /// Ref the work was based on.
    pub base_ref: Option<String>,
    /// Cleanup policy or status for the execution environment.
    pub cleanup: Option<String>,
}
991
/// Output of `validate_workflow`: hard errors, advisory warnings, and the
/// set of nodes reachable from the graph entry.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct WorkflowValidationReport {
    /// True when `errors` is empty (warnings do not invalidate).
    pub valid: bool,
    /// Structural violations that make the graph unusable.
    pub errors: Vec<String>,
    /// Advisory findings (e.g. unreachable nodes).
    pub warnings: Vec<String>,
    /// Node ids reachable from the entry node.
    pub reachable_nodes: Vec<String>,
}
1000
1001fn parse_json_value<T: for<'de> Deserialize<'de>>(value: &VmValue) -> Result<T, VmError> {
1002    serde_json::from_value(vm_value_to_json(value))
1003        .map_err(|e| VmError::Runtime(format!("orchestration parse error: {e}")))
1004}
1005
/// Converts a raw `VmValue` into a fully-populated `WorkflowGraph`.
///
/// The value is first deserialized directly. If that yields no nodes, the
/// dict's `act` / `verify` / `repair` keys are interpreted as a shorthand
/// three-stage workflow: each key becomes a node, top-level model settings
/// (`provider`, `model`, `model_tier`/`tier`, `temperature`, `max_tokens`,
/// `mode`) act as per-node fallbacks, and the default
/// act -> verify -> (repair on "failed" / retry back to verify) edge layout
/// is synthesized. Finally, graph- and node-level defaults are filled in.
///
/// # Errors
/// Returns `VmError::Runtime` when the value (or a shorthand node) cannot
/// be parsed into the expected shape.
pub fn normalize_workflow_value(value: &VmValue) -> Result<WorkflowGraph, VmError> {
    let mut graph: WorkflowGraph = parse_json_value(value)?;
    let as_dict = value.as_dict().cloned().unwrap_or_default();

    // Shorthand form: no explicit node map, so build nodes from the
    // well-known act/verify/repair keys.
    if graph.nodes.is_empty() {
        for key in ["act", "verify", "repair"] {
            if let Some(node_value) = as_dict.get(key) {
                let mut node: WorkflowNode = parse_json_value(node_value)?;
                let raw_node = node_value.as_dict().cloned().unwrap_or_default();
                node.id = Some(key.to_string());
                if node.kind.is_empty() {
                    node.kind = if key == "verify" {
                        "verify".to_string()
                    } else {
                        "stage".to_string()
                    };
                }
                // Top-level model settings serve as fallbacks for each node;
                // empty strings are treated as absent.
                if node.model_policy.provider.is_none() {
                    node.model_policy.provider = as_dict
                        .get("provider")
                        .map(|value| value.display())
                        .filter(|value| !value.is_empty());
                }
                if node.model_policy.model.is_none() {
                    node.model_policy.model = as_dict
                        .get("model")
                        .map(|value| value.display())
                        .filter(|value| !value.is_empty());
                }
                if node.model_policy.model_tier.is_none() {
                    // "tier" is accepted as an alias for "model_tier".
                    node.model_policy.model_tier = as_dict
                        .get("model_tier")
                        .or_else(|| as_dict.get("tier"))
                        .map(|value| value.display())
                        .filter(|value| !value.is_empty());
                }
                if node.model_policy.temperature.is_none() {
                    // Accept either a float or an int (widened to f64).
                    node.model_policy.temperature = as_dict.get("temperature").and_then(|value| {
                        if let VmValue::Float(number) = value {
                            Some(*number)
                        } else {
                            value.as_int().map(|number| number as f64)
                        }
                    });
                }
                if node.model_policy.max_tokens.is_none() {
                    node.model_policy.max_tokens =
                        as_dict.get("max_tokens").and_then(|value| value.as_int());
                }
                if node.mode.is_none() {
                    node.mode = as_dict
                        .get("mode")
                        .map(|value| value.display())
                        .filter(|value| !value.is_empty());
                }
                // Lift inline verify settings on the shorthand verify node
                // into a structured verify spec.
                if key == "verify"
                    && node.verify.is_none()
                    && (raw_node.contains_key("assert_text")
                        || raw_node.contains_key("command")
                        || raw_node.contains_key("expect_status")
                        || raw_node.contains_key("expect_text"))
                {
                    node.verify = Some(serde_json::json!({
                        "assert_text": raw_node.get("assert_text").map(vm_value_to_json),
                        "command": raw_node.get("command").map(vm_value_to_json),
                        "expect_status": raw_node.get("expect_status").map(vm_value_to_json),
                        "expect_text": raw_node.get("expect_text").map(vm_value_to_json),
                    }));
                }
                graph.nodes.insert(key.to_string(), node);
            }
        }
        if graph.entry.is_empty() && graph.nodes.contains_key("act") {
            graph.entry = "act".to_string();
        }
        // Synthesize the default edge layout for the shorthand graph:
        // act -> verify, verify -(failed)-> repair, repair -(retry)-> verify.
        if graph.edges.is_empty() && graph.nodes.contains_key("act") {
            if graph.nodes.contains_key("verify") {
                graph.edges.push(WorkflowEdge {
                    from: "act".to_string(),
                    to: "verify".to_string(),
                    branch: None,
                    label: None,
                });
            }
            if graph.nodes.contains_key("repair") {
                graph.edges.push(WorkflowEdge {
                    from: "verify".to_string(),
                    to: "repair".to_string(),
                    branch: Some("failed".to_string()),
                    label: None,
                });
                graph.edges.push(WorkflowEdge {
                    from: "repair".to_string(),
                    to: "verify".to_string(),
                    branch: Some("retry".to_string()),
                    label: None,
                });
            }
        }
    }

    // Graph-level defaults.
    if graph.type_name.is_empty() {
        graph.type_name = "workflow_graph".to_string();
    }
    if graph.id.is_empty() {
        graph.id = new_id("workflow");
    }
    if graph.version == 0 {
        graph.version = 1;
    }
    // Fall back to the first node (BTreeMap order) or "act" as the entry.
    if graph.entry.is_empty() {
        graph.entry = graph
            .nodes
            .keys()
            .next()
            .cloned()
            .unwrap_or_else(|| "act".to_string());
    }
    // Node-level defaults.
    for (node_id, node) in &mut graph.nodes {
        if node.id.is_none() {
            node.id = Some(node_id.clone());
        }
        if node.kind.is_empty() {
            node.kind = "stage".to_string();
        }
        if node.join_policy.strategy.is_empty() {
            node.join_policy.strategy = "all".to_string();
        }
        if node.reduce_policy.strategy.is_empty() {
            node.reduce_policy.strategy = "concat".to_string();
        }
        // Default output kind depends on the node kind.
        if node.output_contract.output_kinds.is_empty() {
            node.output_contract.output_kinds = vec![match node.kind.as_str() {
                "verify" => "verification_result".to_string(),
                "reduce" => node
                    .reduce_policy
                    .output_kind
                    .clone()
                    .unwrap_or_else(|| "summary".to_string()),
                "map" => node
                    .map_policy
                    .output_kind
                    .clone()
                    .unwrap_or_else(|| "artifact".to_string()),
                "escalation" => "plan".to_string(),
                _ => "artifact".to_string(),
            }];
        }
        if node.retry_policy.max_attempts == 0 {
            node.retry_policy.max_attempts = 1;
        }
    }
    Ok(graph)
}
1160
/// Statically checks a `WorkflowGraph` for structural problems.
///
/// Validates that the entry node and all edge endpoints exist, warns about
/// unreachable nodes, enforces per-kind node rules (condition, fork, join,
/// map, reduce), and — when a capability `ceiling` is supplied — checks
/// that the graph-level and each node-level capability policy can be
/// intersected with it.
///
/// Hard violations go into `errors` (report invalid); advisory findings go
/// into `warnings`.
pub fn validate_workflow(
    graph: &WorkflowGraph,
    ceiling: Option<&CapabilityPolicy>,
) -> WorkflowValidationReport {
    let mut errors = Vec::new();
    let mut warnings = Vec::new();

    if !graph.nodes.contains_key(&graph.entry) {
        errors.push(format!("entry node does not exist: {}", graph.entry));
    }

    // Every edge must reference existing nodes on both ends.
    let node_ids: BTreeSet<String> = graph.nodes.keys().cloned().collect();
    for edge in &graph.edges {
        if !node_ids.contains(&edge.from) {
            errors.push(format!("edge.from references unknown node: {}", edge.from));
        }
        if !node_ids.contains(&edge.to) {
            errors.push(format!("edge.to references unknown node: {}", edge.to));
        }
    }

    // Unreachable nodes are advisory only.
    let reachable_nodes = reachable_nodes(graph);
    for node_id in &node_ids {
        if !reachable_nodes.contains(node_id) {
            warnings.push(format!("node is unreachable: {node_id}"));
        }
    }

    // Per-node checks: input contract sanity plus kind-specific rules.
    for (node_id, node) in &graph.nodes {
        let incoming = graph
            .edges
            .iter()
            .filter(|edge| edge.to == *node_id)
            .count();
        let outgoing: Vec<&WorkflowEdge> = graph
            .edges
            .iter()
            .filter(|edge| edge.from == *node_id)
            .collect();
        if let Some(min_inputs) = node.input_contract.min_inputs {
            if let Some(max_inputs) = node.input_contract.max_inputs {
                if min_inputs > max_inputs {
                    errors.push(format!(
                        "node {node_id}: input contract min_inputs exceeds max_inputs"
                    ));
                }
            }
        }
        match node.kind.as_str() {
            // Condition nodes must cover both outcomes.
            "condition" => {
                let has_true = outgoing
                    .iter()
                    .any(|edge| edge.branch.as_deref() == Some("true"));
                let has_false = outgoing
                    .iter()
                    .any(|edge| edge.branch.as_deref() == Some("false"));
                if !has_true || !has_false {
                    errors.push(format!(
                        "node {node_id}: condition nodes require both 'true' and 'false' branch edges"
                    ));
                }
            }
            // A fork that doesn't split is an error.
            "fork" => {
                if outgoing.len() < 2 {
                    errors.push(format!(
                        "node {node_id}: fork nodes require at least two outgoing edges"
                    ));
                }
            }
            // A join with a single input is suspicious but legal.
            "join" => {
                if incoming < 2 {
                    warnings.push(format!(
                        "node {node_id}: join node has fewer than two incoming edges"
                    ));
                }
            }
            // Map nodes need something to iterate over.
            "map" => {
                if node.map_policy.items.is_empty()
                    && node.map_policy.item_artifact_kind.is_none()
                    && node.input_contract.input_kinds.is_empty()
                {
                    errors.push(format!(
                        "node {node_id}: map nodes require items, item_artifact_kind, or input_contract.input_kinds"
                    ));
                }
            }
            // An unconstrained reduce silently consumes everything.
            "reduce" => {
                if node.input_contract.input_kinds.is_empty() {
                    warnings.push(format!(
                        "node {node_id}: reduce node has no input_contract.input_kinds; it will consume all available artifacts"
                    ));
                }
            }
            _ => {}
        }
    }

    // Capability policies (graph-wide and per-node) must fit under the ceiling.
    if let Some(ceiling) = ceiling {
        if let Err(error) = ceiling.intersect(&graph.capability_policy) {
            errors.push(error);
        }
        for (node_id, node) in &graph.nodes {
            if let Err(error) = ceiling.intersect(&node.capability_policy) {
                errors.push(format!("node {node_id}: {error}"));
            }
        }
    }

    WorkflowValidationReport {
        valid: errors.is_empty(),
        errors,
        warnings,
        reachable_nodes: reachable_nodes.into_iter().collect(),
    }
}
1276
1277fn reachable_nodes(graph: &WorkflowGraph) -> BTreeSet<String> {
1278    let mut seen = BTreeSet::new();
1279    let mut stack = vec![graph.entry.clone()];
1280    while let Some(node_id) = stack.pop() {
1281        if !seen.insert(node_id.clone()) {
1282            continue;
1283        }
1284        for edge in graph.edges.iter().filter(|edge| edge.from == node_id) {
1285            stack.push(edge.to.clone());
1286        }
1287    }
1288    seen
1289}
1290
/// Filters and orders `artifacts` per `policy`, then greedily selects them
/// under the policy's artifact-count and token budgets.
///
/// Filtering keeps artifacts whose kind is allowed (and not excluded) and
/// whose stage matches `include_stages` when that list is non-empty.
///
/// Sort precedence (earlier criterion wins; each prefers the "better"
/// artifact first): pinned ids, prioritized kinds, higher explicit
/// priority, fresher (only if `prefer_fresh`), more recent (only if
/// `prefer_recent`), higher relevance, then smaller estimated token cost.
///
/// Selection skips — rather than stops at — artifacts that would overflow
/// the remaining token budget, so smaller artifacts later in the order can
/// still fit. `reserve_tokens` is subtracted from `max_tokens` up front.
pub fn select_artifacts(
    mut artifacts: Vec<ArtifactRecord>,
    policy: &ContextPolicy,
) -> Vec<ArtifactRecord> {
    artifacts.retain(|artifact| {
        (policy.include_kinds.is_empty() || policy.include_kinds.contains(&artifact.kind))
            && !policy.exclude_kinds.contains(&artifact.kind)
            && (policy.include_stages.is_empty()
                || artifact
                    .stage
                    .as_ref()
                    .is_some_and(|stage| policy.include_stages.contains(stage)))
    });
    // Comparisons are written b-vs-a so that "true"/larger values sort first.
    artifacts.sort_by(|a, b| {
        let b_pinned = policy.pinned_ids.contains(&b.id);
        let a_pinned = policy.pinned_ids.contains(&a.id);
        b_pinned
            .cmp(&a_pinned)
            .then_with(|| {
                let b_prio_kind = policy.prioritize_kinds.contains(&b.kind);
                let a_prio_kind = policy.prioritize_kinds.contains(&a.kind);
                b_prio_kind.cmp(&a_prio_kind)
            })
            .then_with(|| {
                b.priority
                    .unwrap_or_default()
                    .cmp(&a.priority.unwrap_or_default())
            })
            .then_with(|| {
                if policy.prefer_fresh {
                    freshness_rank(b.freshness.as_deref())
                        .cmp(&freshness_rank(a.freshness.as_deref()))
                } else {
                    std::cmp::Ordering::Equal
                }
            })
            .then_with(|| {
                if policy.prefer_recent {
                    b.created_at.cmp(&a.created_at)
                } else {
                    std::cmp::Ordering::Equal
                }
            })
            .then_with(|| {
                b.relevance
                    .partial_cmp(&a.relevance)
                    .unwrap_or(std::cmp::Ordering::Equal)
            })
            .then_with(|| {
                // Cheapest artifact first; unknown cost sorts last.
                a.estimated_tokens
                    .unwrap_or(usize::MAX)
                    .cmp(&b.estimated_tokens.unwrap_or(usize::MAX))
            })
    });

    let mut selected = Vec::new();
    let mut used_tokens = 0usize;
    let reserve_tokens = policy.reserve_tokens.unwrap_or(0);
    let effective_max_tokens = policy
        .max_tokens
        .map(|max| max.saturating_sub(reserve_tokens));
    for artifact in artifacts {
        if let Some(max_artifacts) = policy.max_artifacts {
            if selected.len() >= max_artifacts {
                break;
            }
        }
        let next_tokens = artifact.estimated_tokens.unwrap_or(0);
        if let Some(max_tokens) = effective_max_tokens {
            if used_tokens + next_tokens > max_tokens {
                // Too big for the remaining budget: skip, keep trying smaller ones.
                continue;
            }
        }
        used_tokens += next_tokens;
        selected.push(artifact);
    }
    selected
}
1369
1370pub fn render_artifacts_context(artifacts: &[ArtifactRecord], policy: &ContextPolicy) -> String {
1371    let mut parts = Vec::new();
1372    for artifact in artifacts {
1373        let title = artifact
1374            .title
1375            .clone()
1376            .unwrap_or_else(|| format!("{} {}", artifact.kind, artifact.id));
1377        let body = artifact
1378            .text
1379            .clone()
1380            .or_else(|| artifact.data.as_ref().map(|v| v.to_string()))
1381            .unwrap_or_default();
1382        match policy.render.as_deref() {
1383            Some("json") => {
1384                parts.push(
1385                    serde_json::json!({
1386                        "id": artifact.id,
1387                        "kind": artifact.kind,
1388                        "title": title,
1389                        "source": artifact.source,
1390                        "freshness": artifact.freshness,
1391                        "priority": artifact.priority,
1392                        "text": body,
1393                    })
1394                    .to_string(),
1395                );
1396            }
1397            _ => parts.push(format!(
1398                "[{title}] kind={} source={} freshness={} priority={}\n{}",
1399                artifact.kind,
1400                artifact
1401                    .source
1402                    .clone()
1403                    .unwrap_or_else(|| "unknown".to_string()),
1404                artifact
1405                    .freshness
1406                    .clone()
1407                    .unwrap_or_else(|| "normal".to_string()),
1408                artifact.priority.unwrap_or_default(),
1409                body
1410            )),
1411        }
1412    }
1413    parts.join("\n\n")
1414}
1415
1416pub fn normalize_artifact(value: &VmValue) -> Result<ArtifactRecord, VmError> {
1417    let artifact: ArtifactRecord = parse_json_value(value)?;
1418    Ok(artifact.normalize())
1419}
1420
1421pub fn normalize_run_record(value: &VmValue) -> Result<RunRecord, VmError> {
1422    let mut run: RunRecord = parse_json_value(value)?;
1423    if run.type_name.is_empty() {
1424        run.type_name = "run_record".to_string();
1425    }
1426    if run.id.is_empty() {
1427        run.id = new_id("run");
1428    }
1429    if run.started_at.is_empty() {
1430        run.started_at = now_rfc3339();
1431    }
1432    if run.status.is_empty() {
1433        run.status = "running".to_string();
1434    }
1435    if run.root_run_id.is_none() {
1436        run.root_run_id = Some(run.id.clone());
1437    }
1438    if run.replay_fixture.is_none() {
1439        run.replay_fixture = Some(replay_fixture_from_run(&run));
1440    }
1441    Ok(run)
1442}
1443
1444pub fn normalize_eval_suite_manifest(value: &VmValue) -> Result<EvalSuiteManifest, VmError> {
1445    let mut manifest: EvalSuiteManifest = parse_json_value(value)?;
1446    if manifest.type_name.is_empty() {
1447        manifest.type_name = "eval_suite_manifest".to_string();
1448    }
1449    if manifest.id.is_empty() {
1450        manifest.id = new_id("eval_suite");
1451    }
1452    Ok(manifest)
1453}
1454
1455fn load_replay_fixture(path: &Path) -> Result<ReplayFixture, VmError> {
1456    let content = std::fs::read_to_string(path)
1457        .map_err(|e| VmError::Runtime(format!("failed to read replay fixture: {e}")))?;
1458    serde_json::from_str(&content)
1459        .map_err(|e| VmError::Runtime(format!("failed to parse replay fixture: {e}")))
1460}
1461
/// Resolves a manifest-relative path: absolute paths pass through
/// unchanged; relative ones are joined onto `base_dir` when present.
fn resolve_manifest_path(base_dir: Option<&Path>, path: &str) -> PathBuf {
    let candidate = PathBuf::from(path);
    if candidate.is_absolute() {
        return candidate;
    }
    match base_dir {
        Some(base) => base.join(candidate),
        None => candidate,
    }
}
1472
/// Evaluates every case in an `EvalSuiteManifest` and aggregates the
/// results into a suite report.
///
/// For each case: the run record is loaded from `run_path` (resolved
/// against the manifest's `base_dir`), checked against its replay fixture
/// (explicit `fixture_path`, else the run's embedded fixture, else one
/// derived from the run), and optionally diffed against a `compare_to`
/// baseline run — a non-identical diff fails the case.
///
/// # Errors
/// Returns `VmError::Runtime` when any run record, fixture, or baseline
/// file cannot be read or parsed.
pub fn evaluate_run_suite_manifest(
    manifest: &EvalSuiteManifest,
) -> Result<ReplayEvalSuiteReport, VmError> {
    let base_dir = manifest.base_dir.as_deref().map(Path::new);
    let mut reports = Vec::new();
    for case in &manifest.cases {
        let run_path = resolve_manifest_path(base_dir, &case.run_path);
        let run = load_run_record(&run_path)?;
        // Fixture precedence: explicit path > embedded > derived.
        let fixture = match &case.fixture_path {
            Some(path) => load_replay_fixture(&resolve_manifest_path(base_dir, path))?,
            None => run
                .replay_fixture
                .clone()
                .unwrap_or_else(|| replay_fixture_from_run(&run)),
        };
        let eval = evaluate_run_against_fixture(&run, &fixture);
        let mut pass = eval.pass;
        let mut failures = eval.failures;
        // Optional regression check against a baseline run.
        let comparison = match &case.compare_to {
            Some(path) => {
                let baseline_path = resolve_manifest_path(base_dir, path);
                let baseline = load_run_record(&baseline_path)?;
                let diff = diff_run_records(&baseline, &run);
                if !diff.identical {
                    pass = false;
                    failures.push(format!(
                        "run differs from baseline {} with {} stage changes",
                        baseline_path.display(),
                        diff.stage_diffs.len()
                    ));
                }
                Some(diff)
            }
            None => None,
        };
        reports.push(ReplayEvalCaseReport {
            run_id: run.id.clone(),
            workflow_id: run.workflow_id.clone(),
            label: case.label.clone(),
            pass,
            failures,
            stage_count: eval.stage_count,
            source_path: Some(run_path.display().to_string()),
            comparison,
        });
    }
    let total = reports.len();
    let passed = reports.iter().filter(|report| report.pass).count();
    let failed = total.saturating_sub(passed);
    Ok(ReplayEvalSuiteReport {
        pass: failed == 0,
        total,
        passed,
        failed,
        cases: reports,
    })
}
1530
/// Produces a line-based diff of `before` vs `after` in a diff-like text
/// format: `--- a/<path>` / `+++ b/<path>` headers (label "artifact" when
/// `path` is `None`) followed by ` ` context, `-` removed, and `+` added
/// lines. Note there are no `@@` hunk headers; the full body is emitted.
///
/// Alignment is computed with a classic O(n*m) LCS table over lines.
pub fn render_unified_diff(path: Option<&str>, before: &str, after: &str) -> String {
    let old_lines: Vec<&str> = before.lines().collect();
    let new_lines: Vec<&str> = after.lines().collect();

    // lcs[i][j] = length of the longest common subsequence of
    // old_lines[i..] and new_lines[j..].
    let mut lcs = vec![vec![0usize; new_lines.len() + 1]; old_lines.len() + 1];
    for i in (0..old_lines.len()).rev() {
        for j in (0..new_lines.len()).rev() {
            lcs[i][j] = if old_lines[i] == new_lines[j] {
                lcs[i + 1][j + 1] + 1
            } else {
                lcs[i + 1][j].max(lcs[i][j + 1])
            };
        }
    }

    let label = path.unwrap_or("artifact");
    let mut out = format!("--- a/{label}\n+++ b/{label}\n");
    let (mut i, mut j) = (0usize, 0usize);
    // Walk the table, emitting deletions before insertions on ties.
    loop {
        if i < old_lines.len() && j < new_lines.len() {
            if old_lines[i] == new_lines[j] {
                out.push_str(&format!(" {}\n", old_lines[i]));
                i += 1;
                j += 1;
            } else if lcs[i + 1][j] >= lcs[i][j + 1] {
                out.push_str(&format!("-{}\n", old_lines[i]));
                i += 1;
            } else {
                out.push_str(&format!("+{}\n", new_lines[j]));
                j += 1;
            }
        } else if i < old_lines.len() {
            out.push_str(&format!("-{}\n", old_lines[i]));
            i += 1;
        } else if j < new_lines.len() {
            out.push_str(&format!("+{}\n", new_lines[j]));
            j += 1;
        } else {
            break;
        }
    }
    out
}
1573
1574pub fn save_run_record(run: &RunRecord, path: Option<&str>) -> Result<String, VmError> {
1575    let path = path
1576        .map(PathBuf::from)
1577        .unwrap_or_else(|| default_run_dir().join(format!("{}.json", run.id)));
1578    if let Some(parent) = path.parent() {
1579        std::fs::create_dir_all(parent)
1580            .map_err(|e| VmError::Runtime(format!("failed to create run directory: {e}")))?;
1581    }
1582    let json = serde_json::to_string_pretty(run)
1583        .map_err(|e| VmError::Runtime(format!("failed to encode run record: {e}")))?;
1584    std::fs::write(&path, json)
1585        .map_err(|e| VmError::Runtime(format!("failed to persist run record: {e}")))?;
1586    Ok(path.to_string_lossy().to_string())
1587}
1588
1589pub fn load_run_record(path: &Path) -> Result<RunRecord, VmError> {
1590    let content = std::fs::read_to_string(path)
1591        .map_err(|e| VmError::Runtime(format!("failed to read run record: {e}")))?;
1592    serde_json::from_str(&content)
1593        .map_err(|e| VmError::Runtime(format!("failed to parse run record: {e}")))
1594}
1595
1596pub fn replay_fixture_from_run(run: &RunRecord) -> ReplayFixture {
1597    ReplayFixture {
1598        type_name: "replay_fixture".to_string(),
1599        id: new_id("fixture"),
1600        source_run_id: run.id.clone(),
1601        workflow_id: run.workflow_id.clone(),
1602        workflow_name: run.workflow_name.clone(),
1603        created_at: now_rfc3339(),
1604        expected_status: run.status.clone(),
1605        stage_assertions: run
1606            .stages
1607            .iter()
1608            .map(|stage| ReplayStageAssertion {
1609                node_id: stage.node_id.clone(),
1610                expected_status: stage.status.clone(),
1611                expected_outcome: stage.outcome.clone(),
1612                expected_branch: stage.branch.clone(),
1613                required_artifact_kinds: stage
1614                    .artifacts
1615                    .iter()
1616                    .map(|artifact| artifact.kind.clone())
1617                    .collect(),
1618                visible_text_contains: stage
1619                    .visible_text
1620                    .as_ref()
1621                    .filter(|text| !text.is_empty())
1622                    .map(|text| text.chars().take(80).collect()),
1623            })
1624            .collect(),
1625    }
1626}
1627
/// Checks a run against a replay fixture, collecting every mismatch.
///
/// Verifies the overall run status, then for each fixture stage assertion:
/// the stage must exist (matched by `node_id`), and its status, outcome,
/// and branch must equal the expected values; every required artifact kind
/// must be present; and, when the fixture recorded a text snippet, the
/// stage's visible text must contain it. All mismatches are accumulated —
/// evaluation never short-circuits.
pub fn evaluate_run_against_fixture(run: &RunRecord, fixture: &ReplayFixture) -> ReplayEvalReport {
    let mut failures = Vec::new();
    if run.status != fixture.expected_status {
        failures.push(format!(
            "run status mismatch: expected {}, got {}",
            fixture.expected_status, run.status
        ));
    }
    for assertion in &fixture.stage_assertions {
        // A missing stage fails this assertion but evaluation continues.
        let Some(stage) = run
            .stages
            .iter()
            .find(|stage| stage.node_id == assertion.node_id)
        else {
            failures.push(format!("missing stage {}", assertion.node_id));
            continue;
        };
        if stage.status != assertion.expected_status {
            failures.push(format!(
                "stage {} status mismatch: expected {}, got {}",
                assertion.node_id, assertion.expected_status, stage.status
            ));
        }
        if stage.outcome != assertion.expected_outcome {
            failures.push(format!(
                "stage {} outcome mismatch: expected {}, got {}",
                assertion.node_id, assertion.expected_outcome, stage.outcome
            ));
        }
        if stage.branch != assertion.expected_branch {
            failures.push(format!(
                "stage {} branch mismatch: expected {:?}, got {:?}",
                assertion.node_id, assertion.expected_branch, stage.branch
            ));
        }
        // Every required artifact kind must appear among the stage's artifacts.
        for required_kind in &assertion.required_artifact_kinds {
            if !stage
                .artifacts
                .iter()
                .any(|artifact| &artifact.kind == required_kind)
            {
                failures.push(format!(
                    "stage {} missing artifact kind {}",
                    assertion.node_id, required_kind
                ));
            }
        }
        // Substring check against the stage's visible text, when recorded.
        if let Some(snippet) = &assertion.visible_text_contains {
            let actual = stage.visible_text.clone().unwrap_or_default();
            if !actual.contains(snippet) {
                failures.push(format!(
                    "stage {} visible text does not contain expected snippet {:?}",
                    assertion.node_id, snippet
                ));
            }
        }
    }

    ReplayEvalReport {
        pass: failures.is_empty(),
        failures,
        stage_count: run.stages.len(),
    }
}
1692
1693pub fn evaluate_run_suite(
1694    cases: Vec<(RunRecord, ReplayFixture, Option<String>)>,
1695) -> ReplayEvalSuiteReport {
1696    let mut reports = Vec::new();
1697    for (run, fixture, source_path) in cases {
1698        let report = evaluate_run_against_fixture(&run, &fixture);
1699        reports.push(ReplayEvalCaseReport {
1700            run_id: run.id.clone(),
1701            workflow_id: run.workflow_id.clone(),
1702            label: None,
1703            pass: report.pass,
1704            failures: report.failures,
1705            stage_count: report.stage_count,
1706            source_path,
1707            comparison: None,
1708        });
1709    }
1710    let total = reports.len();
1711    let passed = reports.iter().filter(|report| report.pass).count();
1712    let failed = total.saturating_sub(passed);
1713    ReplayEvalSuiteReport {
1714        pass: failed == 0,
1715        total,
1716        passed,
1717        failed,
1718        cases: reports,
1719    }
1720}
1721
1722pub fn diff_run_records(left: &RunRecord, right: &RunRecord) -> RunDiffReport {
1723    let mut stage_diffs = Vec::new();
1724    let mut all_node_ids = BTreeSet::new();
1725    all_node_ids.extend(left.stages.iter().map(|stage| stage.node_id.clone()));
1726    all_node_ids.extend(right.stages.iter().map(|stage| stage.node_id.clone()));
1727
1728    for node_id in all_node_ids {
1729        let left_stage = left.stages.iter().find(|stage| stage.node_id == node_id);
1730        let right_stage = right.stages.iter().find(|stage| stage.node_id == node_id);
1731        match (left_stage, right_stage) {
1732            (Some(_), None) => stage_diffs.push(RunStageDiffRecord {
1733                node_id,
1734                change: "removed".to_string(),
1735                details: vec!["stage missing from right run".to_string()],
1736            }),
1737            (None, Some(_)) => stage_diffs.push(RunStageDiffRecord {
1738                node_id,
1739                change: "added".to_string(),
1740                details: vec!["stage missing from left run".to_string()],
1741            }),
1742            (Some(left_stage), Some(right_stage)) => {
1743                let mut details = Vec::new();
1744                if left_stage.status != right_stage.status {
1745                    details.push(format!(
1746                        "status: {} -> {}",
1747                        left_stage.status, right_stage.status
1748                    ));
1749                }
1750                if left_stage.outcome != right_stage.outcome {
1751                    details.push(format!(
1752                        "outcome: {} -> {}",
1753                        left_stage.outcome, right_stage.outcome
1754                    ));
1755                }
1756                if left_stage.branch != right_stage.branch {
1757                    details.push(format!(
1758                        "branch: {:?} -> {:?}",
1759                        left_stage.branch, right_stage.branch
1760                    ));
1761                }
1762                if left_stage.produced_artifact_ids.len() != right_stage.produced_artifact_ids.len()
1763                {
1764                    details.push(format!(
1765                        "produced_artifacts: {} -> {}",
1766                        left_stage.produced_artifact_ids.len(),
1767                        right_stage.produced_artifact_ids.len()
1768                    ));
1769                }
1770                if left_stage.artifacts.len() != right_stage.artifacts.len() {
1771                    details.push(format!(
1772                        "artifact_records: {} -> {}",
1773                        left_stage.artifacts.len(),
1774                        right_stage.artifacts.len()
1775                    ));
1776                }
1777                if !details.is_empty() {
1778                    stage_diffs.push(RunStageDiffRecord {
1779                        node_id,
1780                        change: "changed".to_string(),
1781                        details,
1782                    });
1783                }
1784            }
1785            (None, None) => {}
1786        }
1787    }
1788
1789    let status_changed = left.status != right.status;
1790    let identical = !status_changed
1791        && stage_diffs.is_empty()
1792        && left.transitions.len() == right.transitions.len()
1793        && left.artifacts.len() == right.artifacts.len()
1794        && left.checkpoints.len() == right.checkpoints.len();
1795
1796    RunDiffReport {
1797        left_run_id: left.id.clone(),
1798        right_run_id: right.id.clone(),
1799        identical,
1800        status_changed,
1801        left_status: left.status.clone(),
1802        right_status: right.status.clone(),
1803        stage_diffs,
1804        transition_count_delta: right.transitions.len() as isize - left.transitions.len() as isize,
1805        artifact_count_delta: right.artifacts.len() as isize - left.artifacts.len() as isize,
1806        checkpoint_count_delta: right.checkpoints.len() as isize - left.checkpoints.len() as isize,
1807    }
1808}
1809
1810pub fn push_execution_policy(policy: CapabilityPolicy) {
1811    EXECUTION_POLICY_STACK.with(|stack| stack.borrow_mut().push(policy));
1812}
1813
1814pub fn pop_execution_policy() {
1815    EXECUTION_POLICY_STACK.with(|stack| {
1816        stack.borrow_mut().pop();
1817    });
1818}
1819
1820pub fn current_execution_policy() -> Option<CapabilityPolicy> {
1821    EXECUTION_POLICY_STACK.with(|stack| stack.borrow().last().cloned())
1822}
1823
1824fn policy_allows_tool(policy: &CapabilityPolicy, tool: &str) -> bool {
1825    policy.tools.is_empty() || policy.tools.iter().any(|allowed| allowed == tool)
1826}
1827
1828fn policy_allows_capability(policy: &CapabilityPolicy, capability: &str, op: &str) -> bool {
1829    policy.capabilities.is_empty()
1830        || policy
1831            .capabilities
1832            .get(capability)
1833            .is_some_and(|ops| ops.is_empty() || ops.iter().any(|allowed| allowed == op))
1834}
1835
1836fn policy_allows_side_effect(policy: &CapabilityPolicy, requested: &str) -> bool {
1837    fn rank(v: &str) -> usize {
1838        match v {
1839            "none" => 0,
1840            "read_only" => 1,
1841            "workspace_write" => 2,
1842            "process_exec" => 3,
1843            "network" => 4,
1844            _ => 5,
1845        }
1846    }
1847    policy
1848        .side_effect_level
1849        .as_ref()
1850        .map(|allowed| rank(allowed) >= rank(requested))
1851        .unwrap_or(true)
1852}
1853
1854fn reject_policy(reason: String) -> Result<(), VmError> {
1855    Err(VmError::CategorizedError {
1856        message: reason,
1857        category: crate::value::ErrorCategory::ToolRejected,
1858    })
1859}
1860
/// Enforce the active thread-local execution policy (if any) against a call
/// to the named VM builtin.
///
/// `args` is only inspected for `host_invoke`, whose first two arguments
/// name the capability and operation being invoked. Returns `Ok(())` when no
/// policy is active or the call fits within the policy ceiling; otherwise a
/// `ToolRejected` categorized error. Builtins not matched by any arm below
/// are allowed unconditionally.
pub fn enforce_current_policy_for_builtin(name: &str, args: &[VmValue]) -> Result<(), VmError> {
    // No active policy means unrestricted execution.
    let Some(policy) = current_execution_policy() else {
        return Ok(());
    };
    match name {
        // Text reads require the builtin's own tool surface plus the
        // workspace.read_text capability.
        "read" | "read_file" => {
            if !policy_allows_tool(&policy, name)
                || !policy_allows_capability(&policy, "workspace", "read_text")
            {
                return reject_policy(format!(
                    "builtin '{name}' exceeds workspace.read_text ceiling"
                ));
            }
        }
        // Search/enumeration maps to workspace.list.
        "search" | "list_dir" => {
            if !policy_allows_tool(&policy, name)
                || !policy_allows_capability(&policy, "workspace", "list")
            {
                return reject_policy(format!("builtin '{name}' exceeds workspace.list ceiling"));
            }
        }
        // Existence/metadata probes only check the capability, not a tool.
        "file_exists" | "stat" => {
            if !policy_allows_capability(&policy, "workspace", "exists") {
                return reject_policy(format!("builtin '{name}' exceeds workspace.exists ceiling"));
            }
        }
        // All mutating file builtins are gated behind the 'edit' tool,
        // workspace.write_text, and the workspace_write side-effect level.
        "edit" | "write_file" | "append_file" | "mkdir" | "copy_file" => {
            if !policy_allows_tool(&policy, "edit")
                || !policy_allows_capability(&policy, "workspace", "write_text")
                || !policy_allows_side_effect(&policy, "workspace_write")
            {
                return reject_policy(format!("builtin '{name}' exceeds workspace write ceiling"));
            }
        }
        // Deletion has its own capability (workspace.delete) but shares the
        // workspace_write side-effect level.
        "delete_file" => {
            if !policy_allows_capability(&policy, "workspace", "delete")
                || !policy_allows_side_effect(&policy, "workspace_write")
            {
                return reject_policy(
                    "builtin 'delete_file' exceeds workspace.delete ceiling".to_string(),
                );
            }
        }
        // Structured edits have a dedicated workspace.apply_edit capability.
        "apply_edit" => {
            if !policy_allows_capability(&policy, "workspace", "apply_edit")
                || !policy_allows_side_effect(&policy, "workspace_write")
            {
                return reject_policy(
                    "builtin 'apply_edit' exceeds workspace.apply_edit ceiling".to_string(),
                );
            }
        }
        // Process execution: 'run' tool + process.exec + process_exec level.
        "exec" | "exec_at" | "shell" | "shell_at" | "run_command" => {
            if !policy_allows_tool(&policy, "run")
                || !policy_allows_capability(&policy, "process", "exec")
                || !policy_allows_side_effect(&policy, "process_exec")
            {
                return reject_policy(format!("builtin '{name}' exceeds process.exec ceiling"));
            }
        }
        // HTTP builtins only check the network side-effect level.
        "http_get" | "http_post" | "http_put" | "http_patch" | "http_delete" | "http_request" => {
            if !policy_allows_side_effect(&policy, "network") {
                return reject_policy(format!("builtin '{name}' exceeds network ceiling"));
            }
        }
        // MCP builtins are treated like process execution (they spawn or
        // talk to external servers), so they share the process.exec gate
        // rather than getting a bypass route around the policy.
        "mcp_connect"
        | "mcp_call"
        | "mcp_list_tools"
        | "mcp_list_resources"
        | "mcp_list_resource_templates"
        | "mcp_read_resource"
        | "mcp_list_prompts"
        | "mcp_get_prompt"
        | "mcp_server_info"
        | "mcp_disconnect" => {
            if !policy_allows_tool(&policy, "run")
                || !policy_allows_capability(&policy, "process", "exec")
                || !policy_allows_side_effect(&policy, "process_exec")
            {
                return reject_policy(format!("builtin '{name}' exceeds process.exec ceiling"));
            }
        }
        // Generic host dispatch: the capability/op come from the call's own
        // arguments, and a conservative side-effect level is derived from
        // them (anything unrecognized is treated as read_only).
        "host_invoke" => {
            let capability = args.first().map(|v| v.display()).unwrap_or_default();
            let op = args.get(1).map(|v| v.display()).unwrap_or_default();
            if !policy_allows_capability(&policy, &capability, &op) {
                return reject_policy(format!(
                    "host_invoke {capability}.{op} exceeds capability ceiling"
                ));
            }
            let requested_side_effect = match (capability.as_str(), op.as_str()) {
                ("workspace", "write_text" | "apply_edit" | "delete") => "workspace_write",
                ("process", "exec") => "process_exec",
                _ => "read_only",
            };
            if !policy_allows_side_effect(&policy, requested_side_effect) {
                return reject_policy(format!(
                    "host_invoke {capability}.{op} exceeds side-effect ceiling"
                ));
            }
        }
        // Everything else is outside the policed surface and allowed.
        _ => {}
    }
    Ok(())
}
1966
1967pub fn enforce_current_policy_for_bridge_builtin(name: &str) -> Result<(), VmError> {
1968    if current_execution_policy().is_some() {
1969        return reject_policy(format!(
1970            "bridged builtin '{name}' exceeds execution policy; declare an explicit capability/tool surface instead"
1971        ));
1972    }
1973    Ok(())
1974}
1975
1976pub fn enforce_current_policy_for_tool(tool_name: &str) -> Result<(), VmError> {
1977    let Some(policy) = current_execution_policy() else {
1978        return Ok(());
1979    };
1980    if !policy_allows_tool(&policy, tool_name) {
1981        return reject_policy(format!("tool '{tool_name}' exceeds tool ceiling"));
1982    }
1983    Ok(())
1984}
1985
1986fn compact_transcript(transcript: &VmValue, keep_last: usize) -> Option<VmValue> {
1987    let dict = transcript.as_dict()?;
1988    let messages = match dict.get("messages") {
1989        Some(VmValue::List(list)) => list.iter().cloned().collect::<Vec<_>>(),
1990        _ => Vec::new(),
1991    };
1992    let retained = messages
1993        .into_iter()
1994        .rev()
1995        .take(keep_last)
1996        .collect::<Vec<_>>()
1997        .into_iter()
1998        .rev()
1999        .collect::<Vec<_>>();
2000    let mut compacted = dict.clone();
2001    compacted.insert(
2002        "messages".to_string(),
2003        VmValue::List(Rc::new(retained.clone())),
2004    );
2005    compacted.insert(
2006        "events".to_string(),
2007        VmValue::List(Rc::new(
2008            crate::llm::helpers::transcript_events_from_messages(&retained),
2009        )),
2010    );
2011    Some(VmValue::Dict(Rc::new(compacted)))
2012}
2013
2014fn redact_transcript_visibility(transcript: &VmValue, visibility: Option<&str>) -> Option<VmValue> {
2015    let Some(visibility) = visibility else {
2016        return Some(transcript.clone());
2017    };
2018    if visibility != "public" && visibility != "public_only" {
2019        return Some(transcript.clone());
2020    }
2021    let dict = transcript.as_dict()?;
2022    let public_messages = match dict.get("messages") {
2023        Some(VmValue::List(list)) => list
2024            .iter()
2025            .filter(|message| {
2026                message
2027                    .as_dict()
2028                    .and_then(|d| d.get("role"))
2029                    .map(|v| v.display())
2030                    .map(|role| role != "tool_result")
2031                    .unwrap_or(true)
2032            })
2033            .cloned()
2034            .collect::<Vec<_>>(),
2035        _ => Vec::new(),
2036    };
2037    let public_events = match dict.get("events") {
2038        Some(VmValue::List(list)) => list
2039            .iter()
2040            .filter(|event| {
2041                event
2042                    .as_dict()
2043                    .and_then(|d| d.get("visibility"))
2044                    .map(|v| v.display())
2045                    .map(|value| value == "public")
2046                    .unwrap_or(true)
2047            })
2048            .cloned()
2049            .collect::<Vec<_>>(),
2050        _ => Vec::new(),
2051    };
2052    let mut redacted = dict.clone();
2053    redacted.insert(
2054        "messages".to_string(),
2055        VmValue::List(Rc::new(public_messages)),
2056    );
2057    redacted.insert("events".to_string(), VmValue::List(Rc::new(public_events)));
2058    Some(VmValue::Dict(Rc::new(redacted)))
2059}
2060
2061pub(crate) fn apply_input_transcript_policy(
2062    transcript: Option<VmValue>,
2063    policy: &TranscriptPolicy,
2064) -> Option<VmValue> {
2065    let mut transcript = transcript;
2066    match policy.mode.as_deref() {
2067        Some("reset") => return None,
2068        Some("fork") => {
2069            if let Some(VmValue::Dict(dict)) = transcript.as_ref() {
2070                let mut forked = dict.as_ref().clone();
2071                forked.insert(
2072                    "id".to_string(),
2073                    VmValue::String(Rc::from(new_id("transcript"))),
2074                );
2075                transcript = Some(VmValue::Dict(Rc::new(forked)));
2076            }
2077        }
2078        _ => {}
2079    }
2080    if policy.compact {
2081        let keep_last = policy.keep_last.unwrap_or(6);
2082        transcript = transcript.and_then(|value| compact_transcript(&value, keep_last));
2083    }
2084    transcript
2085}
2086
2087fn apply_output_transcript_policy(
2088    transcript: Option<VmValue>,
2089    policy: &TranscriptPolicy,
2090) -> Option<VmValue> {
2091    let mut transcript = transcript;
2092    if policy.compact {
2093        let keep_last = policy.keep_last.unwrap_or(6);
2094        transcript = transcript.and_then(|value| compact_transcript(&value, keep_last));
2095    }
2096    transcript.and_then(|value| redact_transcript_visibility(&value, policy.visibility.as_deref()))
2097}
2098
/// Execute one workflow stage node end-to-end.
///
/// Selects input artifacts per the node's context policy, enforces the
/// node's input contract, assembles an LLM call (or agent loop when the
/// node is in agent mode or declares tools), and wraps the result into a
/// single output `ArtifactRecord`.
///
/// Returns `(raw llm result JSON, produced artifacts, outgoing transcript)`.
/// Errors with `VmError::Runtime` on contract violations, or propagates LLM
/// errors.
pub async fn execute_stage_node(
    node_id: &str,
    node: &WorkflowNode,
    task: &str,
    artifacts: &[ArtifactRecord],
    transcript: Option<VmValue>,
) -> Result<(serde_json::Value, Vec<ArtifactRecord>, Option<VmValue>), VmError> {
    // Artifact selection: the context policy drives it, but when it names no
    // kinds, fall back to the input contract's declared input kinds.
    let mut selection_policy = node.context_policy.clone();
    if selection_policy.include_kinds.is_empty() && !node.input_contract.input_kinds.is_empty() {
        selection_policy.include_kinds = node.input_contract.input_kinds.clone();
    }
    let selected = select_artifacts(artifacts.to_vec(), &selection_policy);
    let rendered_context = render_artifacts_context(&selected, &node.context_policy);
    // Incoming transcript handling (reset/fork/compact) happens before the
    // require_transcript check, so a "reset" mode can legitimately fail it.
    let transcript = apply_input_transcript_policy(transcript, &node.transcript_policy);
    if node.input_contract.require_transcript && transcript.is_none() {
        return Err(VmError::Runtime(format!(
            "workflow stage {node_id} requires transcript input"
        )));
    }
    // Input-contract cardinality checks run against the *selected* artifacts,
    // not the full pool handed in.
    if let Some(min_inputs) = node.input_contract.min_inputs {
        if selected.len() < min_inputs {
            return Err(VmError::Runtime(format!(
                "workflow stage {node_id} requires at least {min_inputs} input artifacts"
            )));
        }
    }
    if let Some(max_inputs) = node.input_contract.max_inputs {
        if selected.len() > max_inputs {
            return Err(VmError::Runtime(format!(
                "workflow stage {node_id} accepts at most {max_inputs} input artifacts"
            )));
        }
    }
    // Prompt: rendered artifact context (when any) followed by the task,
    // introduced by the node's task label (default "Task").
    let prompt = if rendered_context.is_empty() {
        task.to_string()
    } else {
        format!(
            "{rendered_context}\n\n{}:\n{task}",
            node.task_label
                .clone()
                .unwrap_or_else(|| "Task".to_string())
        )
    };

    // Build the LLM options dict from the node's model policy; only fields
    // the policy actually sets are forwarded.
    let mut options = BTreeMap::new();
    if let Some(provider) = &node.model_policy.provider {
        options.insert(
            "provider".to_string(),
            VmValue::String(Rc::from(provider.clone())),
        );
    }
    if let Some(model) = &node.model_policy.model {
        options.insert(
            "model".to_string(),
            VmValue::String(Rc::from(model.clone())),
        );
    }
    if let Some(model_tier) = &node.model_policy.model_tier {
        options.insert(
            "model_tier".to_string(),
            VmValue::String(Rc::from(model_tier.clone())),
        );
    }
    if let Some(temperature) = node.model_policy.temperature {
        options.insert("temperature".to_string(), VmValue::Float(temperature));
    }
    if let Some(max_tokens) = node.model_policy.max_tokens {
        options.insert("max_tokens".to_string(), VmValue::Int(max_tokens));
    }
    if !node.tools.is_empty() {
        options.insert(
            "tools".to_string(),
            VmValue::List(Rc::new(
                node.tools
                    .iter()
                    .map(|tool| VmValue::String(Rc::from(tool.clone())))
                    .collect(),
            )),
        );
    }
    if let Some(transcript) = transcript.clone() {
        options.insert("transcript".to_string(), transcript);
    }

    // Positional args mirror the VM-level llm call: (prompt, system, options).
    let args = vec![
        VmValue::String(Rc::from(prompt)),
        node.system
            .clone()
            .map(|s| VmValue::String(Rc::from(s)))
            .unwrap_or(VmValue::Nil),
        VmValue::Dict(Rc::new(options)),
    ];
    let mut opts = extract_llm_options(&args)?;

    // Agent mode (explicit, or implied by declared tools) runs the full
    // agent loop; otherwise a single LLM call is adapted into the same
    // result shape.
    let llm_result = if node.mode.as_deref() == Some("agent") || !node.tools.is_empty() {
        crate::llm::run_agent_loop_internal(
            &mut opts,
            crate::llm::AgentLoopConfig {
                persistent: true,
                max_iterations: 12,
                max_nudges: 3,
                nudge: None,
                tool_retries: 0,
                tool_backoff_ms: 1000,
                tool_format: "text".to_string(),
                auto_compact: None,
                policy: None,
                daemon: false,
            },
        )
        .await?
    } else {
        let result = vm_call_llm_full(&opts).await?;
        crate::llm::agent_loop_result_from_llm(&result, opts)
    };

    let visible_text = llm_result["text"].as_str().unwrap_or_default().to_string();
    // Outgoing transcript comes from the LLM result (not the input one) and
    // passes through the node's output-side transcript policy.
    let transcript = llm_result
        .get("transcript")
        .cloned()
        .map(|value| crate::stdlib::json_to_vm_value(&value));
    let transcript = apply_output_transcript_policy(transcript, &node.transcript_policy);
    // Output kind: first declared output kind, else a default keyed on node
    // kind ("verify" nodes emit verification_result).
    let output_kind = node
        .output_contract
        .output_kinds
        .first()
        .cloned()
        .unwrap_or_else(|| {
            if node.kind == "verify" {
                "verification_result".to_string()
            } else {
                "artifact".to_string()
            }
        });
    let mut metadata = BTreeMap::new();
    metadata.insert(
        "input_artifact_ids".to_string(),
        serde_json::json!(selected
            .iter()
            .map(|artifact| artifact.id.clone())
            .collect::<Vec<_>>()),
    );
    metadata.insert("node_kind".to_string(), serde_json::json!(node.kind));
    // Wrap the stage output into a single artifact whose lineage records the
    // consumed input artifact ids; `data` keeps the full raw LLM result.
    let artifact = ArtifactRecord {
        type_name: "artifact".to_string(),
        id: new_id("artifact"),
        kind: output_kind,
        title: Some(format!("stage {node_id} output")),
        text: Some(visible_text),
        data: Some(llm_result.clone()),
        source: Some(node_id.to_string()),
        created_at: now_rfc3339(),
        freshness: Some("fresh".to_string()),
        priority: None,
        lineage: selected
            .iter()
            .map(|artifact| artifact.id.clone())
            .collect(),
        relevance: Some(1.0),
        estimated_tokens: None,
        stage: Some(node_id.to_string()),
        metadata,
    }
    .normalize();

    Ok((llm_result, vec![artifact], transcript))
}
2266
2267pub fn next_nodes_for(
2268    graph: &WorkflowGraph,
2269    current: &str,
2270    branch: Option<&str>,
2271) -> Vec<WorkflowEdge> {
2272    let mut matching: Vec<WorkflowEdge> = graph
2273        .edges
2274        .iter()
2275        .filter(|edge| edge.from == current && edge.branch.as_deref() == branch)
2276        .cloned()
2277        .collect();
2278    if matching.is_empty() {
2279        matching = graph
2280            .edges
2281            .iter()
2282            .filter(|edge| edge.from == current && edge.branch.is_none())
2283            .cloned()
2284            .collect();
2285    }
2286    matching
2287}
2288
2289pub fn next_node_for(graph: &WorkflowGraph, current: &str, branch: &str) -> Option<String> {
2290    next_nodes_for(graph, current, Some(branch))
2291        .into_iter()
2292        .next()
2293        .map(|edge| edge.to)
2294}
2295
2296pub fn append_audit_entry(
2297    graph: &mut WorkflowGraph,
2298    op: &str,
2299    node_id: Option<String>,
2300    reason: Option<String>,
2301    metadata: BTreeMap<String, serde_json::Value>,
2302) {
2303    graph.audit_log.push(WorkflowAuditEntry {
2304        id: new_id("audit"),
2305        op: op.to_string(),
2306        node_id,
2307        timestamp: now_rfc3339(),
2308        reason,
2309        metadata,
2310    });
2311}
2312
2313pub fn builtin_ceiling() -> CapabilityPolicy {
2314    CapabilityPolicy {
2315        tools: vec![
2316            "read".to_string(),
2317            "read_file".to_string(),
2318            "search".to_string(),
2319            "edit".to_string(),
2320            "run".to_string(),
2321            "exec".to_string(),
2322            "outline".to_string(),
2323            "list_directory".to_string(),
2324            "lsp_hover".to_string(),
2325            "lsp_definition".to_string(),
2326            "lsp_references".to_string(),
2327            "web_search".to_string(),
2328            "web_fetch".to_string(),
2329        ],
2330        capabilities: BTreeMap::from([
2331            (
2332                "workspace".to_string(),
2333                vec![
2334                    "read_text".to_string(),
2335                    "write_text".to_string(),
2336                    "apply_edit".to_string(),
2337                    "delete".to_string(),
2338                    "exists".to_string(),
2339                    "list".to_string(),
2340                ],
2341            ),
2342            ("process".to_string(), vec!["exec".to_string()]),
2343        ]),
2344        workspace_roots: Vec::new(),
2345        side_effect_level: Some("network".to_string()),
2346        recursion_limit: Some(8),
2347        tool_arg_constraints: Vec::new(),
2348    }
2349}
2350
2351#[cfg(test)]
2352mod tests {
2353    use super::*;
2354
2355    #[test]
2356    fn capability_intersection_rejects_privilege_expansion() {
2357        let ceiling = CapabilityPolicy {
2358            tools: vec!["read".to_string()],
2359            side_effect_level: Some("read_only".to_string()),
2360            recursion_limit: Some(2),
2361            ..Default::default()
2362        };
2363        let requested = CapabilityPolicy {
2364            tools: vec!["read".to_string(), "edit".to_string()],
2365            ..Default::default()
2366        };
2367        let error = ceiling.intersect(&requested).unwrap_err();
2368        assert!(error.contains("host ceiling"));
2369    }
2370
2371    #[test]
2372    fn active_execution_policy_rejects_unknown_bridge_builtin() {
2373        push_execution_policy(CapabilityPolicy {
2374            tools: vec!["read".to_string()],
2375            capabilities: BTreeMap::from([(
2376                "workspace".to_string(),
2377                vec!["read_text".to_string()],
2378            )]),
2379            side_effect_level: Some("read_only".to_string()),
2380            recursion_limit: Some(1),
2381            ..Default::default()
2382        });
2383        let error = enforce_current_policy_for_bridge_builtin("custom_host_builtin").unwrap_err();
2384        pop_execution_policy();
2385        assert!(matches!(
2386            error,
2387            VmError::CategorizedError {
2388                category: crate::value::ErrorCategory::ToolRejected,
2389                ..
2390            }
2391        ));
2392    }
2393
2394    #[test]
2395    fn active_execution_policy_rejects_mcp_escape_hatch() {
2396        push_execution_policy(CapabilityPolicy {
2397            tools: vec!["read".to_string()],
2398            capabilities: BTreeMap::from([(
2399                "workspace".to_string(),
2400                vec!["read_text".to_string()],
2401            )]),
2402            side_effect_level: Some("read_only".to_string()),
2403            recursion_limit: Some(1),
2404            ..Default::default()
2405        });
2406        let error = enforce_current_policy_for_builtin("mcp_connect", &[]).unwrap_err();
2407        pop_execution_policy();
2408        assert!(matches!(
2409            error,
2410            VmError::CategorizedError {
2411                category: crate::value::ErrorCategory::ToolRejected,
2412                ..
2413            }
2414        ));
2415    }
2416
2417    #[test]
2418    fn workflow_normalization_upgrades_legacy_act_verify_repair_shape() {
2419        let value = crate::stdlib::json_to_vm_value(&serde_json::json!({
2420            "name": "legacy",
2421            "act": {"mode": "llm"},
2422            "verify": {"kind": "verify"},
2423            "repair": {"mode": "agent"},
2424        }));
2425        let graph = normalize_workflow_value(&value).unwrap();
2426        assert_eq!(graph.type_name, "workflow_graph");
2427        assert!(graph.nodes.contains_key("act"));
2428        assert!(graph.nodes.contains_key("verify"));
2429        assert!(graph.nodes.contains_key("repair"));
2430        assert_eq!(graph.entry, "act");
2431    }
2432
2433    #[test]
2434    fn artifact_selection_honors_budget_and_priority() {
2435        let policy = ContextPolicy {
2436            max_artifacts: Some(2),
2437            max_tokens: Some(30),
2438            prefer_recent: true,
2439            prefer_fresh: true,
2440            prioritize_kinds: vec!["verification_result".to_string()],
2441            ..Default::default()
2442        };
2443        let artifacts = vec![
2444            ArtifactRecord {
2445                type_name: "artifact".to_string(),
2446                id: "a".to_string(),
2447                kind: "summary".to_string(),
2448                text: Some("short".to_string()),
2449                relevance: Some(0.9),
2450                created_at: now_rfc3339(),
2451                ..Default::default()
2452            }
2453            .normalize(),
2454            ArtifactRecord {
2455                type_name: "artifact".to_string(),
2456                id: "b".to_string(),
2457                kind: "summary".to_string(),
2458                text: Some("this is a much larger artifact body".to_string()),
2459                relevance: Some(1.0),
2460                created_at: now_rfc3339(),
2461                ..Default::default()
2462            }
2463            .normalize(),
2464            ArtifactRecord {
2465                type_name: "artifact".to_string(),
2466                id: "c".to_string(),
2467                kind: "summary".to_string(),
2468                text: Some("tiny".to_string()),
2469                relevance: Some(0.5),
2470                created_at: now_rfc3339(),
2471                ..Default::default()
2472            }
2473            .normalize(),
2474        ];
2475        let selected = select_artifacts(artifacts, &policy);
2476        assert_eq!(selected.len(), 2);
2477        assert!(selected.iter().all(|artifact| artifact.kind == "summary"));
2478    }
2479
2480    #[test]
2481    fn workflow_validation_rejects_condition_without_true_false_edges() {
2482        let graph = WorkflowGraph {
2483            entry: "gate".to_string(),
2484            nodes: BTreeMap::from([(
2485                "gate".to_string(),
2486                WorkflowNode {
2487                    id: Some("gate".to_string()),
2488                    kind: "condition".to_string(),
2489                    ..Default::default()
2490                },
2491            )]),
2492            edges: vec![WorkflowEdge {
2493                from: "gate".to_string(),
2494                to: "next".to_string(),
2495                branch: Some("true".to_string()),
2496                label: None,
2497            }],
2498            ..Default::default()
2499        };
2500        let report = validate_workflow(&graph, None);
2501        assert!(!report.valid);
2502        assert!(report
2503            .errors
2504            .iter()
2505            .any(|error| error.contains("true") && error.contains("false")));
2506    }
2507
2508    #[test]
2509    fn replay_fixture_round_trip_passes() {
2510        let run = RunRecord {
2511            type_name: "run_record".to_string(),
2512            id: "run_1".to_string(),
2513            workflow_id: "wf".to_string(),
2514            workflow_name: Some("demo".to_string()),
2515            task: "demo".to_string(),
2516            status: "completed".to_string(),
2517            started_at: "1".to_string(),
2518            finished_at: Some("2".to_string()),
2519            parent_run_id: None,
2520            root_run_id: Some("run_1".to_string()),
2521            stages: vec![RunStageRecord {
2522                id: "stage_1".to_string(),
2523                node_id: "act".to_string(),
2524                kind: "stage".to_string(),
2525                status: "completed".to_string(),
2526                outcome: "success".to_string(),
2527                branch: Some("success".to_string()),
2528                started_at: "1".to_string(),
2529                finished_at: Some("2".to_string()),
2530                visible_text: Some("done".to_string()),
2531                private_reasoning: None,
2532                transcript: None,
2533                verification: None,
2534                artifacts: vec![ArtifactRecord {
2535                    type_name: "artifact".to_string(),
2536                    id: "a1".to_string(),
2537                    kind: "summary".to_string(),
2538                    text: Some("done".to_string()),
2539                    created_at: "1".to_string(),
2540                    ..Default::default()
2541                }
2542                .normalize()],
2543                consumed_artifact_ids: vec![],
2544                produced_artifact_ids: vec!["a1".to_string()],
2545                attempts: vec![],
2546                metadata: BTreeMap::new(),
2547            }],
2548            transitions: vec![],
2549            checkpoints: vec![],
2550            pending_nodes: vec![],
2551            completed_nodes: vec!["act".to_string()],
2552            child_runs: vec![],
2553            artifacts: vec![],
2554            policy: CapabilityPolicy::default(),
2555            execution: None,
2556            transcript: None,
2557            replay_fixture: None,
2558            metadata: BTreeMap::new(),
2559            persisted_path: None,
2560        };
2561        let fixture = replay_fixture_from_run(&run);
2562        let report = evaluate_run_against_fixture(&run, &fixture);
2563        assert!(report.pass);
2564        assert!(report.failures.is_empty());
2565    }
2566
2567    #[test]
2568    fn replay_eval_suite_reports_failed_case() {
2569        let good = RunRecord {
2570            id: "run_good".to_string(),
2571            workflow_id: "wf".to_string(),
2572            status: "completed".to_string(),
2573            stages: vec![RunStageRecord {
2574                node_id: "act".to_string(),
2575                status: "completed".to_string(),
2576                outcome: "success".to_string(),
2577                ..Default::default()
2578            }],
2579            ..Default::default()
2580        };
2581        let bad = RunRecord {
2582            id: "run_bad".to_string(),
2583            workflow_id: "wf".to_string(),
2584            status: "failed".to_string(),
2585            stages: vec![RunStageRecord {
2586                node_id: "act".to_string(),
2587                status: "failed".to_string(),
2588                outcome: "error".to_string(),
2589                ..Default::default()
2590            }],
2591            ..Default::default()
2592        };
2593        let suite = evaluate_run_suite(vec![
2594            (
2595                good.clone(),
2596                replay_fixture_from_run(&good),
2597                Some("good.json".to_string()),
2598            ),
2599            (
2600                bad.clone(),
2601                replay_fixture_from_run(&good),
2602                Some("bad.json".to_string()),
2603            ),
2604        ]);
2605        assert!(!suite.pass);
2606        assert_eq!(suite.total, 2);
2607        assert_eq!(suite.failed, 1);
2608        assert!(suite.cases.iter().any(|case| !case.pass));
2609    }
2610
2611    #[test]
2612    fn run_diff_reports_changed_stage() {
2613        let left = RunRecord {
2614            id: "left".to_string(),
2615            workflow_id: "wf".to_string(),
2616            status: "completed".to_string(),
2617            stages: vec![RunStageRecord {
2618                node_id: "act".to_string(),
2619                status: "completed".to_string(),
2620                outcome: "success".to_string(),
2621                ..Default::default()
2622            }],
2623            ..Default::default()
2624        };
2625        let right = RunRecord {
2626            id: "right".to_string(),
2627            workflow_id: "wf".to_string(),
2628            status: "failed".to_string(),
2629            stages: vec![RunStageRecord {
2630                node_id: "act".to_string(),
2631                status: "failed".to_string(),
2632                outcome: "error".to_string(),
2633                ..Default::default()
2634            }],
2635            ..Default::default()
2636        };
2637        let diff = diff_run_records(&left, &right);
2638        assert!(diff.status_changed);
2639        assert!(!diff.identical);
2640        assert_eq!(diff.stage_diffs.len(), 1);
2641    }
2642
2643    #[test]
2644    fn eval_suite_manifest_can_fail_on_baseline_diff() {
2645        let temp_dir =
2646            std::env::temp_dir().join(format!("harn-eval-suite-{}", uuid::Uuid::now_v7()));
2647        std::fs::create_dir_all(&temp_dir).unwrap();
2648        let baseline_path = temp_dir.join("baseline.json");
2649        let candidate_path = temp_dir.join("candidate.json");
2650
2651        let baseline = RunRecord {
2652            id: "baseline".to_string(),
2653            workflow_id: "wf".to_string(),
2654            status: "completed".to_string(),
2655            stages: vec![RunStageRecord {
2656                node_id: "act".to_string(),
2657                status: "completed".to_string(),
2658                outcome: "success".to_string(),
2659                ..Default::default()
2660            }],
2661            ..Default::default()
2662        };
2663        let candidate = RunRecord {
2664            id: "candidate".to_string(),
2665            workflow_id: "wf".to_string(),
2666            status: "failed".to_string(),
2667            stages: vec![RunStageRecord {
2668                node_id: "act".to_string(),
2669                status: "failed".to_string(),
2670                outcome: "error".to_string(),
2671                ..Default::default()
2672            }],
2673            ..Default::default()
2674        };
2675
2676        save_run_record(&baseline, Some(baseline_path.to_str().unwrap())).unwrap();
2677        save_run_record(&candidate, Some(candidate_path.to_str().unwrap())).unwrap();
2678
2679        let manifest = EvalSuiteManifest {
2680            base_dir: Some(temp_dir.display().to_string()),
2681            cases: vec![EvalSuiteCase {
2682                label: Some("candidate".to_string()),
2683                run_path: "candidate.json".to_string(),
2684                fixture_path: None,
2685                compare_to: Some("baseline.json".to_string()),
2686            }],
2687            ..Default::default()
2688        };
2689        let suite = evaluate_run_suite_manifest(&manifest).unwrap();
2690        assert!(!suite.pass);
2691        assert_eq!(suite.failed, 1);
2692        assert!(suite.cases[0].comparison.is_some());
2693        assert!(suite.cases[0]
2694            .failures
2695            .iter()
2696            .any(|failure| failure.contains("baseline")));
2697    }
2698
2699    #[test]
2700    fn render_unified_diff_marks_removed_and_added_lines() {
2701        let diff = render_unified_diff(Some("src/main.rs"), "old\nsame", "new\nsame");
2702        assert!(diff.contains("--- a/src/main.rs"));
2703        assert!(diff.contains("+++ b/src/main.rs"));
2704        assert!(diff.contains("-old"));
2705        assert!(diff.contains("+new"));
2706        assert!(diff.contains(" same"));
2707    }
2708
2709    #[test]
2710    fn execution_policy_rejects_process_exec_when_read_only() {
2711        push_execution_policy(CapabilityPolicy {
2712            side_effect_level: Some("read_only".to_string()),
2713            capabilities: BTreeMap::from([("process".to_string(), vec!["exec".to_string()])]),
2714            ..Default::default()
2715        });
2716        let result = enforce_current_policy_for_builtin("exec", &[]);
2717        pop_execution_policy();
2718        assert!(result.is_err());
2719    }
2720
2721    #[test]
2722    fn execution_policy_rejects_unlisted_tool() {
2723        push_execution_policy(CapabilityPolicy {
2724            tools: vec!["read".to_string()],
2725            ..Default::default()
2726        });
2727        let result = enforce_current_policy_for_tool("edit");
2728        pop_execution_policy();
2729        assert!(result.is_err());
2730    }
2731
2732    // ── Tool hook tests ──────────────────────────────────────────────
2733
2734    #[test]
2735    fn pre_tool_hook_deny_blocks_execution() {
2736        clear_tool_hooks();
2737        register_tool_hook(ToolHook {
2738            pattern: "dangerous_*".to_string(),
2739            pre: Some(Rc::new(|_name, _args| {
2740                PreToolAction::Deny("blocked by policy".to_string())
2741            })),
2742            post: None,
2743        });
2744        let result = run_pre_tool_hooks("dangerous_delete", &serde_json::json!({}));
2745        clear_tool_hooks();
2746        assert!(matches!(result, PreToolAction::Deny(_)));
2747    }
2748
2749    #[test]
2750    fn pre_tool_hook_allow_passes_through() {
2751        clear_tool_hooks();
2752        register_tool_hook(ToolHook {
2753            pattern: "safe_*".to_string(),
2754            pre: Some(Rc::new(|_name, _args| PreToolAction::Allow)),
2755            post: None,
2756        });
2757        let result = run_pre_tool_hooks("safe_read", &serde_json::json!({}));
2758        clear_tool_hooks();
2759        assert!(matches!(result, PreToolAction::Allow));
2760    }
2761
2762    #[test]
2763    fn pre_tool_hook_modify_rewrites_args() {
2764        clear_tool_hooks();
2765        register_tool_hook(ToolHook {
2766            pattern: "*".to_string(),
2767            pre: Some(Rc::new(|_name, _args| {
2768                PreToolAction::Modify(serde_json::json!({"path": "/sanitized"}))
2769            })),
2770            post: None,
2771        });
2772        let result = run_pre_tool_hooks("read_file", &serde_json::json!({"path": "/etc/passwd"}));
2773        clear_tool_hooks();
2774        match result {
2775            PreToolAction::Modify(args) => assert_eq!(args["path"], "/sanitized"),
2776            _ => panic!("expected Modify"),
2777        }
2778    }
2779
2780    #[test]
2781    fn post_tool_hook_modifies_result() {
2782        clear_tool_hooks();
2783        register_tool_hook(ToolHook {
2784            pattern: "exec".to_string(),
2785            pre: None,
2786            post: Some(Rc::new(|_name, result| {
2787                if result.contains("SECRET") {
2788                    PostToolAction::Modify("[REDACTED]".to_string())
2789                } else {
2790                    PostToolAction::Pass
2791                }
2792            })),
2793        });
2794        let result = run_post_tool_hooks("exec", "output with SECRET data");
2795        let clean = run_post_tool_hooks("exec", "clean output");
2796        clear_tool_hooks();
2797        assert_eq!(result, "[REDACTED]");
2798        assert_eq!(clean, "clean output");
2799    }
2800
2801    #[test]
2802    fn unmatched_hook_pattern_does_not_fire() {
2803        clear_tool_hooks();
2804        register_tool_hook(ToolHook {
2805            pattern: "exec".to_string(),
2806            pre: Some(Rc::new(|_name, _args| {
2807                PreToolAction::Deny("should not match".to_string())
2808            })),
2809            post: None,
2810        });
2811        let result = run_pre_tool_hooks("read_file", &serde_json::json!({}));
2812        clear_tool_hooks();
2813        assert!(matches!(result, PreToolAction::Allow));
2814    }
2815
2816    #[test]
2817    fn glob_match_patterns() {
2818        assert!(glob_match("*", "anything"));
2819        assert!(glob_match("exec*", "exec_at"));
2820        assert!(glob_match("*_file", "read_file"));
2821        assert!(!glob_match("exec*", "read_file"));
2822        assert!(glob_match("read_file", "read_file"));
2823        assert!(!glob_match("read_file", "write_file"));
2824    }
2825
2826    // ── Auto-compaction tests ────────────────────────────────────────
2827
2828    #[test]
2829    fn microcompact_snips_large_output() {
2830        let large = "x".repeat(50_000);
2831        let result = microcompact_tool_output(&large, 10_000);
2832        assert!(result.len() < 15_000);
2833        assert!(result.contains("snipped"));
2834    }
2835
2836    #[test]
2837    fn microcompact_preserves_small_output() {
2838        let small = "hello world";
2839        let result = microcompact_tool_output(small, 10_000);
2840        assert_eq!(result, small);
2841    }
2842
2843    #[test]
2844    fn auto_compact_messages_reduces_count() {
2845        let mut messages: Vec<serde_json::Value> = (0..20)
2846            .map(|i| serde_json::json!({"role": "user", "content": format!("message {i}")}))
2847            .collect();
2848        let compacted = auto_compact_messages(&mut messages, 6);
2849        assert!(compacted);
2850        assert!(messages.len() <= 7); // 6 kept + 1 summary
2851        assert!(messages[0]["content"]
2852            .as_str()
2853            .unwrap()
2854            .contains("auto-compacted"));
2855    }
2856
2857    #[test]
2858    fn auto_compact_noop_when_under_threshold() {
2859        let mut messages: Vec<serde_json::Value> = (0..4)
2860            .map(|i| serde_json::json!({"role": "user", "content": format!("msg {i}")}))
2861            .collect();
2862        let compacted = auto_compact_messages(&mut messages, 6);
2863        assert!(!compacted);
2864        assert_eq!(messages.len(), 4);
2865    }
2866
2867    #[test]
2868    fn estimate_message_tokens_basic() {
2869        let messages = vec![
2870            serde_json::json!({"role": "user", "content": "a".repeat(400)}),
2871            serde_json::json!({"role": "assistant", "content": "b".repeat(400)}),
2872        ];
2873        let tokens = estimate_message_tokens(&messages);
2874        assert_eq!(tokens, 200); // 800 chars / 4
2875    }
2876
2877    // ── Artifact dedup and microcompaction tests ─────────────────────
2878
2879    #[test]
2880    fn dedup_artifacts_removes_duplicates() {
2881        let mut artifacts = vec![
2882            ArtifactRecord {
2883                id: "a1".to_string(),
2884                kind: "test".to_string(),
2885                text: Some("duplicate content".to_string()),
2886                ..Default::default()
2887            },
2888            ArtifactRecord {
2889                id: "a2".to_string(),
2890                kind: "test".to_string(),
2891                text: Some("duplicate content".to_string()),
2892                ..Default::default()
2893            },
2894            ArtifactRecord {
2895                id: "a3".to_string(),
2896                kind: "test".to_string(),
2897                text: Some("unique content".to_string()),
2898                ..Default::default()
2899            },
2900        ];
2901        dedup_artifacts(&mut artifacts);
2902        assert_eq!(artifacts.len(), 2);
2903    }
2904
2905    #[test]
2906    fn microcompact_artifact_snips_oversized() {
2907        let mut artifact = ArtifactRecord {
2908            id: "a1".to_string(),
2909            kind: "test".to_string(),
2910            text: Some("x".repeat(10_000)),
2911            estimated_tokens: Some(2_500),
2912            ..Default::default()
2913        };
2914        microcompact_artifact(&mut artifact, 500);
2915        assert!(artifact.text.as_ref().unwrap().len() < 5_000);
2916        assert_eq!(artifact.estimated_tokens, Some(500));
2917    }
2918
2919    // ── Tool argument constraint tests ───────────────────────────────
2920
2921    #[test]
2922    fn arg_constraint_allows_matching_pattern() {
2923        let policy = CapabilityPolicy {
2924            tool_arg_constraints: vec![ToolArgConstraint {
2925                tool: "exec".to_string(),
2926                arg_patterns: vec!["cargo *".to_string()],
2927            }],
2928            ..Default::default()
2929        };
2930        let result = enforce_tool_arg_constraints(
2931            &policy,
2932            "exec",
2933            &serde_json::json!({"command": "cargo test"}),
2934        );
2935        assert!(result.is_ok());
2936    }
2937
2938    #[test]
2939    fn arg_constraint_rejects_non_matching_pattern() {
2940        let policy = CapabilityPolicy {
2941            tool_arg_constraints: vec![ToolArgConstraint {
2942                tool: "exec".to_string(),
2943                arg_patterns: vec!["cargo *".to_string()],
2944            }],
2945            ..Default::default()
2946        };
2947        let result = enforce_tool_arg_constraints(
2948            &policy,
2949            "exec",
2950            &serde_json::json!({"command": "rm -rf /"}),
2951        );
2952        assert!(result.is_err());
2953    }
2954
2955    #[test]
2956    fn arg_constraint_ignores_unmatched_tool() {
2957        let policy = CapabilityPolicy {
2958            tool_arg_constraints: vec![ToolArgConstraint {
2959                tool: "exec".to_string(),
2960                arg_patterns: vec!["cargo *".to_string()],
2961            }],
2962            ..Default::default()
2963        };
2964        let result = enforce_tool_arg_constraints(
2965            &policy,
2966            "read_file",
2967            &serde_json::json!({"path": "/etc/passwd"}),
2968        );
2969        assert!(result.is_ok());
2970    }
2971}