Skip to main content

harn_vm/
orchestration.rs

1use std::collections::{BTreeMap, BTreeSet};
2use std::path::{Path, PathBuf};
3use std::rc::Rc;
4use std::{cell::RefCell, thread_local};
5
6use serde::{Deserialize, Serialize};
7
8use crate::llm::{extract_llm_options, vm_call_llm_full, vm_value_to_json};
9use crate::value::{VmError, VmValue};
10
/// Current wall-clock time as an RFC 3339 / ISO-8601 UTC timestamp,
/// e.g. `2024-05-01T12:34:56Z`.
///
/// Fix: the previous implementation returned raw Unix seconds as a
/// decimal string despite the name; this renders a proper RFC 3339
/// string using only the standard library. Clock-before-epoch falls
/// back to the epoch itself (`unwrap_or_default`).
fn now_rfc3339() -> String {
    use std::time::{SystemTime, UNIX_EPOCH};
    let secs = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .unwrap_or_default()
        .as_secs();
    format_rfc3339_utc(secs)
}

/// Render Unix seconds as `YYYY-MM-DDTHH:MM:SSZ` (UTC, ignoring leap
/// seconds, as RFC 3339 timestamps conventionally do).
fn format_rfc3339_utc(unix_secs: u64) -> String {
    let days = unix_secs / 86_400;
    let rem = unix_secs % 86_400;
    let (hour, minute, second) = (rem / 3600, (rem % 3600) / 60, rem % 60);
    // Civil-from-days (Howard Hinnant's public-domain algorithm),
    // valid for all post-1970 dates we can encounter here.
    let z = days as i64 + 719_468;
    let era = z / 146_097;
    let doe = z - era * 146_097; // day-of-era [0, 146096]
    let yoe = (doe - doe / 1460 + doe / 36_524 - doe / 146_096) / 365; // year-of-era [0, 399]
    let year = yoe + era * 400;
    let doy = doe - (365 * yoe + yoe / 4 - yoe / 100); // day-of-year from Mar 1 [0, 365]
    let mp = (5 * doy + 2) / 153; // month index from March [0, 11]
    let day = doy - (153 * mp + 2) / 5 + 1; // [1, 31]
    let month = if mp < 10 { mp + 3 } else { mp - 9 }; // [1, 12]
    let year = if month <= 2 { year + 1 } else { year };
    format!("{year:04}-{month:02}-{day:02}T{hour:02}:{minute:02}:{second:02}Z")
}
19
20fn new_id(prefix: &str) -> String {
21    format!("{prefix}_{}", uuid::Uuid::now_v7())
22}
23
/// Directory for run artifacts: `$HARN_RUN_DIR` when set, otherwise the
/// `.harn-runs` directory relative to the current working directory.
fn default_run_dir() -> PathBuf {
    match std::env::var("HARN_RUN_DIR") {
        Ok(dir) => PathBuf::from(dir),
        Err(_) => PathBuf::from(".harn-runs"),
    }
}
29
// Per-thread orchestration state. NOTE(review): these are thread-locals,
// not process globals — policies/hooks installed on one thread are
// invisible to others; confirm this matches the VM's threading model.
thread_local! {
    /// Stack of capability policies pushed by nested execution scopes.
    static EXECUTION_POLICY_STACK: RefCell<Vec<CapabilityPolicy>> = const { RefCell::new(Vec::new()) };
    /// Registered tool lifecycle hooks, run in registration order.
    static TOOL_HOOKS: RefCell<Vec<ToolHook>> = const { RefCell::new(Vec::new()) };
    /// Mutation session currently installed for this thread, if any.
    static CURRENT_MUTATION_SESSION: RefCell<Option<MutationSessionRecord>> = const { RefCell::new(None) };
}
35
/// Record describing the mutation session an execution runs under.
/// `#[serde(default)]`: fields missing from serialized input deserialize
/// to empty/`None` and are later filled in by [`MutationSessionRecord::normalize`].
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct MutationSessionRecord {
    /// Unique session id; `normalize` generates one when empty.
    pub session_id: String,
    /// Id of an enclosing session, if any — presumably the spawning
    /// session; confirm at call sites.
    pub parent_session_id: Option<String>,
    /// Associated run identifier, if the session belongs to a run.
    pub run_id: Option<String>,
    /// Worker identifier, when executed by a worker.
    pub worker_id: Option<String>,
    /// Kind label for the execution; semantics defined by the host.
    pub execution_kind: Option<String>,
    /// Mutation scope; `normalize` defaults this to `"read_only"`.
    pub mutation_scope: String,
    /// Approval mode; `normalize` defaults this to `"host_enforced"`.
    pub approval_mode: String,
}
47
48impl MutationSessionRecord {
49    pub fn normalize(mut self) -> Self {
50        if self.session_id.is_empty() {
51            self.session_id = new_id("session");
52        }
53        if self.mutation_scope.is_empty() {
54            self.mutation_scope = "read_only".to_string();
55        }
56        if self.approval_mode.is_empty() {
57            self.approval_mode = "host_enforced".to_string();
58        }
59        self
60    }
61}
62
63pub fn install_current_mutation_session(session: Option<MutationSessionRecord>) {
64    CURRENT_MUTATION_SESSION.with(|slot| {
65        *slot.borrow_mut() = session.map(MutationSessionRecord::normalize);
66    });
67}
68
69pub fn current_mutation_session() -> Option<MutationSessionRecord> {
70    CURRENT_MUTATION_SESSION.with(|slot| slot.borrow().clone())
71}
72
73// ── Tool lifecycle hooks ──────────────────────────────────────────────
74
/// Action returned by a PreToolUse hook.
#[derive(Clone, Debug)]
pub enum PreToolAction {
    /// Allow the tool call to proceed unchanged.
    Allow,
    /// Deny the tool call; the string is a human-readable reason.
    Deny(String),
    /// Allow but replace the arguments with the contained JSON value.
    Modify(serde_json::Value),
}
85
/// Action returned by a PostToolUse hook.
#[derive(Clone, Debug)]
pub enum PostToolAction {
    /// Pass the result through unchanged.
    Pass,
    /// Replace the tool's result text with the contained string.
    Modify(String),
}
94
/// Callback types for tool lifecycle hooks. Pre hooks receive
/// `(tool_name, args)`; post hooks receive `(tool_name, result_text)`.
/// `Rc` (not `Arc`): hooks live in thread-local storage (`TOOL_HOOKS`)
/// and never cross threads.
pub type PreToolHookFn = Rc<dyn Fn(&str, &serde_json::Value) -> PreToolAction>;
pub type PostToolHookFn = Rc<dyn Fn(&str, &str) -> PostToolAction>;
98
/// A registered tool hook with a name pattern and callbacks.
/// Hooks are evaluated in registration order by `run_pre_tool_hooks`
/// and `run_post_tool_hooks`.
#[derive(Clone)]
pub struct ToolHook {
    /// Glob-style pattern matched against tool names (e.g. `"*"`, `"exec*"`, `"read_file"`).
    pub pattern: String,
    /// Called before tool execution. Return `Deny` to reject, `Modify` to rewrite args.
    pub pre: Option<PreToolHookFn>,
    /// Called after tool execution with the result text. Return `Modify` to rewrite.
    pub post: Option<PostToolHookFn>,
}
109
110impl std::fmt::Debug for ToolHook {
111    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
112        f.debug_struct("ToolHook")
113            .field("pattern", &self.pattern)
114            .field("has_pre", &self.pre.is_some())
115            .field("has_post", &self.post.is_some())
116            .finish()
117    }
118}
119
/// Match `name` against a glob-style `pattern` where `*` matches any
/// (possibly empty) run of characters, anywhere in the pattern.
///
/// Fix/generalization: the previous version only understood a single
/// leading OR trailing `*`; patterns like `"*foo*"` or `"a*b"` fell
/// through to exact comparison (so `"*foo*"` matched almost nothing).
/// This greedy two-pointer matcher handles `*` in any position and is
/// backward compatible for `"*"`, `"prefix*"`, `"*suffix"`, and exact
/// patterns. Matching is byte-wise, which is safe for UTF-8 since `*`
/// is the only metacharacter.
fn glob_match(pattern: &str, name: &str) -> bool {
    let p = pattern.as_bytes();
    let n = name.as_bytes();
    let (mut pi, mut ni) = (0usize, 0usize);
    // Position of the most recent `*` and the name index it was tried at,
    // used to backtrack when a literal run fails to match.
    let mut star: Option<(usize, usize)> = None;
    while ni < n.len() {
        if pi < p.len() && p[pi] == b'*' {
            // Tentatively let `*` match the empty string; remember where
            // to backtrack to.
            star = Some((pi, ni));
            pi += 1;
        } else if pi < p.len() && p[pi] == n[ni] {
            pi += 1;
            ni += 1;
        } else if let Some((sp, sn)) = star {
            // Backtrack: extend the last `*` by one more byte of `name`.
            pi = sp + 1;
            ni = sn + 1;
            star = Some((sp, sn + 1));
        } else {
            return false;
        }
    }
    // Trailing `*`s match the empty remainder.
    while pi < p.len() && p[pi] == b'*' {
        pi += 1;
    }
    pi == p.len()
}
132
133pub fn register_tool_hook(hook: ToolHook) {
134    TOOL_HOOKS.with(|hooks| hooks.borrow_mut().push(hook));
135}
136
137pub fn clear_tool_hooks() {
138    TOOL_HOOKS.with(|hooks| hooks.borrow_mut().clear());
139}
140
141/// Run all matching PreToolUse hooks. Returns the final action.
142pub fn run_pre_tool_hooks(tool_name: &str, args: &serde_json::Value) -> PreToolAction {
143    TOOL_HOOKS.with(|hooks| {
144        let hooks = hooks.borrow();
145        let mut current_args = args.clone();
146        for hook in hooks.iter() {
147            if !glob_match(&hook.pattern, tool_name) {
148                continue;
149            }
150            if let Some(ref pre) = hook.pre {
151                match pre(tool_name, &current_args) {
152                    PreToolAction::Allow => {}
153                    PreToolAction::Deny(reason) => return PreToolAction::Deny(reason),
154                    PreToolAction::Modify(new_args) => {
155                        current_args = new_args;
156                    }
157                }
158            }
159        }
160        if current_args != *args {
161            PreToolAction::Modify(current_args)
162        } else {
163            PreToolAction::Allow
164        }
165    })
166}
167
168/// Run all matching PostToolUse hooks. Returns the (possibly modified) result.
169pub fn run_post_tool_hooks(tool_name: &str, result: &str) -> String {
170    TOOL_HOOKS.with(|hooks| {
171        let hooks = hooks.borrow();
172        let mut current = result.to_string();
173        for hook in hooks.iter() {
174            if !glob_match(&hook.pattern, tool_name) {
175                continue;
176            }
177            if let Some(ref post) = hook.post {
178                match post(tool_name, &current) {
179                    PostToolAction::Pass => {}
180                    PostToolAction::Modify(new_result) => {
181                        current = new_result;
182                    }
183                }
184            }
185        }
186        current
187    })
188}
189
190// ── Auto-compaction ───────────────────────────────────────────────────
191
/// Strategy used to summarize archived messages during auto-compaction.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum CompactStrategy {
    /// Ask the active LLM to write the summary.
    Llm,
    /// Abbreviate each archived message to a fixed character budget.
    Truncate,
    /// Delegate to a user-supplied Harn closure (`custom_compactor`).
    Custom,
}
198
199pub fn parse_compact_strategy(value: &str) -> Result<CompactStrategy, VmError> {
200    match value {
201        "llm" => Ok(CompactStrategy::Llm),
202        "truncate" => Ok(CompactStrategy::Truncate),
203        "custom" => Ok(CompactStrategy::Custom),
204        other => Err(VmError::Runtime(format!(
205            "unknown compact_strategy '{other}' (expected 'llm', 'truncate', or 'custom')"
206        ))),
207    }
208}
209
/// Configuration for automatic transcript compaction in agent loops.
/// See the `Default` impl for the stock thresholds.
#[derive(Clone, Debug)]
pub struct AutoCompactConfig {
    /// Maximum estimated tokens before triggering auto-compaction.
    pub token_threshold: usize,
    /// Maximum character length for a single tool result before microcompaction.
    pub tool_output_max_chars: usize,
    /// Number of recent messages to keep during auto-compaction.
    pub keep_last: usize,
    /// Strategy used to summarize archived messages.
    pub compact_strategy: CompactStrategy,
    /// Optional Harn callback used when `compact_strategy` is `custom`.
    pub custom_compactor: Option<VmValue>,
}
224
225impl Default for AutoCompactConfig {
226    fn default() -> Self {
227        Self {
228            token_threshold: 80_000,
229            tool_output_max_chars: 20_000,
230            keep_last: 8,
231            compact_strategy: CompactStrategy::Llm,
232            custom_compactor: None,
233        }
234    }
235}
236
237/// Estimate token count from a list of JSON messages (chars / 4 heuristic).
238pub fn estimate_message_tokens(messages: &[serde_json::Value]) -> usize {
239    messages
240        .iter()
241        .map(|m| {
242            m.get("content")
243                .and_then(|c| c.as_str())
244                .map(|s| s.len())
245                .unwrap_or(0)
246        })
247        .sum::<usize>()
248        / 4
249}
250
/// Microcompact a tool result: if it exceeds `max_chars`, keep the first and
/// last portions with a snip marker in between.
///
/// When the output contains diagnostic-looking lines (compiler/test
/// errors), up to 32 of them are preserved verbatim between the head and
/// tail so the snip does not swallow the actionable part of the output.
/// Outputs at or under `max_chars`, or budgets under 200 chars, are
/// returned unchanged.
pub fn microcompact_tool_output(output: &str, max_chars: usize) -> String {
    if output.len() <= max_chars || max_chars < 200 {
        return output.to_string();
    }
    // Heuristically collect lines that look like diagnostics.
    let diagnostic_lines = output
        .lines()
        .filter(|line| {
            let trimmed = line.trim();
            let lower = trimmed.to_lowercase();
            // file:line pattern (e.g. "src/main.rs:42:" or "foo.go:10:5:")
            let has_file_line = {
                let bytes = trimmed.as_bytes();
                let mut i = 0;
                let mut found_colon = false;
                while i < bytes.len() {
                    if bytes[i] == b':' {
                        found_colon = true;
                        break;
                    }
                    i += 1;
                }
                // Only the FIRST colon is examined: it must be followed
                // immediately by an ASCII digit.
                found_colon && i + 1 < bytes.len() && bytes[i + 1].is_ascii_digit()
            };
            // Generic keyword classification. Split into "strong" keywords
            // whose presence alone signals a diagnostic line (regardless of
            // whether the line also has a file:line reference), and "weak"
            // keywords which only count as diagnostic when paired with a
            // file:line (to avoid false positives on narrative prose that
            // happens to contain the word "error" or "expected").
            //
            // These are deliberately generic — not tied to any specific
            // language's test runner output format. Language-specific
            // patterns (Go's "--- FAIL:", pytest's "FAILED tests/", Rust's
            // "thread 'X' panicked at") should be supplied by the pipeline
            // via the `extra_diagnostic_patterns` auto_compact option,
            // which is where language/runner awareness belongs.
            let has_strong_keyword =
                trimmed.contains("FAIL") || trimmed.contains("panic") || trimmed.contains("Panic");
            let has_weak_keyword = trimmed.contains("error")
                || trimmed.contains("undefined")
                || trimmed.contains("expected")
                || trimmed.contains("got")
                || lower.contains("cannot find")
                || lower.contains("not found")
                || lower.contains("no such")
                || lower.contains("unresolved")
                || lower.contains("missing")
                || lower.contains("declared but not used")
                || lower.contains("unused")
                || lower.contains("mismatch");
            // Positional forms ("error:" at line start etc.) count on their own.
            let positional = lower.contains(" error ")
                || lower.starts_with("error:")
                || lower.starts_with("warning:")
                || lower.starts_with("note:")
                || lower.contains("panic:");
            has_strong_keyword || (has_file_line && has_weak_keyword) || positional
        })
        .take(32)
        .collect::<Vec<_>>();
    if !diagnostic_lines.is_empty() {
        let diagnostics = diagnostic_lines.join("\n");
        // Reserve room for the diagnostics plus marker text, then split
        // the remaining budget evenly between head and tail.
        let budget = max_chars.saturating_sub(diagnostics.len() + 64);
        let keep = budget / 2;
        if keep >= 80 && output.len() > keep * 2 {
            // floor/ceil_char_boundary keep slice edges on UTF-8 boundaries.
            let head_end = output.floor_char_boundary(keep);
            let tail_start = output.ceil_char_boundary(output.len() - keep);
            let head = &output[..head_end];
            let tail = &output[tail_start..];
            return format!(
                "{head}\n\n[diagnostic lines preserved]\n{diagnostics}\n\n[... output compacted ...]\n\n{tail}"
            );
        }
    }
    // Fallback: plain head/tail snip with a character-count marker.
    // NOTE(review): `snipped` is approximate — the exact count is
    // `output.len() - head.len() - tail.len()`.
    let keep = max_chars / 2;
    let head_end = output.floor_char_boundary(keep);
    let tail_start = output.ceil_char_boundary(output.len() - keep);
    let head = &output[..head_end];
    let tail = &output[tail_start..];
    let snipped = output.len() - max_chars;
    format!("{head}\n\n[... {snipped} characters snipped ...]\n\n{tail}")
}
334
335fn format_compaction_messages(messages: &[serde_json::Value]) -> String {
336    messages
337        .iter()
338        .map(|msg| {
339            let role = msg
340                .get("role")
341                .and_then(|v| v.as_str())
342                .unwrap_or("user")
343                .to_uppercase();
344            let content = msg
345                .get("content")
346                .and_then(|v| v.as_str())
347                .unwrap_or_default();
348            format!("{role}: {content}")
349        })
350        .collect::<Vec<_>>()
351        .join("\n")
352}
353
/// Build a truncate-strategy compaction summary (the non-fallback
/// variant; see `truncate_compaction_summary_with_context`).
fn truncate_compaction_summary(
    old_messages: &[serde_json::Value],
    archived_count: usize,
) -> String {
    truncate_compaction_summary_with_context(old_messages, archived_count, false)
}
360
/// Build a compaction summary by abbreviating each archived message to
/// ~500 chars, keeping at most the first 15 renderable messages.
///
/// `is_llm_fallback` selects the header wording: `true` marks the
/// summary as a fallback after the LLM summarizer returned empty text.
fn truncate_compaction_summary_with_context(
    old_messages: &[serde_json::Value],
    archived_count: usize,
    is_llm_fallback: bool,
) -> String {
    let per_msg_limit = 500_usize;
    let summary_parts: Vec<String> = old_messages
        .iter()
        .filter_map(|m| {
            // Skip messages without a string role/content or with empty content.
            let role = m.get("role")?.as_str()?;
            let content = m.get("content")?.as_str()?;
            if content.is_empty() {
                return None;
            }
            let truncated = if content.len() > per_msg_limit {
                // floor_char_boundary keeps the cut on a UTF-8 boundary.
                format!(
                    "{}... [truncated from {} chars]",
                    &content[..content.floor_char_boundary(per_msg_limit)],
                    content.len()
                )
            } else {
                content.to_string()
            };
            Some(format!("[{role}] {truncated}"))
        })
        .take(15)
        .collect();
    let header = if is_llm_fallback {
        format!(
            "[auto-compact fallback: LLM summarizer returned empty; {archived_count} older messages abbreviated to ~{per_msg_limit} chars each]"
        )
    } else {
        format!("[auto-compacted {archived_count} older messages via truncate strategy]")
    };
    // NOTE(review): unrenderable messages are filtered out above, so
    // "and N more" (archived_count - 15) can overstate how many rendered
    // messages were actually omitted.
    format!(
        "{header}\n{}{}",
        summary_parts.join("\n"),
        if archived_count > 15 {
            format!("\n... and {} more", archived_count - 15)
        } else {
            String::new()
        }
    )
}
405
406fn compact_summary_text_from_value(value: &VmValue) -> Result<String, VmError> {
407    if let Some(map) = value.as_dict() {
408        if let Some(summary) = map.get("summary").or_else(|| map.get("text")) {
409            return Ok(summary.display());
410        }
411    }
412    match value {
413        VmValue::String(text) => Ok(text.to_string()),
414        VmValue::Nil => Ok(String::new()),
415        _ => serde_json::to_string_pretty(&vm_value_to_json(value))
416            .map_err(|e| VmError::Runtime(format!("custom compactor encode error: {e}"))),
417    }
418}
419
/// Summarize archived messages by issuing a dedicated LLM call derived
/// from the active call options, falling back to the truncate strategy
/// when the model returns empty text.
async fn llm_compaction_summary(
    old_messages: &[serde_json::Value],
    archived_count: usize,
    llm_opts: &crate::llm::api::LlmCallOptions,
) -> Result<String, VmError> {
    let mut compact_opts = llm_opts.clone();
    let formatted = format_compaction_messages(old_messages);
    // Strip everything conversation/tool-specific from the cloned options
    // so the summarization call is a clean, single-turn request.
    compact_opts.system = None;
    compact_opts.transcript_id = None;
    compact_opts.transcript_summary = None;
    compact_opts.transcript_metadata = None;
    compact_opts.native_tools = None;
    compact_opts.tool_choice = None;
    compact_opts.response_format = None;
    compact_opts.json_schema = None;
    compact_opts.messages = vec![serde_json::json!({
        "role": "user",
        "content": format!(
            "Summarize these archived conversation messages for a follow-on coding agent. Preserve goals, constraints, decisions, completed tool work, unresolved issues, and next actions. Output only the summary text.\n\nArchived message count: {archived_count}\n\nConversation:\n{formatted}"
        ),
    })];
    let result = vm_call_llm_full(&compact_opts).await?;
    let summary = result.text.trim();
    if summary.is_empty() {
        // Never archive messages behind an empty note: fall back to the
        // truncate strategy with the fallback header.
        Ok(truncate_compaction_summary_with_context(
            old_messages,
            archived_count,
            true,
        ))
    } else {
        Ok(format!(
            "[auto-compacted {archived_count} older messages]\n{summary}"
        ))
    }
}
455
456async fn custom_compaction_summary(
457    old_messages: &[serde_json::Value],
458    archived_count: usize,
459    callback: &VmValue,
460) -> Result<String, VmError> {
461    let Some(VmValue::Closure(closure)) = Some(callback.clone()) else {
462        return Err(VmError::Runtime(
463            "compact_callback must be a closure when compact_strategy is 'custom'".to_string(),
464        ));
465    };
466    let mut vm = crate::vm::clone_async_builtin_child_vm().ok_or_else(|| {
467        VmError::Runtime(
468            "custom transcript compaction requires an async builtin VM context".to_string(),
469        )
470    })?;
471    let messages_vm = VmValue::List(Rc::new(
472        old_messages
473            .iter()
474            .map(crate::stdlib::json_to_vm_value)
475            .collect(),
476    ));
477    let result = vm.call_closure_pub(&closure, &[messages_vm], &[]).await;
478    let summary = compact_summary_text_from_value(&result?)?;
479    if summary.trim().is_empty() {
480        Ok(truncate_compaction_summary(old_messages, archived_count))
481    } else {
482        Ok(format!(
483            "[auto-compacted {archived_count} older messages]\n{summary}"
484        ))
485    }
486}
487
/// Auto-compact a message list in place: summarize older messages into a
/// note and keep the most recent `keep_last` messages.
///
/// Returns `Ok(None)` when the list already fits within `keep_last`,
/// otherwise `Ok(Some(summary))` with the note that was prepended.
/// On error the drained messages are lost from `messages` — callers
/// should treat an `Err` as fatal for this transcript.
pub(crate) async fn auto_compact_messages(
    messages: &mut Vec<serde_json::Value>,
    config: &AutoCompactConfig,
    llm_opts: Option<&crate::llm::api::LlmCallOptions>,
) -> Result<Option<String>, VmError> {
    if messages.len() <= config.keep_last {
        return Ok(None);
    }
    // Drain everything except the last `keep_last` messages.
    let split_at = messages.len().saturating_sub(config.keep_last);
    let old_messages: Vec<_> = messages.drain(..split_at).collect();
    let archived_count = old_messages.len();
    let summary = match config.compact_strategy {
        CompactStrategy::Truncate => truncate_compaction_summary(&old_messages, archived_count),
        CompactStrategy::Llm => {
            llm_compaction_summary(
                &old_messages,
                archived_count,
                // LLM strategy is only usable inside an active LLM call.
                llm_opts.ok_or_else(|| {
                    VmError::Runtime(
                        "LLM transcript compaction requires active LLM call options".to_string(),
                    )
                })?,
            )
            .await?
        }
        CompactStrategy::Custom => {
            custom_compaction_summary(
                &old_messages,
                archived_count,
                config.custom_compactor.as_ref().ok_or_else(|| {
                    VmError::Runtime(
                        "compact_callback is required when compact_strategy is 'custom'"
                            .to_string(),
                    )
                })?,
            )
            .await?
        }
    };
    // Prepend the summary as a user message so the retained tail still
    // carries the archived context.
    messages.insert(
        0,
        serde_json::json!({
            "role": "user",
            "content": summary,
        }),
    );
    Ok(Some(summary))
}
538
539// ── Adaptive context assembly ─────────────────────────────────────────
540
541/// Snip an artifact's text to fit within a token budget.
542pub fn microcompact_artifact(artifact: &mut ArtifactRecord, max_tokens: usize) {
543    let max_chars = max_tokens * 4;
544    if let Some(ref text) = artifact.text {
545        if text.len() > max_chars && max_chars >= 200 {
546            artifact.text = Some(microcompact_tool_output(text, max_chars));
547            artifact.estimated_tokens = Some(max_tokens);
548        }
549    }
550}
551
/// Deduplicate artifacts by removing later entries whose text content is
/// identical to an earlier one. The FIRST occurrence wins (`retain`
/// keeps earlier elements); despite the previous doc wording, priority
/// is NOT consulted here. NOTE(review): callers wanting priority-based
/// dedup must sort by priority first — confirm at call sites.
pub fn dedup_artifacts(artifacts: &mut Vec<ArtifactRecord>) {
    let mut seen_hashes: BTreeSet<u64> = BTreeSet::new();
    artifacts.retain(|artifact| {
        let text = artifact.text.as_deref().unwrap_or("");
        // Artifacts without text are never treated as duplicates.
        if text.is_empty() {
            return true;
        }
        // Simple hash for dedup; a (vanishingly unlikely) collision
        // would silently drop a distinct artifact.
        let hash = {
            use std::hash::{Hash, Hasher};
            let mut hasher = std::collections::hash_map::DefaultHasher::new();
            text.hash(&mut hasher);
            hasher.finish()
        };
        // `insert` returns false when the hash was already seen → drop.
        seen_hashes.insert(hash)
    });
}
571
572/// Enhanced artifact selection: dedup, microcompact oversized artifacts,
573/// then delegate to the standard `select_artifacts`.
574pub fn select_artifacts_adaptive(
575    mut artifacts: Vec<ArtifactRecord>,
576    policy: &ContextPolicy,
577) -> Vec<ArtifactRecord> {
578    // Phase 1: deduplicate
579    dedup_artifacts(&mut artifacts);
580
581    // Phase 2: microcompact oversized artifacts relative to budget.
582    // Cap individual artifacts to a fraction of the total budget, but don't
583    // let the per-artifact cap exceed the total budget (avoid overrun).
584    if let Some(max_tokens) = policy.max_tokens {
585        let count = artifacts.len().max(1);
586        let per_artifact_budget = max_tokens / count;
587        // Floor of 500 tokens, but never more than total budget
588        let cap = per_artifact_budget.max(500).min(max_tokens);
589        for artifact in &mut artifacts {
590            let est = artifact.estimated_tokens.unwrap_or(0);
591            if est > cap * 2 {
592                microcompact_artifact(artifact, cap);
593            }
594        }
595    }
596
597    // Phase 3: standard selection with budget
598    select_artifacts(artifacts, policy)
599}
600
601// ── Per-agent policy with argument patterns ───────────────────────────
602
/// Extended policy that supports argument-level constraints.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ToolArgConstraint {
    /// Tool name to constrain (glob pattern; see `glob_match`).
    pub tool: String,
    /// Glob patterns that the first string argument must match.
    /// If empty, no argument constraint is applied.
    pub arg_patterns: Vec<String>,
}
613
/// Check if a tool call satisfies argument constraints in the policy.
///
/// For each constraint whose `tool` pattern matches `tool_name` and that
/// has non-empty `arg_patterns`, the first string-like argument must
/// match at least one pattern; otherwise the call is rejected via
/// `reject_policy`.
pub fn enforce_tool_arg_constraints(
    policy: &CapabilityPolicy,
    tool_name: &str,
    args: &serde_json::Value,
) -> Result<(), VmError> {
    for constraint in &policy.tool_arg_constraints {
        if !glob_match(&constraint.tool, tool_name) {
            continue;
        }
        if constraint.arg_patterns.is_empty() {
            continue;
        }
        // Extract the first string-like argument for pattern matching.
        // NOTE(review): "first" depends on serde_json's map ordering
        // (insertion order only with the preserve_order feature) —
        // confirm the intended argument is reliably first.
        let first_arg = args
            .as_object()
            .and_then(|o| o.values().next())
            .and_then(|v| v.as_str())
            .or_else(|| args.as_str())
            .unwrap_or("");
        let matches = constraint
            .arg_patterns
            .iter()
            .any(|pattern| glob_match(pattern, first_arg));
        if !matches {
            return reject_policy(format!(
                "tool '{tool_name}' argument '{first_arg}' does not match allowed patterns: {:?}",
                constraint.arg_patterns
            ));
        }
    }
    Ok(())
}
647
/// Normalize a user-supplied artifact kind to its canonical name.
/// Canonical kinds pass through, a few legacy aliases are rewritten,
/// blank/whitespace-only input becomes `"artifact"`, and anything else
/// is kept verbatim.
fn normalize_artifact_kind(kind: &str) -> String {
    const CANONICAL: &[&str] = &[
        "resource",
        "workspace_file",
        "editor_selection",
        "workspace_snapshot",
        "transcript_summary",
        "summary",
        "plan",
        "diff",
        "git_diff",
        "patch",
        "patch_set",
        "patch_proposal",
        "diff_review",
        "review_decision",
        "verification_bundle",
        "apply_intent",
        "verification_result",
        "test_result",
        "command_result",
        "provider_payload",
        "worker_result",
        "worker_notification",
        "artifact",
    ];
    if CANONICAL.contains(&kind) {
        return kind.to_string();
    }
    // Legacy aliases, then the blank fallback, then pass-through.
    match kind {
        "file" => "workspace_file".to_string(),
        "transcript" => "transcript_summary".to_string(),
        "verification" => "verification_result".to_string(),
        "test" => "test_result".to_string(),
        other if other.trim().is_empty() => "artifact".to_string(),
        other => other.to_string(),
    }
}
681
/// Default context priority for an artifact kind (higher ranks first).
/// Verification evidence outranks diffs/reviews, which outrank plans,
/// workspace content, summaries, and command output; unknown kinds get
/// the floor value 40.
fn default_artifact_priority(kind: &str) -> i64 {
    if matches!(kind, "verification_result" | "test_result") {
        return 100;
    }
    if kind == "verification_bundle" {
        return 95;
    }
    if matches!(
        kind,
        "diff"
            | "git_diff"
            | "patch"
            | "patch_set"
            | "patch_proposal"
            | "diff_review"
            | "review_decision"
            | "apply_intent"
    ) {
        return 90;
    }
    match kind {
        "plan" => 80,
        "workspace_file" | "workspace_snapshot" | "editor_selection" | "resource" => 70,
        "summary" | "transcript_summary" => 60,
        "command_result" => 50,
        _ => 40,
    }
}
695
/// Rank a freshness label for artifact ordering: `fresh`/`live` → 3,
/// `recent` → 2, `stale` → 0, anything else (including `None`) → 1.
fn freshness_rank(value: Option<&str>) -> i64 {
    let label = value.unwrap_or("");
    if label == "fresh" || label == "live" {
        3
    } else if label == "recent" {
        2
    } else if label == "stale" {
        0
    } else {
        1
    }
}
704
/// Per-tool runtime metadata consulted during policy enforcement.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ToolRuntimePolicyMetadata {
    /// Capability name → operations this tool exercises.
    pub capabilities: BTreeMap<String, Vec<String>>,
    /// Declared side-effect level (ordering per `min_side_effect`).
    pub side_effect_level: Option<String>,
    /// Parameter names that hold filesystem paths — presumably used for
    /// workspace-root checks; confirm at enforcement sites.
    pub path_params: Vec<String>,
    /// Mutation classification label; semantics defined by the host.
    pub mutation_classification: Option<String>,
}
713
/// Capability policy describing what an execution may use. Empty
/// collections mean "no restriction" (see `intersect`).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct CapabilityPolicy {
    /// Allowed tool names; empty = unrestricted.
    pub tools: Vec<String>,
    /// Capability name → allowed operations; empty = unrestricted.
    pub capabilities: BTreeMap<String, Vec<String>>,
    /// Permitted filesystem roots; empty = unrestricted.
    pub workspace_roots: Vec<String>,
    /// Maximum side-effect level (ordering per `min_side_effect`).
    pub side_effect_level: Option<String>,
    /// Bound on nested execution depth, if any.
    pub recursion_limit: Option<usize>,
    /// Argument-level constraints for specific tools.
    #[serde(default)]
    pub tool_arg_constraints: Vec<ToolArgConstraint>,
    /// Per-tool runtime metadata, keyed by tool name.
    #[serde(default)]
    pub tool_metadata: BTreeMap<String, ToolRuntimePolicyMetadata>,
}
728
impl CapabilityPolicy {
    /// Intersect a host-ceiling policy (`self`) with a `requested`
    /// policy, producing the effective policy.
    ///
    /// Semantics:
    /// * An empty `tools`/`capabilities`/`workspace_roots` collection on
    ///   either side means "no restriction from that side".
    /// * Tool and capability-operation requests that exceed a non-empty
    ///   host ceiling are rejected with a descriptive `Err` rather than
    ///   silently narrowed; workspace roots are silently intersected.
    /// * Scalar limits (`side_effect_level`, `recursion_limit`) take the
    ///   more restrictive value.
    ///
    /// # Errors
    /// Returns `Err(String)` naming the tools or capabilities that
    /// exceed the host ceiling.
    pub fn intersect(&self, requested: &CapabilityPolicy) -> Result<CapabilityPolicy, String> {
        // Most restrictive side-effect level wins; a one-sided value passes through.
        let side_effect_level = match (&self.side_effect_level, &requested.side_effect_level) {
            (Some(a), Some(b)) => Some(min_side_effect(a, b).to_string()),
            (Some(a), None) => Some(a.clone()),
            (None, Some(b)) => Some(b.clone()),
            (None, None) => None,
        };

        // Reject (not narrow) tool requests beyond the ceiling so callers
        // get an actionable error listing the denied tools.
        if !self.tools.is_empty() {
            let denied: Vec<String> = requested
                .tools
                .iter()
                .filter(|tool| !self.tools.contains(*tool))
                .cloned()
                .collect();
            if !denied.is_empty() {
                return Err(format!(
                    "requested tools exceed host ceiling: {}",
                    denied.join(", ")
                ));
            }
        }

        // Same rejection rule for capability operations; a capability
        // absent from a non-empty ceiling is rejected outright.
        for (capability, requested_ops) in &requested.capabilities {
            if let Some(allowed_ops) = self.capabilities.get(capability) {
                let denied: Vec<String> = requested_ops
                    .iter()
                    .filter(|op| !allowed_ops.contains(*op))
                    .cloned()
                    .collect();
                if !denied.is_empty() {
                    return Err(format!(
                        "requested capability operations exceed host ceiling: {}.{}",
                        capability,
                        denied.join(",")
                    ));
                }
            } else if !self.capabilities.is_empty() {
                return Err(format!(
                    "requested capability exceeds host ceiling: {capability}"
                ));
            }
        }

        // Effective tools: empty side defers to the other; otherwise intersect.
        let tools = if self.tools.is_empty() {
            requested.tools.clone()
        } else if requested.tools.is_empty() {
            self.tools.clone()
        } else {
            requested
                .tools
                .iter()
                .filter(|tool| self.tools.contains(*tool))
                .cloned()
                .collect()
        };

        // Effective capabilities: same empty-defers rule, intersecting
        // the operation lists per capability.
        let capabilities = if self.capabilities.is_empty() {
            requested.capabilities.clone()
        } else if requested.capabilities.is_empty() {
            self.capabilities.clone()
        } else {
            requested
                .capabilities
                .iter()
                .filter_map(|(capability, requested_ops)| {
                    self.capabilities.get(capability).map(|allowed_ops| {
                        (
                            capability.clone(),
                            requested_ops
                                .iter()
                                .filter(|op| allowed_ops.contains(*op))
                                .cloned()
                                .collect::<Vec<_>>(),
                        )
                    })
                })
                .collect()
        };

        // Workspace roots intersect silently (no error on mismatch).
        let workspace_roots = if self.workspace_roots.is_empty() {
            requested.workspace_roots.clone()
        } else if requested.workspace_roots.is_empty() {
            self.workspace_roots.clone()
        } else {
            requested
                .workspace_roots
                .iter()
                .filter(|root| self.workspace_roots.contains(*root))
                .cloned()
                .collect()
        };

        // Lower recursion limit wins.
        let recursion_limit = match (self.recursion_limit, requested.recursion_limit) {
            (Some(a), Some(b)) => Some(a.min(b)),
            (Some(a), None) => Some(a),
            (None, Some(b)) => Some(b),
            (None, None) => None,
        };

        // Merge arg constraints from both sides
        let mut tool_arg_constraints = self.tool_arg_constraints.clone();
        tool_arg_constraints.extend(requested.tool_arg_constraints.clone());

        // Metadata for each effective tool; the requested side's entry
        // takes precedence over the host's.
        let tool_metadata = tools
            .iter()
            .filter_map(|tool| {
                requested
                    .tool_metadata
                    .get(tool)
                    .or_else(|| self.tool_metadata.get(tool))
                    .cloned()
                    .map(|metadata| (tool.clone(), metadata))
            })
            .collect();

        Ok(CapabilityPolicy {
            tools,
            capabilities,
            workspace_roots,
            side_effect_level,
            recursion_limit,
            tool_arg_constraints,
            tool_metadata,
        })
    }
}
857
/// Returns whichever of the two side-effect levels is less permissive.
///
/// Levels are ordered none < read_only < workspace_write < process_exec
/// < network; any unrecognized string ranks above all known levels.
/// Ties (including two unknown strings) resolve to `a`.
fn min_side_effect<'a>(a: &'a str, b: &'a str) -> &'a str {
    fn rank(level: &str) -> usize {
        [
            "none",
            "read_only",
            "workspace_write",
            "process_exec",
            "network",
        ]
        .iter()
        .position(|known| *known == level)
        .unwrap_or(5)
    }
    if rank(b) < rank(a) {
        b
    } else {
        a
    }
}
875
/// Per-stage LLM model-selection overrides.
///
/// Every field is optional; `#[serde(default)]` means absent fields
/// deserialize to `None`, deferring to defaults applied elsewhere.
/// `normalize_workflow_value` backfills these from top-level workflow keys
/// (`provider`, `model`, `model_tier`/`tier`, `temperature`, `max_tokens`)
/// for legacy act/verify/repair shorthand.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct ModelPolicy {
    pub provider: Option<String>,
    pub model: Option<String>,
    /// Abstract tier name — presumably resolved to a concrete model by the
    /// caller; resolution is not visible in this file.
    pub model_tier: Option<String>,
    pub temperature: Option<f64>,
    pub max_tokens: Option<i64>,
}
885
/// Controls retention and rendering of a stage's conversation transcript.
///
/// NOTE(review): the valid `mode`/`visibility` strings are interpreted
/// outside this file — confirm against the transcript handling code.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct TranscriptPolicy {
    pub mode: Option<String>,
    pub visibility: Option<String>,
    pub summarize: bool,
    pub compact: bool,
    /// Presumably the number of most-recent entries to retain — TODO confirm.
    pub keep_last: Option<usize>,
}
895
/// Selection and budgeting rules for which artifacts are packed into a
/// stage's context. Enforcement happens outside this file; fields here are
/// pure configuration.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ContextPolicy {
    pub max_artifacts: Option<usize>,
    pub max_tokens: Option<usize>,
    /// Presumably tokens held back from the budget for the response —
    /// TODO confirm against the context assembler.
    pub reserve_tokens: Option<usize>,
    pub include_kinds: Vec<String>,
    pub exclude_kinds: Vec<String>,
    pub prioritize_kinds: Vec<String>,
    /// Artifact ids that should always be included.
    pub pinned_ids: Vec<String>,
    pub include_stages: Vec<String>,
    pub prefer_recent: bool,
    pub prefer_fresh: bool,
    /// Optional rendering strategy name; semantics live in the consumer.
    pub render: Option<String>,
}
911
/// Retry behavior for a stage.
///
/// `max_attempts == 0` is normalized to 1 by `normalize_workflow_value`,
/// so a defaulted policy still executes once.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RetryPolicy {
    pub max_attempts: usize,
    /// Run verification after the attempt — presumably via the node's
    /// `verify` spec; TODO confirm.
    pub verify: bool,
    /// Attempt an automatic repair pass on failure — TODO confirm semantics.
    pub repair: bool,
}
919
/// Declares what artifact kinds a stage consumes or produces, used both as
/// a node's `input_contract` and `output_contract`. `validate_workflow`
/// checks `min_inputs`/`max_inputs` consistency against the graph's edges.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct StageContract {
    pub input_kinds: Vec<String>,
    pub output_kinds: Vec<String>,
    pub min_inputs: Option<usize>,
    pub max_inputs: Option<usize>,
    pub require_transcript: bool,
    /// Optional JSON schema for the stage's payload — validation is not
    /// visible in this file.
    pub schema: Option<serde_json::Value>,
}
930
/// Maps stage outcomes to branch labels on outgoing edges. Each field, when
/// set, names the branch taken for that outcome; `None` presumably falls
/// back to default edge selection (handled outside this file).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct BranchSemantics {
    pub success: Option<String>,
    pub failure: Option<String>,
    pub verify_pass: Option<String>,
    pub verify_fail: Option<String>,
    pub condition_true: Option<String>,
    pub condition_false: Option<String>,
    pub loop_continue: Option<String>,
    pub loop_exit: Option<String>,
    pub escalation: Option<String>,
}
944
/// Fan-out configuration for a `map` node: the items to map over and the
/// artifact kinds used for per-item inputs and outputs.
/// `normalize_workflow_value` uses `output_kind` (default "artifact") to
/// fill a map node's output contract.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct MapPolicy {
    pub items: Vec<serde_json::Value>,
    pub item_artifact_kind: Option<String>,
    pub output_kind: Option<String>,
    pub max_items: Option<usize>,
}
953
/// How a node joins multiple incoming branches. `strategy` defaults to
/// "all" via `normalize_workflow_value` when left empty; other valid
/// strategy strings are interpreted outside this file.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct JoinPolicy {
    pub strategy: String,
    pub require_all_inputs: bool,
    pub min_completed: Option<usize>,
}
961
/// How a `reduce` node combines its inputs. `strategy` defaults to "concat"
/// via `normalize_workflow_value` when left empty; `output_kind` (default
/// "summary") feeds the node's output contract.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReducePolicy {
    pub strategy: String,
    /// Separator used by concatenating strategies — presumably; TODO confirm.
    pub separator: Option<String>,
    pub output_kind: Option<String>,
}
969
/// Routing information for escalating a stage to a human or higher-priority
/// queue. All fields optional; interpretation happens outside this file.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct EscalationPolicy {
    pub level: Option<String>,
    pub queue: Option<String>,
    pub reason: Option<String>,
}
977
/// A single piece of data produced or consumed by workflow stages.
/// Serialized with a `_type` discriminator field; `normalize()` backfills
/// id, timestamps, kind, token estimate, and priority.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct ArtifactRecord {
    /// Type tag in serialized form; defaulted to "artifact" by `normalize`.
    #[serde(rename = "_type")]
    pub type_name: String,
    pub id: String,
    /// Canonicalized by `normalize_artifact_kind` during `normalize`.
    pub kind: String,
    pub title: Option<String>,
    pub text: Option<String>,
    pub data: Option<serde_json::Value>,
    pub source: Option<String>,
    pub created_at: String,
    pub freshness: Option<String>,
    pub priority: Option<i64>,
    /// Ids of artifacts this one was derived from — presumably; TODO confirm.
    pub lineage: Vec<String>,
    pub relevance: Option<f64>,
    /// Rough size estimate; `normalize` derives it as text length / 4.
    pub estimated_tokens: Option<usize>,
    /// Node id of the producing stage, when known.
    pub stage: Option<String>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
998
999impl ArtifactRecord {
1000    pub fn normalize(mut self) -> Self {
1001        if self.type_name.is_empty() {
1002            self.type_name = "artifact".to_string();
1003        }
1004        if self.id.is_empty() {
1005            self.id = new_id("artifact");
1006        }
1007        if self.created_at.is_empty() {
1008            self.created_at = now_rfc3339();
1009        }
1010        if self.kind.is_empty() {
1011            self.kind = "artifact".to_string();
1012        }
1013        self.kind = normalize_artifact_kind(&self.kind);
1014        if self.estimated_tokens.is_none() {
1015            self.estimated_tokens = self
1016                .text
1017                .as_ref()
1018                .map(|text| ((text.len() as f64) / 4.0).ceil() as usize);
1019        }
1020        if self.priority.is_none() {
1021            self.priority = Some(default_artifact_priority(&self.kind));
1022        }
1023        self
1024    }
1025}
1026
/// A single node of a [`WorkflowGraph`]: its prompt, model/context/retry
/// policies, contracts, and branch semantics. `normalize_workflow_value`
/// backfills ids, kinds, join/reduce strategies, and output contracts.
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
#[serde(default)]
pub struct WorkflowNode {
    /// Node id; set to the graph map key by `normalize_workflow_value`
    /// when absent.
    pub id: Option<String>,
    /// Node kind (e.g. "stage", "verify", "map", "reduce", "escalation" —
    /// values observed in `normalize_workflow_value`); defaults to "stage".
    pub kind: String,
    pub mode: Option<String>,
    pub prompt: Option<String>,
    pub system: Option<String>,
    pub task_label: Option<String>,
    pub done_sentinel: Option<String>,
    /// Tool definitions as JSON; see `workflow_tool_names` /
    /// `workflow_tool_metadata` for the accepted shapes.
    pub tools: serde_json::Value,
    pub model_policy: ModelPolicy,
    pub transcript_policy: TranscriptPolicy,
    pub context_policy: ContextPolicy,
    pub retry_policy: RetryPolicy,
    pub capability_policy: CapabilityPolicy,
    pub input_contract: StageContract,
    pub output_contract: StageContract,
    pub branch_semantics: BranchSemantics,
    pub map_policy: MapPolicy,
    pub join_policy: JoinPolicy,
    pub reduce_policy: ReducePolicy,
    pub escalation_policy: EscalationPolicy,
    pub verify: Option<serde_json::Value>,
    pub metadata: BTreeMap<String, serde_json::Value>,
    /// Original `tools` VmValue, kept out of serialization; repopulated by
    /// `parse_workflow_node_value` / `normalize_workflow_value`.
    #[serde(skip)]
    pub raw_tools: Option<VmValue>,
}
1055
1056impl PartialEq for WorkflowNode {
1057    fn eq(&self, other: &Self) -> bool {
1058        serde_json::to_value(self).ok() == serde_json::to_value(other).ok()
1059    }
1060}
1061
1062pub fn workflow_tool_names(value: &serde_json::Value) -> Vec<String> {
1063    match value {
1064        serde_json::Value::Null => Vec::new(),
1065        serde_json::Value::Array(items) => items
1066            .iter()
1067            .filter_map(|item| match item {
1068                serde_json::Value::Object(map) => map
1069                    .get("name")
1070                    .and_then(|value| value.as_str())
1071                    .filter(|name| !name.is_empty())
1072                    .map(|name| name.to_string()),
1073                _ => None,
1074            })
1075            .collect(),
1076        serde_json::Value::Object(map) => {
1077            if map.get("_type").and_then(|value| value.as_str()) == Some("tool_registry") {
1078                return map
1079                    .get("tools")
1080                    .map(workflow_tool_names)
1081                    .unwrap_or_default();
1082            }
1083            map.get("name")
1084                .and_then(|value| value.as_str())
1085                .filter(|name| !name.is_empty())
1086                .map(|name| vec![name.to_string()])
1087                .unwrap_or_default()
1088        }
1089        _ => Vec::new(),
1090    }
1091}
1092
/// Picks the most permissive side-effect level from `levels`.
///
/// Ordering: none < read_only < workspace_write < process_exec < network,
/// with unrecognized strings ranking above everything. Returns `None` for
/// an empty iterator; among equally ranked entries the last one wins
/// (matching `Iterator::max_by_key`).
fn max_side_effect_level(levels: impl Iterator<Item = String>) -> Option<String> {
    fn rank(level: &str) -> usize {
        match level {
            "none" => 0,
            "read_only" => 1,
            "workspace_write" => 2,
            "process_exec" => 3,
            "network" => 4,
            _ => 5,
        }
    }
    let mut best: Option<String> = None;
    for level in levels {
        let replace = match best.as_deref() {
            // `<=` keeps the later of two equally ranked levels.
            Some(current) => rank(current) <= rank(&level),
            None => true,
        };
        if replace {
            best = Some(level);
        }
    }
    best
}
1106
1107fn parse_tool_runtime_policy(
1108    map: &serde_json::Map<String, serde_json::Value>,
1109) -> ToolRuntimePolicyMetadata {
1110    let Some(policy) = map.get("policy").and_then(|value| value.as_object()) else {
1111        return ToolRuntimePolicyMetadata::default();
1112    };
1113
1114    let capabilities = policy
1115        .get("capabilities")
1116        .and_then(|value| value.as_object())
1117        .map(|caps| {
1118            caps.iter()
1119                .map(|(capability, ops)| {
1120                    let values = ops
1121                        .as_array()
1122                        .map(|items| {
1123                            items
1124                                .iter()
1125                                .filter_map(|item| item.as_str().map(|s| s.to_string()))
1126                                .collect::<Vec<_>>()
1127                        })
1128                        .unwrap_or_default();
1129                    (capability.clone(), values)
1130                })
1131                .collect::<BTreeMap<_, _>>()
1132        })
1133        .unwrap_or_default();
1134
1135    let path_params = policy
1136        .get("path_params")
1137        .and_then(|value| value.as_array())
1138        .map(|items| {
1139            items
1140                .iter()
1141                .filter_map(|item| item.as_str().map(|s| s.to_string()))
1142                .collect::<Vec<_>>()
1143        })
1144        .unwrap_or_default();
1145
1146    ToolRuntimePolicyMetadata {
1147        capabilities,
1148        side_effect_level: policy
1149            .get("side_effect_level")
1150            .and_then(|value| value.as_str())
1151            .map(|s| s.to_string()),
1152        path_params,
1153        mutation_classification: policy
1154            .get("mutation_classification")
1155            .and_then(|value| value.as_str())
1156            .map(|s| s.to_string()),
1157    }
1158}
1159
1160pub fn workflow_tool_metadata(
1161    value: &serde_json::Value,
1162) -> BTreeMap<String, ToolRuntimePolicyMetadata> {
1163    match value {
1164        serde_json::Value::Null => BTreeMap::new(),
1165        serde_json::Value::Array(items) => items
1166            .iter()
1167            .filter_map(|item| match item {
1168                serde_json::Value::Object(map) => map
1169                    .get("name")
1170                    .and_then(|value| value.as_str())
1171                    .filter(|name| !name.is_empty())
1172                    .map(|name| (name.to_string(), parse_tool_runtime_policy(map))),
1173                _ => None,
1174            })
1175            .collect(),
1176        serde_json::Value::Object(map) => {
1177            if map.get("_type").and_then(|value| value.as_str()) == Some("tool_registry") {
1178                return map
1179                    .get("tools")
1180                    .map(workflow_tool_metadata)
1181                    .unwrap_or_default();
1182            }
1183            map.get("name")
1184                .and_then(|value| value.as_str())
1185                .filter(|name| !name.is_empty())
1186                .map(|name| {
1187                    let mut metadata = BTreeMap::new();
1188                    metadata.insert(name.to_string(), parse_tool_runtime_policy(map));
1189                    metadata
1190                })
1191                .unwrap_or_default()
1192        }
1193        _ => BTreeMap::new(),
1194    }
1195}
1196
1197pub fn workflow_tool_policy_from_tools(value: &serde_json::Value) -> CapabilityPolicy {
1198    let tools = workflow_tool_names(value);
1199    let tool_metadata = workflow_tool_metadata(value);
1200    let mut capabilities: BTreeMap<String, Vec<String>> = BTreeMap::new();
1201    for metadata in tool_metadata.values() {
1202        for (capability, ops) in &metadata.capabilities {
1203            let entry = capabilities.entry(capability.clone()).or_default();
1204            for op in ops {
1205                if !entry.contains(op) {
1206                    entry.push(op.clone());
1207                }
1208            }
1209            entry.sort();
1210        }
1211    }
1212    let side_effect_level = max_side_effect_level(
1213        tool_metadata
1214            .values()
1215            .filter_map(|metadata| metadata.side_effect_level.clone()),
1216    );
1217    CapabilityPolicy {
1218        tools,
1219        capabilities,
1220        workspace_roots: Vec::new(),
1221        side_effect_level,
1222        recursion_limit: None,
1223        tool_arg_constraints: Vec::new(),
1224        tool_metadata,
1225    }
1226}
1227
/// A directed edge between two workflow nodes. `from`/`to` are node ids
/// (checked against `graph.nodes` by `validate_workflow`); `branch`, when
/// set, presumably restricts the edge to that outcome branch — TODO confirm
/// against the executor.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct WorkflowEdge {
    pub from: String,
    pub to: String,
    pub branch: Option<String>,
    pub label: Option<String>,
}
1236
/// A complete workflow definition: nodes keyed by id, directed edges, an
/// entry node, and a graph-level capability policy. Built/normalized by
/// `normalize_workflow_value` and checked by `validate_workflow`.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct WorkflowGraph {
    /// Type tag in serialized form; defaulted to "workflow_graph".
    #[serde(rename = "_type")]
    pub type_name: String,
    pub id: String,
    pub name: Option<String>,
    /// Defaulted to 1 by `normalize_workflow_value` when 0.
    pub version: usize,
    /// Id of the node execution starts at; must exist in `nodes`.
    pub entry: String,
    pub nodes: BTreeMap<String, WorkflowNode>,
    pub edges: Vec<WorkflowEdge>,
    pub capability_policy: CapabilityPolicy,
    pub metadata: BTreeMap<String, serde_json::Value>,
    pub audit_log: Vec<WorkflowAuditEntry>,
}
1252
/// One entry in a workflow's audit log: what operation was recorded, on
/// which node (if any), and when. Writers are outside this file.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct WorkflowAuditEntry {
    pub id: String,
    pub op: String,
    pub node_id: Option<String>,
    pub timestamp: String,
    pub reason: Option<String>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
1263
/// Aggregated LLM usage counters for a stage or a whole run: token counts,
/// wall time, call count, cost, and the model names involved.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct LlmUsageRecord {
    pub input_tokens: i64,
    pub output_tokens: i64,
    pub total_duration_ms: i64,
    pub call_count: i64,
    /// NOTE(review): currency/unit not visible in this file — confirm.
    pub total_cost: f64,
    pub models: Vec<String>,
}
1274
/// Execution record of one workflow node within a run: status/outcome,
/// timing, transcript, verification result, artifacts touched, and the
/// per-attempt history.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct RunStageRecord {
    pub id: String,
    /// Id of the [`WorkflowNode`] this stage executed.
    pub node_id: String,
    pub kind: String,
    pub status: String,
    pub outcome: String,
    /// Branch label this stage resolved to, if any.
    pub branch: Option<String>,
    pub started_at: String,
    pub finished_at: Option<String>,
    pub visible_text: Option<String>,
    /// Model reasoning kept separate from user-visible output.
    pub private_reasoning: Option<String>,
    pub transcript: Option<serde_json::Value>,
    pub verification: Option<serde_json::Value>,
    pub usage: Option<LlmUsageRecord>,
    pub artifacts: Vec<ArtifactRecord>,
    pub consumed_artifact_ids: Vec<String>,
    pub produced_artifact_ids: Vec<String>,
    pub attempts: Vec<RunStageAttemptRecord>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
1297
/// One attempt of a stage under its [`RetryPolicy`]: sequence number,
/// result, error (if any), and timing.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct RunStageAttemptRecord {
    pub attempt: usize,
    pub status: String,
    pub outcome: String,
    pub branch: Option<String>,
    pub error: Option<String>,
    pub verification: Option<serde_json::Value>,
    pub started_at: String,
    pub finished_at: Option<String>,
}
1310
/// A recorded hop between nodes during a run: where control came from,
/// where it went, which branch was taken, and the artifacts involved.
/// `from_*` are `None` presumably for the entry transition — TODO confirm.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunTransitionRecord {
    pub id: String,
    pub from_stage_id: Option<String>,
    pub from_node_id: Option<String>,
    pub to_node_id: String,
    pub branch: Option<String>,
    pub timestamp: String,
    pub consumed_artifact_ids: Vec<String>,
    pub produced_artifact_ids: Vec<String>,
}
1323
/// A persisted snapshot of run progress — which nodes are ready or done —
/// presumably used to resume an interrupted run; TODO confirm against the
/// executor.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunCheckpointRecord {
    pub id: String,
    pub ready_nodes: Vec<String>,
    pub completed_nodes: Vec<String>,
    pub last_stage_id: Option<String>,
    pub persisted_at: String,
    pub reason: String,
}
1334
/// A replayable expectation set captured from a finished run: the expected
/// overall status plus per-stage assertions. Serialized with a `_type` tag.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayFixture {
    #[serde(rename = "_type")]
    pub type_name: String,
    pub id: String,
    /// Run this fixture was captured from.
    pub source_run_id: String,
    pub workflow_id: String,
    pub workflow_name: Option<String>,
    pub created_at: String,
    pub expected_status: String,
    pub stage_assertions: Vec<ReplayStageAssertion>,
}
1348
/// Expected observable behavior for one stage when a fixture is replayed:
/// status, outcome, branch, artifact kinds, and an optional substring the
/// visible text must contain.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayStageAssertion {
    pub node_id: String,
    pub expected_status: String,
    pub expected_outcome: String,
    pub expected_branch: Option<String>,
    pub required_artifact_kinds: Vec<String>,
    pub visible_text_contains: Option<String>,
}
1359
/// Result of evaluating a replay against its fixture: overall pass/fail,
/// the individual failure messages, and how many stages were checked.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayEvalReport {
    pub pass: bool,
    pub failures: Vec<String>,
    pub stage_count: usize,
}
1367
/// Per-case result inside a [`ReplayEvalSuiteReport`]: which run was
/// evaluated, whether it passed, its failures, and an optional diff against
/// a comparison run.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayEvalCaseReport {
    pub run_id: String,
    pub workflow_id: String,
    pub label: Option<String>,
    pub pass: bool,
    pub failures: Vec<String>,
    pub stage_count: usize,
    pub source_path: Option<String>,
    pub comparison: Option<RunDiffReport>,
}
1380
/// Aggregate result of an evaluation suite: totals plus the per-case
/// reports.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayEvalSuiteReport {
    pub pass: bool,
    pub total: usize,
    pub passed: usize,
    pub failed: usize,
    pub cases: Vec<ReplayEvalCaseReport>,
}
1390
/// One stage-level difference found when diffing two runs: the node,
/// the kind of change, and human-readable details.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunStageDiffRecord {
    pub node_id: String,
    pub change: String,
    pub details: Vec<String>,
}
1398
/// Structured comparison of two runs ("left" vs "right"): status change,
/// per-stage diffs, and signed deltas of transition/artifact/checkpoint
/// counts (right minus left — presumably; TODO confirm in the diff code).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunDiffReport {
    pub left_run_id: String,
    pub right_run_id: String,
    pub identical: bool,
    pub status_changed: bool,
    pub left_status: String,
    pub right_status: String,
    pub stage_diffs: Vec<RunStageDiffRecord>,
    pub transition_count_delta: isize,
    pub artifact_count_delta: isize,
    pub checkpoint_count_delta: isize,
}
1413
/// On-disk description of an evaluation suite: a list of cases, optionally
/// resolved relative to `base_dir`. Serialized with a `_type` tag.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct EvalSuiteManifest {
    #[serde(rename = "_type")]
    pub type_name: String,
    pub id: String,
    pub name: Option<String>,
    pub base_dir: Option<String>,
    pub cases: Vec<EvalSuiteCase>,
}
1424
/// One case in an [`EvalSuiteManifest`]: the run to evaluate, an optional
/// fixture to assert against, and an optional run to diff against.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct EvalSuiteCase {
    pub label: Option<String>,
    pub run_path: String,
    pub fixture_path: Option<String>,
    pub compare_to: Option<String>,
}
1433
/// Full record of one workflow run: identity and lineage, per-stage
/// records, transitions, checkpoints, artifacts, policy in force, usage,
/// and optional replay fixture. Serialized with a `_type` tag.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct RunRecord {
    #[serde(rename = "_type")]
    pub type_name: String,
    pub id: String,
    pub workflow_id: String,
    pub workflow_name: Option<String>,
    pub task: String,
    pub status: String,
    pub started_at: String,
    pub finished_at: Option<String>,
    /// Lineage for nested runs: direct parent and the top-level ancestor.
    pub parent_run_id: Option<String>,
    pub root_run_id: Option<String>,
    pub stages: Vec<RunStageRecord>,
    pub transitions: Vec<RunTransitionRecord>,
    pub checkpoints: Vec<RunCheckpointRecord>,
    pub pending_nodes: Vec<String>,
    pub completed_nodes: Vec<String>,
    pub child_runs: Vec<RunChildRecord>,
    pub artifacts: Vec<ArtifactRecord>,
    /// Capability policy this run executed under.
    pub policy: CapabilityPolicy,
    pub execution: Option<RunExecutionRecord>,
    pub transcript: Option<serde_json::Value>,
    pub usage: Option<LlmUsageRecord>,
    pub replay_fixture: Option<ReplayFixture>,
    pub trace_spans: Vec<RunTraceSpanRecord>,
    pub metadata: BTreeMap<String, serde_json::Value>,
    /// Where this record was written on disk, once persisted.
    pub persisted_path: Option<String>,
}
1464
/// A timing span captured during a run, forming a tree via `parent_id`.
/// `start_ms` is presumably relative to run start — TODO confirm where
/// spans are recorded.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunTraceSpanRecord {
    pub span_id: u64,
    pub parent_id: Option<u64>,
    pub kind: String,
    pub name: String,
    pub start_ms: u64,
    pub duration_ms: u64,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
1476
/// A child run spawned by a worker on behalf of a parent run: worker
/// identity, mutation-session linkage (cf. `MutationSessionRecord`), task,
/// status, timing, and where the child's run/snapshot were persisted.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunChildRecord {
    pub worker_id: String,
    pub worker_name: String,
    /// Stage of the parent run that launched this child, if known.
    pub parent_stage_id: Option<String>,
    pub session_id: Option<String>,
    pub parent_session_id: Option<String>,
    pub mutation_scope: Option<String>,
    pub approval_mode: Option<String>,
    pub task: String,
    pub status: String,
    pub started_at: String,
    pub finished_at: Option<String>,
    pub run_id: Option<String>,
    pub run_path: Option<String>,
    pub snapshot_path: Option<String>,
    pub execution: Option<RunExecutionRecord>,
}
1496
/// Where and how a run executed: working directory, environment, and —
/// for git-backed execution — repo/worktree/branch details. Populated
/// outside this file.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunExecutionRecord {
    pub cwd: Option<String>,
    pub source_dir: Option<String>,
    pub env: BTreeMap<String, String>,
    /// Execution adapter name — values not visible in this file.
    pub adapter: Option<String>,
    pub repo_path: Option<String>,
    pub worktree_path: Option<String>,
    pub branch: Option<String>,
    pub base_ref: Option<String>,
    /// Cleanup strategy name — presumably for the worktree; TODO confirm.
    pub cleanup: Option<String>,
}
1510
/// Output of `validate_workflow`: hard errors (graph is invalid), soft
/// warnings (e.g. unreachable nodes), and the set of nodes reachable from
/// the entry.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct WorkflowValidationReport {
    pub valid: bool,
    pub errors: Vec<String>,
    pub warnings: Vec<String>,
    pub reachable_nodes: Vec<String>,
}
1519
1520fn parse_json_payload<T: for<'de> Deserialize<'de>>(
1521    json: serde_json::Value,
1522    label: &str,
1523) -> Result<T, VmError> {
1524    let payload = json.to_string();
1525    let mut deserializer = serde_json::Deserializer::from_str(&payload);
1526    let mut tracker = serde_path_to_error::Track::new();
1527    let path_deserializer = serde_path_to_error::Deserializer::new(&mut deserializer, &mut tracker);
1528    T::deserialize(path_deserializer).map_err(|error| {
1529        let snippet = if payload.len() > 600 {
1530            format!("{}...", &payload[..600])
1531        } else {
1532            payload.clone()
1533        };
1534        VmError::Runtime(format!(
1535            "{label} parse error at {}: {} | payload={}",
1536            tracker.path(),
1537            error,
1538            snippet
1539        ))
1540    })
1541}
1542
1543fn parse_json_value<T: for<'de> Deserialize<'de>>(value: &VmValue) -> Result<T, VmError> {
1544    parse_json_payload(vm_value_to_json(value), "orchestration")
1545}
1546
1547pub fn parse_workflow_node_value(value: &VmValue, label: &str) -> Result<WorkflowNode, VmError> {
1548    let mut node: WorkflowNode = parse_json_payload(vm_value_to_json(value), label)?;
1549    node.raw_tools = value.as_dict().and_then(|dict| dict.get("tools")).cloned();
1550    Ok(node)
1551}
1552
1553pub fn parse_workflow_node_json(
1554    json: serde_json::Value,
1555    label: &str,
1556) -> Result<WorkflowNode, VmError> {
1557    parse_json_payload(json, label)
1558}
1559
1560pub fn parse_workflow_edge_json(
1561    json: serde_json::Value,
1562    label: &str,
1563) -> Result<WorkflowEdge, VmError> {
1564    parse_json_payload(json, label)
1565}
1566
/// Parses a [`VmValue`] into a [`WorkflowGraph`] and fills in every default
/// the rest of the module relies on.
///
/// Two phases:
/// 1. If the parsed graph has no `nodes`, the value is treated as the legacy
///    act/verify/repair shorthand: top-level `act`/`verify`/`repair` dicts
///    become nodes, inheriting top-level model settings, and the standard
///    act→verify→repair edges are synthesized.
/// 2. Graph- and node-level defaults are applied: type tag, id, version,
///    entry, per-node ids/kinds, join/reduce strategies, output contracts,
///    and retry attempts.
///
/// Returns an error only if JSON deserialization of the value (or of a
/// shorthand node) fails.
pub fn normalize_workflow_value(value: &VmValue) -> Result<WorkflowGraph, VmError> {
    let mut graph: WorkflowGraph = parse_json_value(value)?;
    let as_dict = value.as_dict().cloned().unwrap_or_default();

    // Phase 1: expand the legacy act/verify/repair shorthand when the value
    // did not carry an explicit node map.
    if graph.nodes.is_empty() {
        for key in ["act", "verify", "repair"] {
            if let Some(node_value) = as_dict.get(key) {
                let mut node = parse_workflow_node_value(node_value, "orchestration")?;
                let raw_node = node_value.as_dict().cloned().unwrap_or_default();
                node.id = Some(key.to_string());
                if node.kind.is_empty() {
                    node.kind = if key == "verify" {
                        "verify".to_string()
                    } else {
                        "stage".to_string()
                    };
                }
                // Shorthand nodes inherit model settings from the top-level
                // dict when they do not set their own; empty strings are
                // treated as unset.
                if node.model_policy.provider.is_none() {
                    node.model_policy.provider = as_dict
                        .get("provider")
                        .map(|value| value.display())
                        .filter(|value| !value.is_empty());
                }
                if node.model_policy.model.is_none() {
                    node.model_policy.model = as_dict
                        .get("model")
                        .map(|value| value.display())
                        .filter(|value| !value.is_empty());
                }
                if node.model_policy.model_tier.is_none() {
                    // `tier` is accepted as an alias for `model_tier`.
                    node.model_policy.model_tier = as_dict
                        .get("model_tier")
                        .or_else(|| as_dict.get("tier"))
                        .map(|value| value.display())
                        .filter(|value| !value.is_empty());
                }
                if node.model_policy.temperature.is_none() {
                    // Accept either a float or an int for temperature.
                    node.model_policy.temperature = as_dict.get("temperature").and_then(|value| {
                        if let VmValue::Float(number) = value {
                            Some(*number)
                        } else {
                            value.as_int().map(|number| number as f64)
                        }
                    });
                }
                if node.model_policy.max_tokens.is_none() {
                    node.model_policy.max_tokens =
                        as_dict.get("max_tokens").and_then(|value| value.as_int());
                }
                if node.mode.is_none() {
                    node.mode = as_dict
                        .get("mode")
                        .map(|value| value.display())
                        .filter(|value| !value.is_empty());
                }
                if node.done_sentinel.is_none() {
                    node.done_sentinel = as_dict
                        .get("done_sentinel")
                        .map(|value| value.display())
                        .filter(|value| !value.is_empty());
                }
                // Shorthand verify nodes may spell the verification spec as
                // loose top-level keys; collect them into `node.verify`.
                if key == "verify"
                    && node.verify.is_none()
                    && (raw_node.contains_key("assert_text")
                        || raw_node.contains_key("command")
                        || raw_node.contains_key("expect_status")
                        || raw_node.contains_key("expect_text"))
                {
                    node.verify = Some(serde_json::json!({
                        "assert_text": raw_node.get("assert_text").map(vm_value_to_json),
                        "command": raw_node.get("command").map(vm_value_to_json),
                        "expect_status": raw_node.get("expect_status").map(vm_value_to_json),
                        "expect_text": raw_node.get("expect_text").map(vm_value_to_json),
                    }));
                }
                graph.nodes.insert(key.to_string(), node);
            }
        }
        if graph.entry.is_empty() && graph.nodes.contains_key("act") {
            graph.entry = "act".to_string();
        }
        // Synthesize the canonical shorthand edges:
        // act -> verify, verify -(failed)-> repair, repair -(retry)-> verify.
        if graph.edges.is_empty() && graph.nodes.contains_key("act") {
            if graph.nodes.contains_key("verify") {
                graph.edges.push(WorkflowEdge {
                    from: "act".to_string(),
                    to: "verify".to_string(),
                    branch: None,
                    label: None,
                });
            }
            if graph.nodes.contains_key("repair") {
                graph.edges.push(WorkflowEdge {
                    from: "verify".to_string(),
                    to: "repair".to_string(),
                    branch: Some("failed".to_string()),
                    label: None,
                });
                graph.edges.push(WorkflowEdge {
                    from: "repair".to_string(),
                    to: "verify".to_string(),
                    branch: Some("retry".to_string()),
                    label: None,
                });
            }
        }
    }

    // Phase 2: graph-level defaults.
    if graph.type_name.is_empty() {
        graph.type_name = "workflow_graph".to_string();
    }
    if graph.id.is_empty() {
        graph.id = new_id("workflow");
    }
    if graph.version == 0 {
        graph.version = 1;
    }
    // Fall back to the first node id (BTreeMap order) or "act" as entry.
    if graph.entry.is_empty() {
        graph.entry = graph
            .nodes
            .keys()
            .next()
            .cloned()
            .unwrap_or_else(|| "act".to_string());
    }
    // Per-node defaults.
    for (node_id, node) in &mut graph.nodes {
        // Recover the raw `tools` VmValue for nodes parsed from the nested
        // `nodes` dict (serde skips `raw_tools`, so it must be re-attached).
        if node.raw_tools.is_none() {
            node.raw_tools = as_dict
                .get("nodes")
                .and_then(|nodes| nodes.as_dict())
                .and_then(|nodes| nodes.get(node_id))
                .and_then(|node_value| node_value.as_dict())
                .and_then(|raw_node| raw_node.get("tools"))
                .cloned();
        }
        if node.id.is_none() {
            node.id = Some(node_id.clone());
        }
        if node.kind.is_empty() {
            node.kind = "stage".to_string();
        }
        if node.join_policy.strategy.is_empty() {
            node.join_policy.strategy = "all".to_string();
        }
        if node.reduce_policy.strategy.is_empty() {
            node.reduce_policy.strategy = "concat".to_string();
        }
        // Default output kind depends on the (already defaulted) node kind.
        if node.output_contract.output_kinds.is_empty() {
            node.output_contract.output_kinds = vec![match node.kind.as_str() {
                "verify" => "verification_result".to_string(),
                "reduce" => node
                    .reduce_policy
                    .output_kind
                    .clone()
                    .unwrap_or_else(|| "summary".to_string()),
                "map" => node
                    .map_policy
                    .output_kind
                    .clone()
                    .unwrap_or_else(|| "artifact".to_string()),
                "escalation" => "plan".to_string(),
                _ => "artifact".to_string(),
            }];
        }
        // Every node executes at least once.
        if node.retry_policy.max_attempts == 0 {
            node.retry_policy.max_attempts = 1;
        }
    }
    Ok(graph)
}
1736
/// Validates `graph` structurally, collecting all problems instead of
/// failing on the first one.
///
/// Errors cover: a missing entry node, edges referencing unknown nodes,
/// contradictory input contracts, kind-specific edge requirements
/// (condition/fork/map), and capability policies that do not fit inside
/// `ceiling` (when one is supplied). Warnings cover unreachable nodes,
/// under-connected join nodes, and unconstrained reduce nodes.
pub fn validate_workflow(
    graph: &WorkflowGraph,
    ceiling: Option<&CapabilityPolicy>,
) -> WorkflowValidationReport {
    let mut errors = Vec::new();
    let mut warnings = Vec::new();

    if !graph.nodes.contains_key(&graph.entry) {
        errors.push(format!("entry node does not exist: {}", graph.entry));
    }

    // Every edge endpoint must name a declared node.
    let node_ids: BTreeSet<String> = graph.nodes.keys().cloned().collect();
    for edge in &graph.edges {
        if !node_ids.contains(&edge.from) {
            errors.push(format!("edge.from references unknown node: {}", edge.from));
        }
        if !node_ids.contains(&edge.to) {
            errors.push(format!("edge.to references unknown node: {}", edge.to));
        }
    }

    // Unreachable nodes are reported as warnings, not errors.
    let reachable_nodes = reachable_nodes(graph);
    for node_id in &node_ids {
        if !reachable_nodes.contains(node_id) {
            warnings.push(format!("node is unreachable: {node_id}"));
        }
    }

    // Per-node checks: contract sanity plus kind-specific edge shape.
    for (node_id, node) in &graph.nodes {
        let incoming = graph
            .edges
            .iter()
            .filter(|edge| edge.to == *node_id)
            .count();
        let outgoing: Vec<&WorkflowEdge> = graph
            .edges
            .iter()
            .filter(|edge| edge.from == *node_id)
            .collect();
        // min/max only conflict when both bounds are present.
        if let Some(min_inputs) = node.input_contract.min_inputs {
            if let Some(max_inputs) = node.input_contract.max_inputs {
                if min_inputs > max_inputs {
                    errors.push(format!(
                        "node {node_id}: input contract min_inputs exceeds max_inputs"
                    ));
                }
            }
        }
        match node.kind.as_str() {
            // Condition nodes must provide an edge for each outcome.
            "condition" => {
                let has_true = outgoing
                    .iter()
                    .any(|edge| edge.branch.as_deref() == Some("true"));
                let has_false = outgoing
                    .iter()
                    .any(|edge| edge.branch.as_deref() == Some("false"));
                if !has_true || !has_false {
                    errors.push(format!(
                        "node {node_id}: condition nodes require both 'true' and 'false' branch edges"
                    ));
                }
            }
            // A fork with fewer than two successors is not actually forking.
            "fork" => {
                if outgoing.len() < 2 {
                    errors.push(format!(
                        "node {node_id}: fork nodes require at least two outgoing edges"
                    ));
                }
            }
            // A join with one input is legal but suspicious — warning only.
            "join" => {
                if incoming < 2 {
                    warnings.push(format!(
                        "node {node_id}: join node has fewer than two incoming edges"
                    ));
                }
            }
            // Map nodes need at least one source of items to map over.
            "map" => {
                if node.map_policy.items.is_empty()
                    && node.map_policy.item_artifact_kind.is_none()
                    && node.input_contract.input_kinds.is_empty()
                {
                    errors.push(format!(
                        "node {node_id}: map nodes require items, item_artifact_kind, or input_contract.input_kinds"
                    ));
                }
            }
            "reduce" => {
                if node.input_contract.input_kinds.is_empty() {
                    warnings.push(format!(
                        "node {node_id}: reduce node has no input_contract.input_kinds; it will consume all available artifacts"
                    ));
                }
            }
            _ => {}
        }
    }

    // `intersect` failing means the policy asks for more than the ceiling
    // grants; both the graph-level and per-node policies are checked.
    if let Some(ceiling) = ceiling {
        if let Err(error) = ceiling.intersect(&graph.capability_policy) {
            errors.push(error);
        }
        for (node_id, node) in &graph.nodes {
            if let Err(error) = ceiling.intersect(&node.capability_policy) {
                errors.push(format!("node {node_id}: {error}"));
            }
        }
    }

    WorkflowValidationReport {
        valid: errors.is_empty(),
        errors,
        warnings,
        reachable_nodes: reachable_nodes.into_iter().collect(),
    }
}
1852
1853fn reachable_nodes(graph: &WorkflowGraph) -> BTreeSet<String> {
1854    let mut seen = BTreeSet::new();
1855    let mut stack = vec![graph.entry.clone()];
1856    while let Some(node_id) = stack.pop() {
1857        if !seen.insert(node_id.clone()) {
1858            continue;
1859        }
1860        for edge in graph.edges.iter().filter(|edge| edge.from == node_id) {
1861            stack.push(edge.to.clone());
1862        }
1863    }
1864    seen
1865}
1866
/// Filters, ranks, and budget-limits artifacts for inclusion in a
/// context window according to `policy`.
///
/// Selection happens in three phases:
/// 1. filter by include/exclude kinds and include stages,
/// 2. sort best-first (pinned > prioritized kind > priority > freshness >
///    recency > relevance > smaller token estimate),
/// 3. greedily take artifacts while respecting `max_artifacts` and the
///    token budget (`max_tokens` minus `reserve_tokens`).
pub fn select_artifacts(
    mut artifacts: Vec<ArtifactRecord>,
    policy: &ContextPolicy,
) -> Vec<ArtifactRecord> {
    artifacts.retain(|artifact| {
        (policy.include_kinds.is_empty() || policy.include_kinds.contains(&artifact.kind))
            && !policy.exclude_kinds.contains(&artifact.kind)
            && (policy.include_stages.is_empty()
                || artifact
                    .stage
                    .as_ref()
                    .is_some_and(|stage| policy.include_stages.contains(stage)))
    });
    // Comparisons are deliberately written `b` vs `a` so the sort is
    // descending: best candidates come first.
    artifacts.sort_by(|a, b| {
        let b_pinned = policy.pinned_ids.contains(&b.id);
        let a_pinned = policy.pinned_ids.contains(&a.id);
        b_pinned
            .cmp(&a_pinned)
            .then_with(|| {
                let b_prio_kind = policy.prioritize_kinds.contains(&b.kind);
                let a_prio_kind = policy.prioritize_kinds.contains(&a.kind);
                b_prio_kind.cmp(&a_prio_kind)
            })
            .then_with(|| {
                b.priority
                    .unwrap_or_default()
                    .cmp(&a.priority.unwrap_or_default())
            })
            .then_with(|| {
                if policy.prefer_fresh {
                    freshness_rank(b.freshness.as_deref())
                        .cmp(&freshness_rank(a.freshness.as_deref()))
                } else {
                    std::cmp::Ordering::Equal
                }
            })
            .then_with(|| {
                if policy.prefer_recent {
                    b.created_at.cmp(&a.created_at)
                } else {
                    std::cmp::Ordering::Equal
                }
            })
            .then_with(|| {
                // NaN-safe: incomparable relevances are treated as equal.
                b.relevance
                    .partial_cmp(&a.relevance)
                    .unwrap_or(std::cmp::Ordering::Equal)
            })
            .then_with(|| {
                // Final tiebreak prefers cheaper artifacts: ascending token
                // estimate, with unknown cost sorting last.
                a.estimated_tokens
                    .unwrap_or(usize::MAX)
                    .cmp(&b.estimated_tokens.unwrap_or(usize::MAX))
            })
    });

    let mut selected = Vec::new();
    let mut used_tokens = 0usize;
    let reserve_tokens = policy.reserve_tokens.unwrap_or(0);
    // The usable budget is max_tokens minus the reservation (saturating,
    // so an over-large reservation yields a zero budget, not underflow).
    let effective_max_tokens = policy
        .max_tokens
        .map(|max| max.saturating_sub(reserve_tokens));
    for artifact in artifacts {
        if let Some(max_artifacts) = policy.max_artifacts {
            if selected.len() >= max_artifacts {
                break;
            }
        }
        let next_tokens = artifact.estimated_tokens.unwrap_or(0);
        if let Some(max_tokens) = effective_max_tokens {
            // `continue` (not `break`): a later, smaller artifact may still
            // fit within the remaining budget.
            if used_tokens + next_tokens > max_tokens {
                continue;
            }
        }
        used_tokens += next_tokens;
        selected.push(artifact);
    }
    selected
}
1945
1946pub fn render_artifacts_context(artifacts: &[ArtifactRecord], policy: &ContextPolicy) -> String {
1947    let mut parts = Vec::new();
1948    for artifact in artifacts {
1949        let title = artifact
1950            .title
1951            .clone()
1952            .unwrap_or_else(|| format!("{} {}", artifact.kind, artifact.id));
1953        let body = artifact
1954            .text
1955            .clone()
1956            .or_else(|| artifact.data.as_ref().map(|v| v.to_string()))
1957            .unwrap_or_default();
1958        match policy.render.as_deref() {
1959            Some("json") => {
1960                parts.push(
1961                    serde_json::json!({
1962                        "id": artifact.id,
1963                        "kind": artifact.kind,
1964                        "title": title,
1965                        "source": artifact.source,
1966                        "freshness": artifact.freshness,
1967                        "priority": artifact.priority,
1968                        "text": body,
1969                    })
1970                    .to_string(),
1971                );
1972            }
1973            _ => parts.push(format!(
1974                "[{title}] kind={} source={} freshness={} priority={}\n{}",
1975                artifact.kind,
1976                artifact
1977                    .source
1978                    .clone()
1979                    .unwrap_or_else(|| "unknown".to_string()),
1980                artifact
1981                    .freshness
1982                    .clone()
1983                    .unwrap_or_else(|| "normal".to_string()),
1984                artifact.priority.unwrap_or_default(),
1985                body
1986            )),
1987        }
1988    }
1989    parts.join("\n\n")
1990}
1991
1992pub fn normalize_artifact(value: &VmValue) -> Result<ArtifactRecord, VmError> {
1993    let artifact: ArtifactRecord = parse_json_value(value)?;
1994    Ok(artifact.normalize())
1995}
1996
1997pub fn normalize_run_record(value: &VmValue) -> Result<RunRecord, VmError> {
1998    let json = vm_value_to_json(value);
1999    let payload = json.to_string();
2000    let mut deserializer = serde_json::Deserializer::from_str(&payload);
2001    let mut tracker = serde_path_to_error::Track::new();
2002    let path_deserializer = serde_path_to_error::Deserializer::new(&mut deserializer, &mut tracker);
2003    let mut run: RunRecord = RunRecord::deserialize(path_deserializer).map_err(|error| {
2004        let snippet = if payload.len() > 600 {
2005            format!("{}...", &payload[..600])
2006        } else {
2007            payload.clone()
2008        };
2009        VmError::Runtime(format!(
2010            "orchestration parse error at {}: {} | payload={}",
2011            tracker.path(),
2012            error,
2013            snippet
2014        ))
2015    })?;
2016    if run.type_name.is_empty() {
2017        run.type_name = "run_record".to_string();
2018    }
2019    if run.id.is_empty() {
2020        run.id = new_id("run");
2021    }
2022    if run.started_at.is_empty() {
2023        run.started_at = now_rfc3339();
2024    }
2025    if run.status.is_empty() {
2026        run.status = "running".to_string();
2027    }
2028    if run.root_run_id.is_none() {
2029        run.root_run_id = Some(run.id.clone());
2030    }
2031    if run.replay_fixture.is_none() {
2032        run.replay_fixture = Some(replay_fixture_from_run(&run));
2033    }
2034    Ok(run)
2035}
2036
2037pub fn normalize_eval_suite_manifest(value: &VmValue) -> Result<EvalSuiteManifest, VmError> {
2038    let mut manifest: EvalSuiteManifest = parse_json_value(value)?;
2039    if manifest.type_name.is_empty() {
2040        manifest.type_name = "eval_suite_manifest".to_string();
2041    }
2042    if manifest.id.is_empty() {
2043        manifest.id = new_id("eval_suite");
2044    }
2045    Ok(manifest)
2046}
2047
2048fn load_replay_fixture(path: &Path) -> Result<ReplayFixture, VmError> {
2049    let content = std::fs::read_to_string(path)
2050        .map_err(|e| VmError::Runtime(format!("failed to read replay fixture: {e}")))?;
2051    serde_json::from_str(&content)
2052        .map_err(|e| VmError::Runtime(format!("failed to parse replay fixture: {e}")))
2053}
2054
/// Resolves a manifest-relative path: absolute paths pass through
/// unchanged; relative paths are joined onto `base_dir` when one is given.
fn resolve_manifest_path(base_dir: Option<&Path>, path: &str) -> PathBuf {
    let candidate = PathBuf::from(path);
    if candidate.is_absolute() {
        return candidate;
    }
    match base_dir {
        Some(base) => base.join(candidate),
        None => candidate,
    }
}
2065
/// Runs every case in an eval-suite manifest and aggregates the results.
///
/// For each case this loads the recorded run, picks a replay fixture
/// (explicit `fixture_path`, else the fixture embedded in the run, else
/// one derived from the run itself), evaluates the run against it, and —
/// when `compare_to` names a baseline run — additionally diffs the two
/// runs, failing the case on any difference.
///
/// # Errors
/// Propagates I/O or parse failures from loading any referenced file.
pub fn evaluate_run_suite_manifest(
    manifest: &EvalSuiteManifest,
) -> Result<ReplayEvalSuiteReport, VmError> {
    // Relative paths in the manifest resolve against its base_dir.
    let base_dir = manifest.base_dir.as_deref().map(Path::new);
    let mut reports = Vec::new();
    for case in &manifest.cases {
        let run_path = resolve_manifest_path(base_dir, &case.run_path);
        let run = load_run_record(&run_path)?;
        let fixture = match &case.fixture_path {
            Some(path) => load_replay_fixture(&resolve_manifest_path(base_dir, path))?,
            None => run
                .replay_fixture
                .clone()
                .unwrap_or_else(|| replay_fixture_from_run(&run)),
        };
        let eval = evaluate_run_against_fixture(&run, &fixture);
        let mut pass = eval.pass;
        let mut failures = eval.failures;
        // Optional baseline comparison: any structural difference fails
        // the case in addition to the fixture assertions above.
        let comparison = match &case.compare_to {
            Some(path) => {
                let baseline_path = resolve_manifest_path(base_dir, path);
                let baseline = load_run_record(&baseline_path)?;
                let diff = diff_run_records(&baseline, &run);
                if !diff.identical {
                    pass = false;
                    failures.push(format!(
                        "run differs from baseline {} with {} stage changes",
                        baseline_path.display(),
                        diff.stage_diffs.len()
                    ));
                }
                Some(diff)
            }
            None => None,
        };
        reports.push(ReplayEvalCaseReport {
            run_id: run.id.clone(),
            workflow_id: run.workflow_id.clone(),
            label: case.label.clone(),
            pass,
            failures,
            stage_count: eval.stage_count,
            source_path: Some(run_path.display().to_string()),
            comparison,
        });
    }
    let total = reports.len();
    let passed = reports.iter().filter(|report| report.pass).count();
    let failed = total.saturating_sub(passed);
    Ok(ReplayEvalSuiteReport {
        pass: failed == 0,
        total,
        passed,
        failed,
        cases: reports,
    })
}
2123
/// Renders a simple unified-style line diff (header plus `-`/`+`/` `
/// lines, without hunk markers) between `before` and `after`.
///
/// Uses a suffix LCS table so common lines are kept as context while
/// removals and additions are interleaved in order. `path` labels the
/// `--- a/…` / `+++ b/…` header, defaulting to "artifact".
pub fn render_unified_diff(path: Option<&str>, before: &str, after: &str) -> String {
    let old: Vec<&str> = before.lines().collect();
    let new: Vec<&str> = after.lines().collect();
    // lcs[i][j] = length of the longest common subsequence of old[i..]
    // and new[j..]; filled back-to-front.
    let mut lcs = vec![vec![0usize; new.len() + 1]; old.len() + 1];
    for i in (0..old.len()).rev() {
        for j in (0..new.len()).rev() {
            lcs[i][j] = if old[i] == new[j] {
                lcs[i + 1][j + 1] + 1
            } else {
                lcs[i + 1][j].max(lcs[i][j + 1])
            };
        }
    }

    let label = path.unwrap_or("artifact");
    let mut out = format!("--- a/{label}\n+++ b/{label}\n");
    let (mut i, mut j) = (0usize, 0usize);
    while i < old.len() && j < new.len() {
        if old[i] == new[j] {
            // Common line: emit as context and advance both sides.
            out.push_str(&format!(" {}\n", old[i]));
            i += 1;
            j += 1;
        } else if lcs[i + 1][j] >= lcs[i][j + 1] {
            out.push_str(&format!("-{}\n", old[i]));
            i += 1;
        } else {
            out.push_str(&format!("+{}\n", new[j]));
            j += 1;
        }
    }
    // One side is exhausted: flush the remainder of the other.
    for line in &old[i..] {
        out.push_str(&format!("-{line}\n"));
    }
    for line in &new[j..] {
        out.push_str(&format!("+{line}\n"));
    }
    out
}
2166
2167pub fn save_run_record(run: &RunRecord, path: Option<&str>) -> Result<String, VmError> {
2168    let path = path
2169        .map(PathBuf::from)
2170        .unwrap_or_else(|| default_run_dir().join(format!("{}.json", run.id)));
2171    if let Some(parent) = path.parent() {
2172        std::fs::create_dir_all(parent)
2173            .map_err(|e| VmError::Runtime(format!("failed to create run directory: {e}")))?;
2174    }
2175    let json = serde_json::to_string_pretty(run)
2176        .map_err(|e| VmError::Runtime(format!("failed to encode run record: {e}")))?;
2177    // Atomic write: write to .tmp then rename to prevent corruption on kill.
2178    let tmp_path = path.with_extension("json.tmp");
2179    std::fs::write(&tmp_path, &json)
2180        .map_err(|e| VmError::Runtime(format!("failed to persist run record: {e}")))?;
2181    std::fs::rename(&tmp_path, &path).map_err(|e| {
2182        // Fallback: if rename fails (cross-device), write directly.
2183        let _ = std::fs::write(&path, &json);
2184        VmError::Runtime(format!("failed to finalize run record: {e}"))
2185    })?;
2186    Ok(path.to_string_lossy().to_string())
2187}
2188
2189pub fn load_run_record(path: &Path) -> Result<RunRecord, VmError> {
2190    let content = std::fs::read_to_string(path)
2191        .map_err(|e| VmError::Runtime(format!("failed to read run record: {e}")))?;
2192    serde_json::from_str(&content)
2193        .map_err(|e| VmError::Runtime(format!("failed to parse run record: {e}")))
2194}
2195
2196pub fn replay_fixture_from_run(run: &RunRecord) -> ReplayFixture {
2197    ReplayFixture {
2198        type_name: "replay_fixture".to_string(),
2199        id: new_id("fixture"),
2200        source_run_id: run.id.clone(),
2201        workflow_id: run.workflow_id.clone(),
2202        workflow_name: run.workflow_name.clone(),
2203        created_at: now_rfc3339(),
2204        expected_status: run.status.clone(),
2205        stage_assertions: run
2206            .stages
2207            .iter()
2208            .map(|stage| ReplayStageAssertion {
2209                node_id: stage.node_id.clone(),
2210                expected_status: stage.status.clone(),
2211                expected_outcome: stage.outcome.clone(),
2212                expected_branch: stage.branch.clone(),
2213                required_artifact_kinds: stage
2214                    .artifacts
2215                    .iter()
2216                    .map(|artifact| artifact.kind.clone())
2217                    .collect(),
2218                visible_text_contains: stage
2219                    .visible_text
2220                    .as_ref()
2221                    .filter(|text| !text.is_empty())
2222                    .map(|text| text.chars().take(80).collect()),
2223            })
2224            .collect(),
2225    }
2226}
2227
/// Checks `run` against a replay fixture's assertions and returns a
/// report listing every failure found (the run passes only when none).
///
/// Verifies the overall run status, then for each stage assertion:
/// stage presence, status, outcome, branch, the presence of each
/// required artifact kind, and — when the fixture recorded one — that
/// the stage's visible text contains the expected snippet.
pub fn evaluate_run_against_fixture(run: &RunRecord, fixture: &ReplayFixture) -> ReplayEvalReport {
    let mut failures = Vec::new();
    if run.status != fixture.expected_status {
        failures.push(format!(
            "run status mismatch: expected {}, got {}",
            fixture.expected_status, run.status
        ));
    }
    for assertion in &fixture.stage_assertions {
        // A missing stage fails outright; its remaining checks are skipped.
        let Some(stage) = run
            .stages
            .iter()
            .find(|stage| stage.node_id == assertion.node_id)
        else {
            failures.push(format!("missing stage {}", assertion.node_id));
            continue;
        };
        if stage.status != assertion.expected_status {
            failures.push(format!(
                "stage {} status mismatch: expected {}, got {}",
                assertion.node_id, assertion.expected_status, stage.status
            ));
        }
        if stage.outcome != assertion.expected_outcome {
            failures.push(format!(
                "stage {} outcome mismatch: expected {}, got {}",
                assertion.node_id, assertion.expected_outcome, stage.outcome
            ));
        }
        if stage.branch != assertion.expected_branch {
            failures.push(format!(
                "stage {} branch mismatch: expected {:?}, got {:?}",
                assertion.node_id, assertion.expected_branch, stage.branch
            ));
        }
        // Each required artifact kind must appear at least once.
        for required_kind in &assertion.required_artifact_kinds {
            if !stage
                .artifacts
                .iter()
                .any(|artifact| &artifact.kind == required_kind)
            {
                failures.push(format!(
                    "stage {} missing artifact kind {}",
                    assertion.node_id, required_kind
                ));
            }
        }
        // Substring match only — the fixture stores a truncated snippet.
        if let Some(snippet) = &assertion.visible_text_contains {
            let actual = stage.visible_text.clone().unwrap_or_default();
            if !actual.contains(snippet) {
                failures.push(format!(
                    "stage {} visible text does not contain expected snippet {:?}",
                    assertion.node_id, snippet
                ));
            }
        }
    }

    ReplayEvalReport {
        pass: failures.is_empty(),
        failures,
        stage_count: run.stages.len(),
    }
}
2292
2293pub fn evaluate_run_suite(
2294    cases: Vec<(RunRecord, ReplayFixture, Option<String>)>,
2295) -> ReplayEvalSuiteReport {
2296    let mut reports = Vec::new();
2297    for (run, fixture, source_path) in cases {
2298        let report = evaluate_run_against_fixture(&run, &fixture);
2299        reports.push(ReplayEvalCaseReport {
2300            run_id: run.id.clone(),
2301            workflow_id: run.workflow_id.clone(),
2302            label: None,
2303            pass: report.pass,
2304            failures: report.failures,
2305            stage_count: report.stage_count,
2306            source_path,
2307            comparison: None,
2308        });
2309    }
2310    let total = reports.len();
2311    let passed = reports.iter().filter(|report| report.pass).count();
2312    let failed = total.saturating_sub(passed);
2313    ReplayEvalSuiteReport {
2314        pass: failed == 0,
2315        total,
2316        passed,
2317        failed,
2318        cases: reports,
2319    }
2320}
2321
/// Produces a stage-by-stage structural diff between two run records.
///
/// Stages are matched by node id across both runs; each mismatch is
/// reported as "added", "removed", or "changed" (with one detail line
/// per changed field). The runs count as identical only when statuses
/// match, no stage differs, and the transition/artifact/checkpoint
/// counts agree.
pub fn diff_run_records(left: &RunRecord, right: &RunRecord) -> RunDiffReport {
    let mut stage_diffs = Vec::new();
    // Union of node ids from both runs so additions and removals are
    // both visited.
    let mut all_node_ids = BTreeSet::new();
    all_node_ids.extend(left.stages.iter().map(|stage| stage.node_id.clone()));
    all_node_ids.extend(right.stages.iter().map(|stage| stage.node_id.clone()));

    for node_id in all_node_ids {
        let left_stage = left.stages.iter().find(|stage| stage.node_id == node_id);
        let right_stage = right.stages.iter().find(|stage| stage.node_id == node_id);
        match (left_stage, right_stage) {
            (Some(_), None) => stage_diffs.push(RunStageDiffRecord {
                node_id,
                change: "removed".to_string(),
                details: vec!["stage missing from right run".to_string()],
            }),
            (None, Some(_)) => stage_diffs.push(RunStageDiffRecord {
                node_id,
                change: "added".to_string(),
                details: vec!["stage missing from left run".to_string()],
            }),
            // Present in both: compare field by field; only emit a record
            // when at least one detail differs.
            (Some(left_stage), Some(right_stage)) => {
                let mut details = Vec::new();
                if left_stage.status != right_stage.status {
                    details.push(format!(
                        "status: {} -> {}",
                        left_stage.status, right_stage.status
                    ));
                }
                if left_stage.outcome != right_stage.outcome {
                    details.push(format!(
                        "outcome: {} -> {}",
                        left_stage.outcome, right_stage.outcome
                    ));
                }
                if left_stage.branch != right_stage.branch {
                    details.push(format!(
                        "branch: {:?} -> {:?}",
                        left_stage.branch, right_stage.branch
                    ));
                }
                // Artifact comparisons are count-based only, not content-based.
                if left_stage.produced_artifact_ids.len() != right_stage.produced_artifact_ids.len()
                {
                    details.push(format!(
                        "produced_artifacts: {} -> {}",
                        left_stage.produced_artifact_ids.len(),
                        right_stage.produced_artifact_ids.len()
                    ));
                }
                if left_stage.artifacts.len() != right_stage.artifacts.len() {
                    details.push(format!(
                        "artifact_records: {} -> {}",
                        left_stage.artifacts.len(),
                        right_stage.artifacts.len()
                    ));
                }
                if !details.is_empty() {
                    stage_diffs.push(RunStageDiffRecord {
                        node_id,
                        change: "changed".to_string(),
                        details,
                    });
                }
            }
            // Unreachable in practice: ids come from the union above.
            (None, None) => {}
        }
    }

    let status_changed = left.status != right.status;
    let identical = !status_changed
        && stage_diffs.is_empty()
        && left.transitions.len() == right.transitions.len()
        && left.artifacts.len() == right.artifacts.len()
        && left.checkpoints.len() == right.checkpoints.len();

    RunDiffReport {
        left_run_id: left.id.clone(),
        right_run_id: right.id.clone(),
        identical,
        status_changed,
        left_status: left.status.clone(),
        right_status: right.status.clone(),
        stage_diffs,
        // Deltas are right minus left, so positive means the right run
        // has more.
        transition_count_delta: right.transitions.len() as isize - left.transitions.len() as isize,
        artifact_count_delta: right.artifacts.len() as isize - left.artifacts.len() as isize,
        checkpoint_count_delta: right.checkpoints.len() as isize - left.checkpoints.len() as isize,
    }
}
2409
2410pub fn push_execution_policy(policy: CapabilityPolicy) {
2411    EXECUTION_POLICY_STACK.with(|stack| stack.borrow_mut().push(policy));
2412}
2413
2414pub fn pop_execution_policy() {
2415    EXECUTION_POLICY_STACK.with(|stack| {
2416        stack.borrow_mut().pop();
2417    });
2418}
2419
2420pub fn current_execution_policy() -> Option<CapabilityPolicy> {
2421    EXECUTION_POLICY_STACK.with(|stack| stack.borrow().last().cloned())
2422}
2423
2424pub fn current_tool_metadata(tool: &str) -> Option<ToolRuntimePolicyMetadata> {
2425    current_execution_policy().and_then(|policy| policy.tool_metadata.get(tool).cloned())
2426}
2427
2428fn policy_allows_tool(policy: &CapabilityPolicy, tool: &str) -> bool {
2429    policy.tools.is_empty() || policy.tools.iter().any(|allowed| allowed == tool)
2430}
2431
2432fn policy_allows_capability(policy: &CapabilityPolicy, capability: &str, op: &str) -> bool {
2433    policy.capabilities.is_empty()
2434        || policy
2435            .capabilities
2436            .get(capability)
2437            .is_some_and(|ops| ops.is_empty() || ops.iter().any(|allowed| allowed == op))
2438}
2439
2440fn policy_allows_side_effect(policy: &CapabilityPolicy, requested: &str) -> bool {
2441    fn rank(v: &str) -> usize {
2442        match v {
2443            "none" => 0,
2444            "read_only" => 1,
2445            "workspace_write" => 2,
2446            "process_exec" => 3,
2447            "network" => 4,
2448            _ => 5,
2449        }
2450    }
2451    policy
2452        .side_effect_level
2453        .as_ref()
2454        .map(|allowed| rank(allowed) >= rank(requested))
2455        .unwrap_or(true)
2456}
2457
2458fn reject_policy(reason: String) -> Result<(), VmError> {
2459    Err(VmError::CategorizedError {
2460        message: reason,
2461        category: crate::value::ErrorCategory::ToolRejected,
2462    })
2463}
2464
/// Heuristically classifies a tool's mutation level from its name when
/// the policy declares no explicit classification. Matching is
/// case-insensitive; the first matching rule wins.
fn fallback_mutation_classification(tool_name: &str) -> String {
    let lower = tool_name.to_ascii_lowercase();
    let classification = if lower.starts_with("mcp_") {
        // MCP tools: the host decides what they mutate.
        "host_defined"
    } else if matches!(
        lower.as_str(),
        "exec" | "shell" | "exec_at" | "shell_at" | "run"
    ) || lower.starts_with("run_")
    {
        "ambient_side_effect"
    } else if ["delete", "remove", "move", "rename"]
        .iter()
        .any(|prefix| lower.starts_with(*prefix))
    {
        "destructive"
    } else if ["write", "edit", "patch", "create", "scaffold"]
        .iter()
        .any(|needle| lower.contains(*needle))
        || lower.starts_with("insert")
        || lower.starts_with("replace")
        || lower == "add_import"
    {
        "apply_workspace"
    } else {
        "read_only"
    };
    classification.to_string()
}
2499
2500pub fn current_tool_mutation_classification(tool_name: &str) -> String {
2501    current_tool_metadata(tool_name)
2502        .and_then(|metadata| metadata.mutation_classification)
2503        .unwrap_or_else(|| fallback_mutation_classification(tool_name))
2504}
2505
2506pub fn current_tool_declared_paths(tool_name: &str, args: &serde_json::Value) -> Vec<String> {
2507    let Some(map) = args.as_object() else {
2508        return Vec::new();
2509    };
2510    let path_keys = current_tool_metadata(tool_name)
2511        .map(|metadata| metadata.path_params)
2512        .filter(|keys| !keys.is_empty())
2513        .unwrap_or_else(|| {
2514            vec![
2515                "path".to_string(),
2516                "file".to_string(),
2517                "cwd".to_string(),
2518                "repo".to_string(),
2519                "target".to_string(),
2520                "destination".to_string(),
2521            ]
2522        });
2523    let mut paths = Vec::new();
2524    for key in path_keys {
2525        if let Some(value) = map.get(&key).and_then(|value| value.as_str()) {
2526            if !value.is_empty() {
2527                paths.push(value.to_string());
2528            }
2529        }
2530    }
2531    if let Some(items) = map.get("paths").and_then(|value| value.as_array()) {
2532        for item in items {
2533            if let Some(value) = item.as_str() {
2534                if !value.is_empty() {
2535                    paths.push(value.to_string());
2536                }
2537            }
2538        }
2539    }
2540    paths.sort();
2541    paths.dedup();
2542    paths
2543}
2544
/// Gate a VM builtin invocation against the active execution policy.
///
/// Returns `Ok(())` when no policy is installed, or when the builtin fits
/// within the policy's tool, capability, and side-effect ceilings; otherwise
/// returns the rejection error produced by `reject_policy`. Builtins not
/// matched by any arm are allowed unconditionally.
pub fn enforce_current_policy_for_builtin(name: &str, args: &[VmValue]) -> Result<(), VmError> {
    // No active policy means nothing to enforce.
    let Some(policy) = current_execution_policy() else {
        return Ok(());
    };
    match name {
        // File-read builtins: tool ceiling plus workspace.read_text.
        "read" | "read_file" => {
            if !policy_allows_tool(&policy, name)
                || !policy_allows_capability(&policy, "workspace", "read_text")
            {
                return reject_policy(format!(
                    "builtin '{name}' exceeds workspace.read_text ceiling"
                ));
            }
        }
        // Enumeration builtins: tool ceiling plus workspace.list.
        "search" | "list_dir" => {
            if !policy_allows_tool(&policy, name)
                || !policy_allows_capability(&policy, "workspace", "list")
            {
                return reject_policy(format!("builtin '{name}' exceeds workspace.list ceiling"));
            }
        }
        // Existence probes only need the workspace.exists capability.
        "file_exists" | "stat" => {
            if !policy_allows_capability(&policy, "workspace", "exists") {
                return reject_policy(format!("builtin '{name}' exceeds workspace.exists ceiling"));
            }
        }
        // Mutating file builtins are all gated behind the 'edit' tool,
        // workspace.write_text, and the workspace_write side effect.
        "edit" | "write_file" | "append_file" | "mkdir" | "copy_file" => {
            if !policy_allows_tool(&policy, "edit")
                || !policy_allows_capability(&policy, "workspace", "write_text")
                || !policy_allows_side_effect(&policy, "workspace_write")
            {
                return reject_policy(format!("builtin '{name}' exceeds workspace write ceiling"));
            }
        }
        // Deletion has its own capability but shares the write side effect.
        "delete_file" => {
            if !policy_allows_capability(&policy, "workspace", "delete")
                || !policy_allows_side_effect(&policy, "workspace_write")
            {
                return reject_policy(
                    "builtin 'delete_file' exceeds workspace.delete ceiling".to_string(),
                );
            }
        }
        // Structured edits likewise use a dedicated capability.
        "apply_edit" => {
            if !policy_allows_capability(&policy, "workspace", "apply_edit")
                || !policy_allows_side_effect(&policy, "workspace_write")
            {
                return reject_policy(
                    "builtin 'apply_edit' exceeds workspace.apply_edit ceiling".to_string(),
                );
            }
        }
        // Process execution: 'run' tool, process.exec, process_exec effect.
        "exec" | "exec_at" | "shell" | "shell_at" | "run_command" => {
            if !policy_allows_tool(&policy, "run")
                || !policy_allows_capability(&policy, "process", "exec")
                || !policy_allows_side_effect(&policy, "process_exec")
            {
                return reject_policy(format!("builtin '{name}' exceeds process.exec ceiling"));
            }
        }
        // HTTP builtins only check the network side-effect ceiling.
        "http_get" | "http_post" | "http_put" | "http_patch" | "http_delete" | "http_request" => {
            if !policy_allows_side_effect(&policy, "network") {
                return reject_policy(format!("builtin '{name}' exceeds network ceiling"));
            }
        }
        // MCP builtins are treated like process execution so they cannot be
        // used as an escape hatch around the process ceilings.
        "mcp_connect"
        | "mcp_call"
        | "mcp_list_tools"
        | "mcp_list_resources"
        | "mcp_list_resource_templates"
        | "mcp_read_resource"
        | "mcp_list_prompts"
        | "mcp_get_prompt"
        | "mcp_server_info"
        | "mcp_disconnect" => {
            if !policy_allows_tool(&policy, "run")
                || !policy_allows_capability(&policy, "process", "exec")
                || !policy_allows_side_effect(&policy, "process_exec")
            {
                return reject_policy(format!("builtin '{name}' exceeds process.exec ceiling"));
            }
        }
        // host_call is checked dynamically: the first argument must be a
        // "capability.operation" name, which is then validated against the
        // capability ceiling and mapped onto a side-effect requirement.
        "host_call" => {
            // Shadows the outer `name` with the requested host-call name.
            let name = args.first().map(|v| v.display()).unwrap_or_default();
            let Some((capability, op)) = name.split_once('.') else {
                return reject_policy(format!(
                    "host_call '{name}' must use capability.operation naming"
                ));
            };
            if !policy_allows_capability(&policy, capability, op) {
                return reject_policy(format!(
                    "host_call {capability}.{op} exceeds capability ceiling"
                ));
            }
            // Map known mutating operations to their side-effect class;
            // everything else is treated as read-only.
            let requested_side_effect = match (capability, op) {
                ("workspace", "write_text" | "apply_edit" | "delete") => "workspace_write",
                ("process", "exec") => "process_exec",
                _ => "read_only",
            };
            if !policy_allows_side_effect(&policy, requested_side_effect) {
                return reject_policy(format!(
                    "host_call {capability}.{op} exceeds side-effect ceiling"
                ));
            }
        }
        // Unlisted builtins are not policy-gated.
        _ => {}
    }
    Ok(())
}
2654
2655pub fn enforce_current_policy_for_bridge_builtin(name: &str) -> Result<(), VmError> {
2656    if current_execution_policy().is_some() {
2657        return reject_policy(format!(
2658            "bridged builtin '{name}' exceeds execution policy; declare an explicit capability/tool surface instead"
2659        ));
2660    }
2661    Ok(())
2662}
2663
2664pub fn enforce_current_policy_for_tool(tool_name: &str) -> Result<(), VmError> {
2665    let Some(policy) = current_execution_policy() else {
2666        return Ok(());
2667    };
2668    if !policy_allows_tool(&policy, tool_name) {
2669        return reject_policy(format!("tool '{tool_name}' exceeds tool ceiling"));
2670    }
2671    if let Some(metadata) = policy.tool_metadata.get(tool_name) {
2672        for (capability, ops) in &metadata.capabilities {
2673            for op in ops {
2674                if !policy_allows_capability(&policy, capability, op) {
2675                    return reject_policy(format!(
2676                        "tool '{tool_name}' exceeds capability ceiling: {capability}.{op}"
2677                    ));
2678                }
2679            }
2680        }
2681        if let Some(side_effect_level) = metadata.side_effect_level.as_deref() {
2682            if !policy_allows_side_effect(&policy, side_effect_level) {
2683                return reject_policy(format!(
2684                    "tool '{tool_name}' exceeds side-effect ceiling: {side_effect_level}"
2685                ));
2686            }
2687        }
2688    }
2689    Ok(())
2690}
2691
2692fn compact_transcript(transcript: &VmValue, keep_last: usize) -> Option<VmValue> {
2693    let dict = transcript.as_dict()?;
2694    let messages = match dict.get("messages") {
2695        Some(VmValue::List(list)) => list.iter().cloned().collect::<Vec<_>>(),
2696        _ => Vec::new(),
2697    };
2698    let retained = messages
2699        .into_iter()
2700        .rev()
2701        .take(keep_last)
2702        .collect::<Vec<_>>()
2703        .into_iter()
2704        .rev()
2705        .collect::<Vec<_>>();
2706    let mut compacted = dict.clone();
2707    compacted.insert(
2708        "messages".to_string(),
2709        VmValue::List(Rc::new(retained.clone())),
2710    );
2711    compacted.insert(
2712        "events".to_string(),
2713        VmValue::List(Rc::new(
2714            crate::llm::helpers::transcript_events_from_messages(&retained),
2715        )),
2716    );
2717    Some(VmValue::Dict(Rc::new(compacted)))
2718}
2719
2720fn redact_transcript_visibility(transcript: &VmValue, visibility: Option<&str>) -> Option<VmValue> {
2721    let Some(visibility) = visibility else {
2722        return Some(transcript.clone());
2723    };
2724    if visibility != "public" && visibility != "public_only" {
2725        return Some(transcript.clone());
2726    }
2727    let dict = transcript.as_dict()?;
2728    let public_messages = match dict.get("messages") {
2729        Some(VmValue::List(list)) => list
2730            .iter()
2731            .filter(|message| {
2732                message
2733                    .as_dict()
2734                    .and_then(|d| d.get("role"))
2735                    .map(|v| v.display())
2736                    .map(|role| role != "tool_result")
2737                    .unwrap_or(true)
2738            })
2739            .cloned()
2740            .collect::<Vec<_>>(),
2741        _ => Vec::new(),
2742    };
2743    let public_events = match dict.get("events") {
2744        Some(VmValue::List(list)) => list
2745            .iter()
2746            .filter(|event| {
2747                event
2748                    .as_dict()
2749                    .and_then(|d| d.get("visibility"))
2750                    .map(|v| v.display())
2751                    .map(|value| value == "public")
2752                    .unwrap_or(true)
2753            })
2754            .cloned()
2755            .collect::<Vec<_>>(),
2756        _ => Vec::new(),
2757    };
2758    let mut redacted = dict.clone();
2759    redacted.insert(
2760        "messages".to_string(),
2761        VmValue::List(Rc::new(public_messages)),
2762    );
2763    redacted.insert("events".to_string(), VmValue::List(Rc::new(public_events)));
2764    Some(VmValue::Dict(Rc::new(redacted)))
2765}
2766
2767pub(crate) fn apply_input_transcript_policy(
2768    transcript: Option<VmValue>,
2769    policy: &TranscriptPolicy,
2770) -> Option<VmValue> {
2771    let mut transcript = transcript;
2772    match policy.mode.as_deref() {
2773        Some("reset") => return None,
2774        Some("fork") => {
2775            if let Some(VmValue::Dict(dict)) = transcript.as_ref() {
2776                let mut forked = dict.as_ref().clone();
2777                forked.insert(
2778                    "id".to_string(),
2779                    VmValue::String(Rc::from(new_id("transcript"))),
2780                );
2781                transcript = Some(VmValue::Dict(Rc::new(forked)));
2782            }
2783        }
2784        _ => {}
2785    }
2786    if policy.compact {
2787        let keep_last = policy.keep_last.unwrap_or(6);
2788        transcript = transcript.and_then(|value| compact_transcript(&value, keep_last));
2789    }
2790    transcript
2791}
2792
2793fn apply_output_transcript_policy(
2794    transcript: Option<VmValue>,
2795    policy: &TranscriptPolicy,
2796) -> Option<VmValue> {
2797    let mut transcript = transcript;
2798    if policy.compact {
2799        let keep_last = policy.keep_last.unwrap_or(6);
2800        transcript = transcript.and_then(|value| compact_transcript(&value, keep_last));
2801    }
2802    transcript.and_then(|value| redact_transcript_visibility(&value, policy.visibility.as_deref()))
2803}
2804
/// Execute one workflow node against the current task, artifact pool, and
/// (optional) transcript.
///
/// Selects input artifacts under the node's context policy, enforces the
/// node's input contract, then either runs a shell verification command
/// (`kind == "verify"`) or performs an LLM / agent-loop call. Returns the
/// raw result payload, a single normalized output artifact, and the
/// post-policy transcript.
///
/// # Errors
/// Returns `VmError::Runtime` when the input contract is violated or the
/// verify command fails to spawn; LLM/agent errors are propagated as-is.
pub async fn execute_stage_node(
    node_id: &str,
    node: &WorkflowNode,
    task: &str,
    artifacts: &[ArtifactRecord],
    transcript: Option<VmValue>,
) -> Result<(serde_json::Value, Vec<ArtifactRecord>, Option<VmValue>), VmError> {
    // When the context policy has no include_kinds of its own, inherit the
    // input contract's declared input kinds as the selection filter.
    let mut selection_policy = node.context_policy.clone();
    if selection_policy.include_kinds.is_empty() && !node.input_contract.input_kinds.is_empty() {
        selection_policy.include_kinds = node.input_contract.input_kinds.clone();
    }
    let selected = select_artifacts_adaptive(artifacts.to_vec(), &selection_policy);
    let rendered_context = render_artifacts_context(&selected, &node.context_policy);
    // Input transcript policy (reset/fork/compact) runs before contract checks.
    let transcript = apply_input_transcript_policy(transcript, &node.transcript_policy);
    if node.input_contract.require_transcript && transcript.is_none() {
        return Err(VmError::Runtime(format!(
            "workflow stage {node_id} requires transcript input"
        )));
    }
    // Enforce min/max input-artifact bounds from the input contract.
    if let Some(min_inputs) = node.input_contract.min_inputs {
        if selected.len() < min_inputs {
            return Err(VmError::Runtime(format!(
                "workflow stage {node_id} requires at least {min_inputs} input artifacts"
            )));
        }
    }
    if let Some(max_inputs) = node.input_contract.max_inputs {
        if selected.len() > max_inputs {
            return Err(VmError::Runtime(format!(
                "workflow stage {node_id} accepts at most {max_inputs} input artifacts"
            )));
        }
    }
    // Prompt is the bare task, or rendered context + labelled task section.
    let prompt = if rendered_context.is_empty() {
        task.to_string()
    } else {
        format!(
            "{rendered_context}\n\n{}:\n{task}",
            node.task_label
                .clone()
                .unwrap_or_else(|| "Task".to_string())
        )
    };

    // Tool-calling wire format; env override, defaulting to "text".
    let tool_format = std::env::var("HARN_AGENT_TOOL_FORMAT")
        .ok()
        .filter(|value| !value.trim().is_empty())
        .unwrap_or_else(|| "text".to_string());
    let mut llm_result = if node.kind == "verify" {
        // Verify nodes run a shell command (node.verify.command) instead of
        // calling the LLM; a missing/empty command yields an empty result.
        if let Some(command) = node
            .verify
            .as_ref()
            .and_then(|verify| verify.as_object())
            .and_then(|verify| verify.get("command"))
            .and_then(|value| value.as_str())
            .map(str::trim)
            .filter(|value| !value.is_empty())
        {
            // Platform-appropriate shell wrapper for the command string.
            let mut process = if cfg!(target_os = "windows") {
                let mut cmd = tokio::process::Command::new("cmd");
                cmd.arg("/C").arg(command);
                cmd
            } else {
                let mut cmd = tokio::process::Command::new("/bin/sh");
                cmd.arg("-lc").arg(command);
                cmd
            };
            process.stdin(std::process::Stdio::null());
            // Propagate cwd/env from the VM's execution context, if set.
            if let Some(context) = crate::stdlib::process::current_execution_context() {
                if let Some(cwd) = context.cwd.filter(|cwd| !cwd.is_empty()) {
                    process.current_dir(cwd);
                }
                if !context.env.is_empty() {
                    process.envs(context.env);
                }
            }
            let output = process
                .output()
                .await
                .map_err(|e| VmError::Runtime(format!("workflow verify exec failed: {e}")))?;
            let stdout = String::from_utf8_lossy(&output.stdout).to_string();
            let stderr = String::from_utf8_lossy(&output.stderr).to_string();
            // Combined text prefers whichever stream is non-empty; both are
            // joined with a newline when present.
            let combined = if stderr.is_empty() {
                stdout.clone()
            } else if stdout.is_empty() {
                stderr.clone()
            } else {
                format!("{stdout}\n{stderr}")
            };
            serde_json::json!({
                "status": "completed",
                "text": combined,
                "visible_text": combined,
                "command": command,
                "stdout": stdout,
                "stderr": stderr,
                "exit_status": output.status.code().unwrap_or(-1),
                "success": output.status.success(),
            })
        } else {
            serde_json::json!({
                "status": "completed",
                "text": "",
                "visible_text": "",
            })
        }
    } else {
        // Non-verify nodes: assemble LLM options from the node's model policy.
        let mut options = BTreeMap::new();
        if let Some(provider) = &node.model_policy.provider {
            options.insert(
                "provider".to_string(),
                VmValue::String(Rc::from(provider.clone())),
            );
        }
        if let Some(model) = &node.model_policy.model {
            options.insert(
                "model".to_string(),
                VmValue::String(Rc::from(model.clone())),
            );
        }
        if let Some(model_tier) = &node.model_policy.model_tier {
            options.insert(
                "model_tier".to_string(),
                VmValue::String(Rc::from(model_tier.clone())),
            );
        }
        if let Some(temperature) = node.model_policy.temperature {
            options.insert("temperature".to_string(), VmValue::Float(temperature));
        }
        if let Some(max_tokens) = node.model_policy.max_tokens {
            options.insert("max_tokens".to_string(), VmValue::Int(max_tokens));
        }
        // Tools: prefer the raw tool value if present; otherwise convert the
        // normalized tools JSON. Only passed through when names resolved.
        let tool_names = workflow_tool_names(&node.tools);
        let tools_value = node.raw_tools.clone().or_else(|| {
            if matches!(node.tools, serde_json::Value::Null) {
                None
            } else {
                Some(crate::stdlib::json_to_vm_value(&node.tools))
            }
        });
        if tools_value.is_some() && !tool_names.is_empty() {
            options.insert("tools".to_string(), tools_value.unwrap_or(VmValue::Nil));
        }
        if let Some(transcript) = transcript.clone() {
            options.insert("transcript".to_string(), transcript);
        }

        // Positional args mirror the VM llm-call convention:
        // (prompt, system, options).
        let args = vec![
            VmValue::String(Rc::from(prompt.clone())),
            node.system
                .clone()
                .map(|s| VmValue::String(Rc::from(s)))
                .unwrap_or(VmValue::Nil),
            VmValue::Dict(Rc::new(options)),
        ];
        let mut opts = extract_llm_options(&args)?;

        // Agent mode (explicit, or implied by having tools) runs the full
        // agent loop; otherwise a single LLM call is made.
        if node.mode.as_deref() == Some("agent") || !tool_names.is_empty() {
            crate::llm::run_agent_loop_internal(
                &mut opts,
                crate::llm::AgentLoopConfig {
                    persistent: true,
                    max_iterations: 12,
                    max_nudges: 3,
                    nudge: None,
                    done_sentinel: node.done_sentinel.clone(),
                    break_unless_phase: None,
                    tool_retries: 0,
                    tool_backoff_ms: 1000,
                    tool_format: tool_format.clone(),
                    auto_compact: None,
                    context_callback: None,
                    policy: None,
                    daemon: false,
                    llm_retries: 2,
                    llm_backoff_ms: 2000,
                },
            )
            .await?
        } else {
            let result = vm_call_llm_full(&opts).await?;
            crate::llm::agent_loop_result_from_llm(&result, opts)
        }
    };
    // Annotate the result payload with diagnostic/traceability fields.
    if let Some(payload) = llm_result.as_object_mut() {
        payload.insert("prompt".to_string(), serde_json::json!(prompt));
        payload.insert(
            "system_prompt".to_string(),
            serde_json::json!(node.system.clone().unwrap_or_default()),
        );
        payload.insert(
            "rendered_context".to_string(),
            serde_json::json!(rendered_context),
        );
        payload.insert(
            "selected_artifact_ids".to_string(),
            serde_json::json!(selected
                .iter()
                .map(|artifact| artifact.id.clone())
                .collect::<Vec<_>>()),
        );
        payload.insert(
            "selected_artifact_titles".to_string(),
            serde_json::json!(selected
                .iter()
                .map(|artifact| artifact.title.clone())
                .collect::<Vec<_>>()),
        );
        payload.insert(
            "tool_calling_mode".to_string(),
            serde_json::json!(tool_format.clone()),
        );
    }

    let visible_text = llm_result["text"].as_str().unwrap_or_default().to_string();
    // Output transcript: convert from the result JSON, then apply the
    // node's output policy (compaction + visibility redaction).
    let transcript = llm_result
        .get("transcript")
        .cloned()
        .map(|value| crate::stdlib::json_to_vm_value(&value));
    let transcript = apply_output_transcript_policy(transcript, &node.transcript_policy);
    // Output artifact kind from the contract; verify nodes default to
    // "verification_result", everything else to "artifact".
    let output_kind = node
        .output_contract
        .output_kinds
        .first()
        .cloned()
        .unwrap_or_else(|| {
            if node.kind == "verify" {
                "verification_result".to_string()
            } else {
                "artifact".to_string()
            }
        });
    let mut metadata = BTreeMap::new();
    metadata.insert(
        "input_artifact_ids".to_string(),
        serde_json::json!(selected
            .iter()
            .map(|artifact| artifact.id.clone())
            .collect::<Vec<_>>()),
    );
    metadata.insert("node_kind".to_string(), serde_json::json!(node.kind));
    // Package the stage output as a normalized artifact whose lineage is
    // the set of selected input artifacts.
    let artifact = ArtifactRecord {
        type_name: "artifact".to_string(),
        id: new_id("artifact"),
        kind: output_kind,
        title: Some(format!("stage {node_id} output")),
        text: Some(visible_text),
        data: Some(llm_result.clone()),
        source: Some(node_id.to_string()),
        created_at: now_rfc3339(),
        freshness: Some("fresh".to_string()),
        priority: None,
        lineage: selected
            .iter()
            .map(|artifact| artifact.id.clone())
            .collect(),
        relevance: Some(1.0),
        estimated_tokens: None,
        stage: Some(node_id.to_string()),
        metadata,
    }
    .normalize();

    Ok((llm_result, vec![artifact], transcript))
}
3070
3071pub fn next_nodes_for(
3072    graph: &WorkflowGraph,
3073    current: &str,
3074    branch: Option<&str>,
3075) -> Vec<WorkflowEdge> {
3076    let mut matching: Vec<WorkflowEdge> = graph
3077        .edges
3078        .iter()
3079        .filter(|edge| edge.from == current && edge.branch.as_deref() == branch)
3080        .cloned()
3081        .collect();
3082    if matching.is_empty() {
3083        matching = graph
3084            .edges
3085            .iter()
3086            .filter(|edge| edge.from == current && edge.branch.is_none())
3087            .cloned()
3088            .collect();
3089    }
3090    matching
3091}
3092
3093pub fn next_node_for(graph: &WorkflowGraph, current: &str, branch: &str) -> Option<String> {
3094    next_nodes_for(graph, current, Some(branch))
3095        .into_iter()
3096        .next()
3097        .map(|edge| edge.to)
3098}
3099
3100pub fn append_audit_entry(
3101    graph: &mut WorkflowGraph,
3102    op: &str,
3103    node_id: Option<String>,
3104    reason: Option<String>,
3105    metadata: BTreeMap<String, serde_json::Value>,
3106) {
3107    graph.audit_log.push(WorkflowAuditEntry {
3108        id: new_id("audit"),
3109        op: op.to_string(),
3110        node_id,
3111        timestamp: now_rfc3339(),
3112        reason,
3113        metadata,
3114    });
3115}
3116
3117pub fn builtin_ceiling() -> CapabilityPolicy {
3118    CapabilityPolicy {
3119        // Runtime-owned ceiling is capability-based, not product-tool-based.
3120        // Integrators define concrete tool surfaces in workflow graphs / registries.
3121        tools: Vec::new(),
3122        capabilities: BTreeMap::from([
3123            (
3124                "workspace".to_string(),
3125                vec![
3126                    "read_text".to_string(),
3127                    "write_text".to_string(),
3128                    "apply_edit".to_string(),
3129                    "delete".to_string(),
3130                    "exists".to_string(),
3131                    "list".to_string(),
3132                ],
3133            ),
3134            ("process".to_string(), vec!["exec".to_string()]),
3135        ]),
3136        workspace_roots: Vec::new(),
3137        side_effect_level: Some("network".to_string()),
3138        recursion_limit: Some(8),
3139        tool_arg_constraints: Vec::new(),
3140        tool_metadata: BTreeMap::new(),
3141    }
3142}
3143
3144#[cfg(test)]
3145mod tests {
3146    use super::*;
3147
    // Requesting a tool ('edit') that the host ceiling does not grant must
    // make `intersect` fail with a "host ceiling" error.
    #[test]
    fn capability_intersection_rejects_privilege_expansion() {
        let ceiling = CapabilityPolicy {
            tools: vec!["read".to_string()],
            side_effect_level: Some("read_only".to_string()),
            recursion_limit: Some(2),
            ..Default::default()
        };
        let requested = CapabilityPolicy {
            tools: vec!["read".to_string(), "edit".to_string()],
            ..Default::default()
        };
        let error = ceiling.intersect(&requested).unwrap_err();
        assert!(error.contains("host ceiling"));
    }
3163
    // `normalize` on an empty record must generate a session id and fill
    // the documented default scope / approval mode.
    #[test]
    fn mutation_session_normalize_fills_defaults() {
        let normalized = MutationSessionRecord::default().normalize();
        assert!(normalized.session_id.starts_with("session_"));
        assert_eq!(normalized.mutation_scope, "read_only");
        assert_eq!(normalized.approval_mode, "host_enforced");
    }
3171
    // Installing a session makes it readable via `current_mutation_session`,
    // and installing `None` clears it again.
    #[test]
    fn install_current_mutation_session_round_trips() {
        install_current_mutation_session(Some(MutationSessionRecord {
            session_id: "session_test".to_string(),
            mutation_scope: "apply_workspace".to_string(),
            approval_mode: "explicit".to_string(),
            ..Default::default()
        }));
        let current = current_mutation_session().expect("session installed");
        assert_eq!(current.session_id, "session_test");
        assert_eq!(current.mutation_scope, "apply_workspace");
        assert_eq!(current.approval_mode, "explicit");

        install_current_mutation_session(None);
        assert!(current_mutation_session().is_none());
    }
3188
    // With any policy active, bridged builtins must be rejected with a
    // ToolRejected-categorized error (they bypass the declared surface).
    #[test]
    fn active_execution_policy_rejects_unknown_bridge_builtin() {
        push_execution_policy(CapabilityPolicy {
            tools: vec!["read".to_string()],
            capabilities: BTreeMap::from([(
                "workspace".to_string(),
                vec!["read_text".to_string()],
            )]),
            side_effect_level: Some("read_only".to_string()),
            recursion_limit: Some(1),
            ..Default::default()
        });
        let error = enforce_current_policy_for_bridge_builtin("custom_host_builtin").unwrap_err();
        // Pop before asserting so the thread-local stack is clean even if
        // the assertion below panics on a later run.
        pop_execution_policy();
        assert!(matches!(
            error,
            VmError::CategorizedError {
                category: crate::value::ErrorCategory::ToolRejected,
                ..
            }
        ));
    }
3211
    // A read-only policy must reject mcp_connect: MCP builtins are gated
    // like process execution so they cannot bypass the process ceilings.
    #[test]
    fn active_execution_policy_rejects_mcp_escape_hatch() {
        push_execution_policy(CapabilityPolicy {
            tools: vec!["read".to_string()],
            capabilities: BTreeMap::from([(
                "workspace".to_string(),
                vec!["read_text".to_string()],
            )]),
            side_effect_level: Some("read_only".to_string()),
            recursion_limit: Some(1),
            ..Default::default()
        });
        let error = enforce_current_policy_for_builtin("mcp_connect", &[]).unwrap_err();
        pop_execution_policy();
        assert!(matches!(
            error,
            VmError::CategorizedError {
                category: crate::value::ErrorCategory::ToolRejected,
                ..
            }
        ));
    }
3234
    // A legacy {act, verify, repair} workflow value must normalize into a
    // workflow_graph with those three nodes and 'act' as the entry point.
    #[test]
    fn workflow_normalization_upgrades_legacy_act_verify_repair_shape() {
        let value = crate::stdlib::json_to_vm_value(&serde_json::json!({
            "name": "legacy",
            "act": {"mode": "llm"},
            "verify": {"kind": "verify"},
            "repair": {"mode": "agent"},
        }));
        let graph = normalize_workflow_value(&value).unwrap();
        assert_eq!(graph.type_name, "workflow_graph");
        assert!(graph.nodes.contains_key("act"));
        assert!(graph.nodes.contains_key("verify"));
        assert!(graph.nodes.contains_key("repair"));
        assert_eq!(graph.entry, "act");
    }
3250
    // A node whose tools are a tool_registry value must still yield the
    // registry's tool names via `workflow_tool_names`.
    #[test]
    fn workflow_normalization_accepts_tool_registry_nodes() {
        let value = crate::stdlib::json_to_vm_value(&serde_json::json!({
            "name": "registry_tools",
            "entry": "implement",
            "nodes": {
                "implement": {
                    "kind": "stage",
                    "mode": "agent",
                    "tools": {
                        "_type": "tool_registry",
                        "tools": [
                            {"name": "read", "description": "Read files"},
                            {"name": "run", "description": "Run commands"}
                        ]
                    }
                }
            },
            "edges": []
        }));
        let graph = normalize_workflow_value(&value).unwrap();
        let node = graph.nodes.get("implement").unwrap();
        assert_eq!(workflow_tool_names(&node.tools), vec!["read", "run"]);
    }
3275
    // With max_artifacts=2 and a tight token budget, selection must return
    // exactly two artifacts, all of the expected kind.
    #[test]
    fn artifact_selection_honors_budget_and_priority() {
        let policy = ContextPolicy {
            max_artifacts: Some(2),
            max_tokens: Some(30),
            prefer_recent: true,
            prefer_fresh: true,
            prioritize_kinds: vec!["verification_result".to_string()],
            ..Default::default()
        };
        // Three summaries with differing sizes and relevance scores; the
        // middle one is deliberately large relative to the token budget.
        let artifacts = vec![
            ArtifactRecord {
                type_name: "artifact".to_string(),
                id: "a".to_string(),
                kind: "summary".to_string(),
                text: Some("short".to_string()),
                relevance: Some(0.9),
                created_at: now_rfc3339(),
                ..Default::default()
            }
            .normalize(),
            ArtifactRecord {
                type_name: "artifact".to_string(),
                id: "b".to_string(),
                kind: "summary".to_string(),
                text: Some("this is a much larger artifact body".to_string()),
                relevance: Some(1.0),
                created_at: now_rfc3339(),
                ..Default::default()
            }
            .normalize(),
            ArtifactRecord {
                type_name: "artifact".to_string(),
                id: "c".to_string(),
                kind: "summary".to_string(),
                text: Some("tiny".to_string()),
                relevance: Some(0.5),
                created_at: now_rfc3339(),
                ..Default::default()
            }
            .normalize(),
        ];
        let selected = select_artifacts(artifacts, &policy);
        assert_eq!(selected.len(), 2);
        assert!(selected.iter().all(|artifact| artifact.kind == "summary"));
    }
3322
    // A condition node with only a "true" edge (no "false" branch) must
    // fail validation, and the error should mention both branch labels.
    #[test]
    fn workflow_validation_rejects_condition_without_true_false_edges() {
        let graph = WorkflowGraph {
            entry: "gate".to_string(),
            nodes: BTreeMap::from([(
                "gate".to_string(),
                WorkflowNode {
                    id: Some("gate".to_string()),
                    kind: "condition".to_string(),
                    ..Default::default()
                },
            )]),
            edges: vec![WorkflowEdge {
                from: "gate".to_string(),
                to: "next".to_string(),
                branch: Some("true".to_string()),
                label: None,
            }],
            ..Default::default()
        };
        let report = validate_workflow(&graph, None);
        assert!(!report.valid);
        assert!(report
            .errors
            .iter()
            .any(|error| error.contains("true") && error.contains("false")));
    }
3350
3351    #[test]
3352    fn replay_fixture_round_trip_passes() {
3353        let run = RunRecord {
3354            type_name: "run_record".to_string(),
3355            id: "run_1".to_string(),
3356            workflow_id: "wf".to_string(),
3357            workflow_name: Some("demo".to_string()),
3358            task: "demo".to_string(),
3359            status: "completed".to_string(),
3360            started_at: "1".to_string(),
3361            finished_at: Some("2".to_string()),
3362            parent_run_id: None,
3363            root_run_id: Some("run_1".to_string()),
3364            stages: vec![RunStageRecord {
3365                id: "stage_1".to_string(),
3366                node_id: "act".to_string(),
3367                kind: "stage".to_string(),
3368                status: "completed".to_string(),
3369                outcome: "success".to_string(),
3370                branch: Some("success".to_string()),
3371                started_at: "1".to_string(),
3372                finished_at: Some("2".to_string()),
3373                visible_text: Some("done".to_string()),
3374                private_reasoning: None,
3375                transcript: None,
3376                verification: None,
3377                usage: None,
3378                artifacts: vec![ArtifactRecord {
3379                    type_name: "artifact".to_string(),
3380                    id: "a1".to_string(),
3381                    kind: "summary".to_string(),
3382                    text: Some("done".to_string()),
3383                    created_at: "1".to_string(),
3384                    ..Default::default()
3385                }
3386                .normalize()],
3387                consumed_artifact_ids: vec![],
3388                produced_artifact_ids: vec!["a1".to_string()],
3389                attempts: vec![],
3390                metadata: BTreeMap::new(),
3391            }],
3392            transitions: vec![],
3393            checkpoints: vec![],
3394            pending_nodes: vec![],
3395            completed_nodes: vec!["act".to_string()],
3396            child_runs: vec![],
3397            artifacts: vec![],
3398            policy: CapabilityPolicy::default(),
3399            execution: None,
3400            transcript: None,
3401            usage: None,
3402            replay_fixture: None,
3403            trace_spans: vec![],
3404            metadata: BTreeMap::new(),
3405            persisted_path: None,
3406        };
3407        let fixture = replay_fixture_from_run(&run);
3408        let report = evaluate_run_against_fixture(&run, &fixture);
3409        assert!(report.pass);
3410        assert!(report.failures.is_empty());
3411    }
3412
3413    #[test]
3414    fn replay_eval_suite_reports_failed_case() {
3415        let good = RunRecord {
3416            id: "run_good".to_string(),
3417            workflow_id: "wf".to_string(),
3418            status: "completed".to_string(),
3419            stages: vec![RunStageRecord {
3420                node_id: "act".to_string(),
3421                status: "completed".to_string(),
3422                outcome: "success".to_string(),
3423                ..Default::default()
3424            }],
3425            ..Default::default()
3426        };
3427        let bad = RunRecord {
3428            id: "run_bad".to_string(),
3429            workflow_id: "wf".to_string(),
3430            status: "failed".to_string(),
3431            stages: vec![RunStageRecord {
3432                node_id: "act".to_string(),
3433                status: "failed".to_string(),
3434                outcome: "error".to_string(),
3435                ..Default::default()
3436            }],
3437            ..Default::default()
3438        };
3439        let suite = evaluate_run_suite(vec![
3440            (
3441                good.clone(),
3442                replay_fixture_from_run(&good),
3443                Some("good.json".to_string()),
3444            ),
3445            (
3446                bad.clone(),
3447                replay_fixture_from_run(&good),
3448                Some("bad.json".to_string()),
3449            ),
3450        ]);
3451        assert!(!suite.pass);
3452        assert_eq!(suite.total, 2);
3453        assert_eq!(suite.failed, 1);
3454        assert!(suite.cases.iter().any(|case| !case.pass));
3455    }
3456
3457    #[test]
3458    fn run_diff_reports_changed_stage() {
3459        let left = RunRecord {
3460            id: "left".to_string(),
3461            workflow_id: "wf".to_string(),
3462            status: "completed".to_string(),
3463            stages: vec![RunStageRecord {
3464                node_id: "act".to_string(),
3465                status: "completed".to_string(),
3466                outcome: "success".to_string(),
3467                ..Default::default()
3468            }],
3469            ..Default::default()
3470        };
3471        let right = RunRecord {
3472            id: "right".to_string(),
3473            workflow_id: "wf".to_string(),
3474            status: "failed".to_string(),
3475            stages: vec![RunStageRecord {
3476                node_id: "act".to_string(),
3477                status: "failed".to_string(),
3478                outcome: "error".to_string(),
3479                ..Default::default()
3480            }],
3481            ..Default::default()
3482        };
3483        let diff = diff_run_records(&left, &right);
3484        assert!(diff.status_changed);
3485        assert!(!diff.identical);
3486        assert_eq!(diff.stage_diffs.len(), 1);
3487    }
3488
3489    #[test]
3490    fn eval_suite_manifest_can_fail_on_baseline_diff() {
3491        let temp_dir =
3492            std::env::temp_dir().join(format!("harn-eval-suite-{}", uuid::Uuid::now_v7()));
3493        std::fs::create_dir_all(&temp_dir).unwrap();
3494        let baseline_path = temp_dir.join("baseline.json");
3495        let candidate_path = temp_dir.join("candidate.json");
3496
3497        let baseline = RunRecord {
3498            id: "baseline".to_string(),
3499            workflow_id: "wf".to_string(),
3500            status: "completed".to_string(),
3501            stages: vec![RunStageRecord {
3502                node_id: "act".to_string(),
3503                status: "completed".to_string(),
3504                outcome: "success".to_string(),
3505                ..Default::default()
3506            }],
3507            ..Default::default()
3508        };
3509        let candidate = RunRecord {
3510            id: "candidate".to_string(),
3511            workflow_id: "wf".to_string(),
3512            status: "failed".to_string(),
3513            stages: vec![RunStageRecord {
3514                node_id: "act".to_string(),
3515                status: "failed".to_string(),
3516                outcome: "error".to_string(),
3517                ..Default::default()
3518            }],
3519            ..Default::default()
3520        };
3521
3522        save_run_record(&baseline, Some(baseline_path.to_str().unwrap())).unwrap();
3523        save_run_record(&candidate, Some(candidate_path.to_str().unwrap())).unwrap();
3524
3525        let manifest = EvalSuiteManifest {
3526            base_dir: Some(temp_dir.display().to_string()),
3527            cases: vec![EvalSuiteCase {
3528                label: Some("candidate".to_string()),
3529                run_path: "candidate.json".to_string(),
3530                fixture_path: None,
3531                compare_to: Some("baseline.json".to_string()),
3532            }],
3533            ..Default::default()
3534        };
3535        let suite = evaluate_run_suite_manifest(&manifest).unwrap();
3536        assert!(!suite.pass);
3537        assert_eq!(suite.failed, 1);
3538        assert!(suite.cases[0].comparison.is_some());
3539        assert!(suite.cases[0]
3540            .failures
3541            .iter()
3542            .any(|failure| failure.contains("baseline")));
3543    }
3544
3545    #[test]
3546    fn render_unified_diff_marks_removed_and_added_lines() {
3547        let diff = render_unified_diff(Some("src/main.rs"), "old\nsame", "new\nsame");
3548        assert!(diff.contains("--- a/src/main.rs"));
3549        assert!(diff.contains("+++ b/src/main.rs"));
3550        assert!(diff.contains("-old"));
3551        assert!(diff.contains("+new"));
3552        assert!(diff.contains(" same"));
3553    }
3554
3555    #[test]
3556    fn execution_policy_rejects_process_exec_when_read_only() {
3557        push_execution_policy(CapabilityPolicy {
3558            side_effect_level: Some("read_only".to_string()),
3559            capabilities: BTreeMap::from([("process".to_string(), vec!["exec".to_string()])]),
3560            ..Default::default()
3561        });
3562        let result = enforce_current_policy_for_builtin("exec", &[]);
3563        pop_execution_policy();
3564        assert!(result.is_err());
3565    }
3566
3567    #[test]
3568    fn execution_policy_rejects_unlisted_tool() {
3569        push_execution_policy(CapabilityPolicy {
3570            tools: vec!["read".to_string()],
3571            ..Default::default()
3572        });
3573        let result = enforce_current_policy_for_tool("edit");
3574        pop_execution_policy();
3575        assert!(result.is_err());
3576    }
3577
3578    #[test]
3579    fn normalize_run_record_preserves_trace_spans() {
3580        let value = crate::stdlib::json_to_vm_value(&serde_json::json!({
3581            "_type": "run_record",
3582            "id": "run_trace",
3583            "workflow_id": "wf",
3584            "status": "completed",
3585            "started_at": "1",
3586            "trace_spans": [
3587                {
3588                    "span_id": 1,
3589                    "parent_id": null,
3590                    "kind": "pipeline",
3591                    "name": "workflow",
3592                    "start_ms": 0,
3593                    "duration_ms": 42,
3594                    "metadata": {"model": "demo"}
3595                }
3596            ]
3597        }));
3598
3599        let run = normalize_run_record(&value).unwrap();
3600        assert_eq!(run.trace_spans.len(), 1);
3601        assert_eq!(run.trace_spans[0].kind, "pipeline");
3602        assert_eq!(
3603            run.trace_spans[0].metadata["model"],
3604            serde_json::json!("demo")
3605        );
3606    }
3607
3608    // ── Tool hook tests ──────────────────────────────────────────────
3609
3610    #[test]
3611    fn pre_tool_hook_deny_blocks_execution() {
3612        clear_tool_hooks();
3613        register_tool_hook(ToolHook {
3614            pattern: "dangerous_*".to_string(),
3615            pre: Some(Rc::new(|_name, _args| {
3616                PreToolAction::Deny("blocked by policy".to_string())
3617            })),
3618            post: None,
3619        });
3620        let result = run_pre_tool_hooks("dangerous_delete", &serde_json::json!({}));
3621        clear_tool_hooks();
3622        assert!(matches!(result, PreToolAction::Deny(_)));
3623    }
3624
3625    #[test]
3626    fn pre_tool_hook_allow_passes_through() {
3627        clear_tool_hooks();
3628        register_tool_hook(ToolHook {
3629            pattern: "safe_*".to_string(),
3630            pre: Some(Rc::new(|_name, _args| PreToolAction::Allow)),
3631            post: None,
3632        });
3633        let result = run_pre_tool_hooks("safe_read", &serde_json::json!({}));
3634        clear_tool_hooks();
3635        assert!(matches!(result, PreToolAction::Allow));
3636    }
3637
3638    #[test]
3639    fn pre_tool_hook_modify_rewrites_args() {
3640        clear_tool_hooks();
3641        register_tool_hook(ToolHook {
3642            pattern: "*".to_string(),
3643            pre: Some(Rc::new(|_name, _args| {
3644                PreToolAction::Modify(serde_json::json!({"path": "/sanitized"}))
3645            })),
3646            post: None,
3647        });
3648        let result = run_pre_tool_hooks("read_file", &serde_json::json!({"path": "/etc/passwd"}));
3649        clear_tool_hooks();
3650        match result {
3651            PreToolAction::Modify(args) => assert_eq!(args["path"], "/sanitized"),
3652            _ => panic!("expected Modify"),
3653        }
3654    }
3655
3656    #[test]
3657    fn post_tool_hook_modifies_result() {
3658        clear_tool_hooks();
3659        register_tool_hook(ToolHook {
3660            pattern: "exec".to_string(),
3661            pre: None,
3662            post: Some(Rc::new(|_name, result| {
3663                if result.contains("SECRET") {
3664                    PostToolAction::Modify("[REDACTED]".to_string())
3665                } else {
3666                    PostToolAction::Pass
3667                }
3668            })),
3669        });
3670        let result = run_post_tool_hooks("exec", "output with SECRET data");
3671        let clean = run_post_tool_hooks("exec", "clean output");
3672        clear_tool_hooks();
3673        assert_eq!(result, "[REDACTED]");
3674        assert_eq!(clean, "clean output");
3675    }
3676
3677    #[test]
3678    fn unmatched_hook_pattern_does_not_fire() {
3679        clear_tool_hooks();
3680        register_tool_hook(ToolHook {
3681            pattern: "exec".to_string(),
3682            pre: Some(Rc::new(|_name, _args| {
3683                PreToolAction::Deny("should not match".to_string())
3684            })),
3685            post: None,
3686        });
3687        let result = run_pre_tool_hooks("read_file", &serde_json::json!({}));
3688        clear_tool_hooks();
3689        assert!(matches!(result, PreToolAction::Allow));
3690    }
3691
3692    #[test]
3693    fn glob_match_patterns() {
3694        assert!(glob_match("*", "anything"));
3695        assert!(glob_match("exec*", "exec_at"));
3696        assert!(glob_match("*_file", "read_file"));
3697        assert!(!glob_match("exec*", "read_file"));
3698        assert!(glob_match("read_file", "read_file"));
3699        assert!(!glob_match("read_file", "write_file"));
3700    }
3701
3702    // ── Auto-compaction tests ────────────────────────────────────────
3703
3704    #[test]
3705    fn microcompact_snips_large_output() {
3706        let large = "x".repeat(50_000);
3707        let result = microcompact_tool_output(&large, 10_000);
3708        assert!(result.len() < 15_000);
3709        assert!(result.contains("snipped"));
3710    }
3711
3712    #[test]
3713    fn microcompact_preserves_small_output() {
3714        let small = "hello world";
3715        let result = microcompact_tool_output(small, 10_000);
3716        assert_eq!(result, small);
3717    }
3718
3719    #[test]
3720    fn microcompact_preserves_strong_keyword_lines_without_file_line() {
3721        // Regression: diagnostic extraction used to require both a
3722        // file:line reference AND a keyword. Strong keywords like "FAIL"
3723        // and "panic" should preserve the line on their own, because they
3724        // carry signal even when they appear on narrative lines (Go's
3725        // "--- FAIL: TestName", Rust's "thread '...' panicked at ...",
3726        // pytest's "FAILED tests/..."). The exact patterns are language-
3727        // specific and don't belong in the VM — but the generic rule
3728        // "strong keywords count even without file:line" does.
3729        let mut output = String::new();
3730        for i in 0..100 {
3731            output.push_str(&format!("verbose progress line {i}\n"));
3732        }
3733        output.push_str("--- FAIL: TestEmpty (0.00s)\n");
3734        output.push_str("thread 'tests::test_foo' panicked at src/lib.rs:42:5\n");
3735        output.push_str("FAILED tests/test_parser.py::test_empty\n");
3736        for i in 0..100 {
3737            output.push_str(&format!("more output after failures {i}\n"));
3738        }
3739        let result = microcompact_tool_output(&output, 2_000);
3740        assert!(
3741            result.contains("--- FAIL: TestEmpty"),
3742            "strong 'FAIL' keyword should preserve the line:\n{result}"
3743        );
3744        assert!(
3745            result.contains("panicked at"),
3746            "strong 'panic' keyword should preserve the line:\n{result}"
3747        );
3748        assert!(
3749            result.contains("FAILED tests/test_parser.py"),
3750            "strong 'FAIL' keyword should preserve pytest-style lines too:\n{result}"
3751        );
3752    }
3753
3754    #[test]
3755    fn auto_compact_messages_reduces_count() {
3756        let mut messages: Vec<serde_json::Value> = (0..20)
3757            .map(|i| serde_json::json!({"role": "user", "content": format!("message {i}")}))
3758            .collect();
3759        let runtime = tokio::runtime::Builder::new_current_thread()
3760            .enable_all()
3761            .build()
3762            .unwrap();
3763        let compacted = runtime.block_on(auto_compact_messages(
3764            &mut messages,
3765            &AutoCompactConfig {
3766                compact_strategy: CompactStrategy::Truncate,
3767                keep_last: 6,
3768                ..Default::default()
3769            },
3770            None,
3771        ));
3772        let summary = compacted.unwrap();
3773        assert!(summary.is_some());
3774        assert!(messages.len() <= 7); // 6 kept + 1 summary
3775        assert!(messages[0]["content"]
3776            .as_str()
3777            .unwrap()
3778            .contains("auto-compacted"));
3779    }
3780
3781    #[test]
3782    fn auto_compact_noop_when_under_threshold() {
3783        let mut messages: Vec<serde_json::Value> = (0..4)
3784            .map(|i| serde_json::json!({"role": "user", "content": format!("msg {i}")}))
3785            .collect();
3786        let runtime = tokio::runtime::Builder::new_current_thread()
3787            .enable_all()
3788            .build()
3789            .unwrap();
3790        let compacted = runtime.block_on(auto_compact_messages(
3791            &mut messages,
3792            &AutoCompactConfig {
3793                compact_strategy: CompactStrategy::Truncate,
3794                keep_last: 6,
3795                ..Default::default()
3796            },
3797            None,
3798        ));
3799        assert!(compacted.unwrap().is_none());
3800        assert_eq!(messages.len(), 4);
3801    }
3802
3803    #[test]
3804    fn estimate_message_tokens_basic() {
3805        let messages = vec![
3806            serde_json::json!({"role": "user", "content": "a".repeat(400)}),
3807            serde_json::json!({"role": "assistant", "content": "b".repeat(400)}),
3808        ];
3809        let tokens = estimate_message_tokens(&messages);
3810        assert_eq!(tokens, 200); // 800 chars / 4
3811    }
3812
3813    // ── Artifact dedup and microcompaction tests ─────────────────────
3814
3815    #[test]
3816    fn dedup_artifacts_removes_duplicates() {
3817        let mut artifacts = vec![
3818            ArtifactRecord {
3819                id: "a1".to_string(),
3820                kind: "test".to_string(),
3821                text: Some("duplicate content".to_string()),
3822                ..Default::default()
3823            },
3824            ArtifactRecord {
3825                id: "a2".to_string(),
3826                kind: "test".to_string(),
3827                text: Some("duplicate content".to_string()),
3828                ..Default::default()
3829            },
3830            ArtifactRecord {
3831                id: "a3".to_string(),
3832                kind: "test".to_string(),
3833                text: Some("unique content".to_string()),
3834                ..Default::default()
3835            },
3836        ];
3837        dedup_artifacts(&mut artifacts);
3838        assert_eq!(artifacts.len(), 2);
3839    }
3840
3841    #[test]
3842    fn microcompact_artifact_snips_oversized() {
3843        let mut artifact = ArtifactRecord {
3844            id: "a1".to_string(),
3845            kind: "test".to_string(),
3846            text: Some("x".repeat(10_000)),
3847            estimated_tokens: Some(2_500),
3848            ..Default::default()
3849        };
3850        microcompact_artifact(&mut artifact, 500);
3851        assert!(artifact.text.as_ref().unwrap().len() < 5_000);
3852        assert_eq!(artifact.estimated_tokens, Some(500));
3853    }
3854
3855    // ── Tool argument constraint tests ───────────────────────────────
3856
3857    #[test]
3858    fn arg_constraint_allows_matching_pattern() {
3859        let policy = CapabilityPolicy {
3860            tool_arg_constraints: vec![ToolArgConstraint {
3861                tool: "exec".to_string(),
3862                arg_patterns: vec!["cargo *".to_string()],
3863            }],
3864            ..Default::default()
3865        };
3866        let result = enforce_tool_arg_constraints(
3867            &policy,
3868            "exec",
3869            &serde_json::json!({"command": "cargo test"}),
3870        );
3871        assert!(result.is_ok());
3872    }
3873
3874    #[test]
3875    fn arg_constraint_rejects_non_matching_pattern() {
3876        let policy = CapabilityPolicy {
3877            tool_arg_constraints: vec![ToolArgConstraint {
3878                tool: "exec".to_string(),
3879                arg_patterns: vec!["cargo *".to_string()],
3880            }],
3881            ..Default::default()
3882        };
3883        let result = enforce_tool_arg_constraints(
3884            &policy,
3885            "exec",
3886            &serde_json::json!({"command": "rm -rf /"}),
3887        );
3888        assert!(result.is_err());
3889    }
3890
3891    #[test]
3892    fn arg_constraint_ignores_unmatched_tool() {
3893        let policy = CapabilityPolicy {
3894            tool_arg_constraints: vec![ToolArgConstraint {
3895                tool: "exec".to_string(),
3896                arg_patterns: vec!["cargo *".to_string()],
3897            }],
3898            ..Default::default()
3899        };
3900        let result = enforce_tool_arg_constraints(
3901            &policy,
3902            "read_file",
3903            &serde_json::json!({"path": "/etc/passwd"}),
3904        );
3905        assert!(result.is_ok());
3906    }
3907
3908    #[test]
3909    fn microcompact_handles_multibyte_utf8() {
3910        // Emoji are 4 bytes each — slicing at arbitrary byte offsets would panic
3911        let emoji_output = "🔥".repeat(500); // 2000 bytes, 500 chars
3912        let result = microcompact_tool_output(&emoji_output, 400);
3913        // Should not panic and should contain the snip marker
3914        assert!(result.contains("snipped"));
3915
3916        // Mixed ASCII + multi-byte
3917        let mixed = format!("{}{}{}", "a".repeat(300), "é".repeat(500), "b".repeat(300));
3918        let result2 = microcompact_tool_output(&mixed, 400);
3919        assert!(result2.contains("snipped"));
3920
3921        // CJK characters (3 bytes each)
3922        let cjk = "中文".repeat(500);
3923        let result3 = microcompact_tool_output(&cjk, 400);
3924        assert!(result3.contains("snipped"));
3925    }
3926}