//! harn_vm/orchestration.rs — VM orchestration primitives: tool lifecycle
//! hooks, transcript auto-compaction, artifact selection, and capability
//! policies.

1use std::collections::{BTreeMap, BTreeSet};
2use std::path::{Path, PathBuf};
3use std::rc::Rc;
4use std::{cell::RefCell, thread_local};
5
6use serde::{Deserialize, Serialize};
7
8use crate::llm::{extract_llm_options, vm_call_llm_full, vm_value_to_json};
9use crate::value::{VmError, VmValue};
10
/// Current wall-clock time as an RFC 3339 UTC timestamp, e.g.
/// `"2024-05-01T12:34:56Z"`.
///
/// Previously this returned bare UNIX seconds despite the name; it now
/// produces an actual RFC 3339 string. Sub-second precision is dropped,
/// and a clock before the epoch formats as the epoch itself.
fn now_rfc3339() -> String {
    use std::time::{SystemTime, UNIX_EPOCH};
    let secs = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .unwrap_or_default()
        .as_secs();
    format_rfc3339_utc(secs)
}

/// Format UNIX seconds as `"YYYY-MM-DDThh:mm:ssZ"` (UTC).
///
/// Uses the standard civil-from-days algorithm (Howard Hinnant): shift the
/// epoch to 0000-03-01 so leap days fall at the end of the shifted year.
/// Leap years are handled; leap seconds are not (RFC 3339 permits this).
fn format_rfc3339_utc(unix_secs: u64) -> String {
    let days = (unix_secs / 86_400) as i64;
    let secs_of_day = unix_secs % 86_400;
    let hour = secs_of_day / 3_600;
    let minute = (secs_of_day % 3_600) / 60;
    let second = secs_of_day % 60;
    // civil-from-days, see http://howardhinnant.github.io/date_algorithms.html
    let z = days + 719_468;
    let era = z.div_euclid(146_097);
    let doe = z.rem_euclid(146_097); // day-of-era in a 400-year cycle
    let yoe = (doe - doe / 1_460 + doe / 36_524 - doe / 146_096) / 365;
    let doy = doe - (365 * yoe + yoe / 4 - yoe / 100); // March-based day-of-year
    let mp = (5 * doy + 2) / 153; // March-based month index (0 = March)
    let day = doy - (153 * mp + 2) / 5 + 1;
    let month = if mp < 10 { mp + 3 } else { mp - 9 };
    let year = yoe + era * 400 + i64::from(month <= 2);
    format!("{year:04}-{month:02}-{day:02}T{hour:02}:{minute:02}:{second:02}Z")
}
19
20fn new_id(prefix: &str) -> String {
21    format!("{prefix}_{}", uuid::Uuid::now_v7())
22}
23
/// Directory where run state is persisted.
///
/// Honors the `HARN_RUN_DIR` environment variable; falls back to the
/// relative `.harn-runs` directory when the variable is unset or not
/// valid Unicode.
fn default_run_dir() -> PathBuf {
    match std::env::var("HARN_RUN_DIR") {
        Ok(dir) => PathBuf::from(dir),
        Err(_) => PathBuf::from(".harn-runs"),
    }
}
29
// Thread-local orchestration state. Each VM thread gets its own policy
// stack, hook registry, and mutation-session slot; nothing here is shared
// across threads.
thread_local! {
    /// Stack of capability policies for nested executions.
    static EXECUTION_POLICY_STACK: RefCell<Vec<CapabilityPolicy>> = const { RefCell::new(Vec::new()) };
    /// Registered tool lifecycle hooks; run in registration order.
    static TOOL_HOOKS: RefCell<Vec<ToolHook>> = const { RefCell::new(Vec::new()) };
    /// Mutation session attributed to work on this thread, if any.
    static CURRENT_MUTATION_SESSION: RefCell<Option<MutationSessionRecord>> = const { RefCell::new(None) };
}
35
/// Provenance record for a mutation session: which run/worker is mutating
/// state, and under which scope and approval mode.
///
/// Every field deserializes with a default (`#[serde(default)]`); call
/// [`MutationSessionRecord::normalize`] to fill in the required fields.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct MutationSessionRecord {
    // Unique session id; `normalize` generates one when empty.
    pub session_id: String,
    // Id of the session that spawned this one, if any.
    pub parent_session_id: Option<String>,
    // Run this session belongs to, if known.
    pub run_id: Option<String>,
    // Worker acting on behalf of this session, if any.
    pub worker_id: Option<String>,
    // Free-form execution kind label; semantics defined by callers.
    pub execution_kind: Option<String>,
    // Mutation scope; `normalize` defaults it to "read_only".
    pub mutation_scope: String,
    // Approval mode; `normalize` defaults it to "host_enforced".
    pub approval_mode: String,
}
47
48impl MutationSessionRecord {
49    pub fn normalize(mut self) -> Self {
50        if self.session_id.is_empty() {
51            self.session_id = new_id("session");
52        }
53        if self.mutation_scope.is_empty() {
54            self.mutation_scope = "read_only".to_string();
55        }
56        if self.approval_mode.is_empty() {
57            self.approval_mode = "host_enforced".to_string();
58        }
59        self
60    }
61}
62
63pub fn install_current_mutation_session(session: Option<MutationSessionRecord>) {
64    CURRENT_MUTATION_SESSION.with(|slot| {
65        *slot.borrow_mut() = session.map(MutationSessionRecord::normalize);
66    });
67}
68
69pub fn current_mutation_session() -> Option<MutationSessionRecord> {
70    CURRENT_MUTATION_SESSION.with(|slot| slot.borrow().clone())
71}
72
73// ── Tool lifecycle hooks ──────────────────────────────────────────────
74
/// Action returned by a PreToolUse hook.
///
/// Hooks run in registration order; each sees the arguments as rewritten by
/// earlier hooks, and the first `Deny` short-circuits the rest (see
/// `run_pre_tool_hooks`).
#[derive(Clone, Debug)]
pub enum PreToolAction {
    /// Allow the tool call to proceed unchanged.
    Allow,
    /// Deny the tool call with an explanation.
    Deny(String),
    /// Allow but replace the arguments.
    Modify(serde_json::Value),
}
85
/// Action returned by a PostToolUse hook.
///
/// Hooks run in registration order; each sees the result text as rewritten
/// by earlier hooks (see `run_post_tool_hooks`).
#[derive(Clone, Debug)]
pub enum PostToolAction {
    /// Pass the result through unchanged.
    Pass,
    /// Replace the result text.
    Modify(String),
}
94
/// Callback types for tool lifecycle hooks.
///
/// `PreToolHookFn` receives the tool name and the (possibly already
/// modified) JSON arguments; `PostToolHookFn` receives the tool name and
/// the result text. `Rc` (not `Arc`) because hooks live in thread-local
/// storage and are never shared across threads.
pub type PreToolHookFn = Rc<dyn Fn(&str, &serde_json::Value) -> PreToolAction>;
pub type PostToolHookFn = Rc<dyn Fn(&str, &str) -> PostToolAction>;
98
/// A registered tool hook with a name pattern and callbacks.
///
/// Cloning is cheap — the callbacks are `Rc`-shared. Hooks are stored per
/// thread (`TOOL_HOOKS`) and evaluated in registration order.
#[derive(Clone)]
pub struct ToolHook {
    /// Glob-style pattern matched against tool names (e.g. `"*"`, `"exec*"`, `"read_file"`).
    pub pattern: String,
    /// Called before tool execution. Return `Deny` to reject, `Modify` to rewrite args.
    pub pre: Option<PreToolHookFn>,
    /// Called after tool execution with the result text. Return `Modify` to rewrite.
    pub post: Option<PostToolHookFn>,
}
109
110impl std::fmt::Debug for ToolHook {
111    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
112        f.debug_struct("ToolHook")
113            .field("pattern", &self.pattern)
114            .field("has_pre", &self.pre.is_some())
115            .field("has_post", &self.post.is_some())
116            .finish()
117    }
118}
119
/// Minimal glob matcher for tool-name and argument patterns.
///
/// `*` matches any (possibly empty) run of characters and may appear
/// anywhere, any number of times: `"*"`, `"exec*"`, `"*file"`, `"*diff*"`,
/// and `"read_*_file"` all work. Patterns without `*` must match exactly.
///
/// Previously only a single leading OR trailing `*` was supported; a
/// pattern like `"*diff*"` fell into the suffix-strip branch and never
/// matched anything. Prefix/suffix/exact behavior is unchanged.
fn glob_match(pattern: &str, name: &str) -> bool {
    if pattern == "*" {
        return true;
    }
    if !pattern.contains('*') {
        return pattern == name;
    }
    // Match the literal segments between wildcards left-to-right, greedily
    // consuming the earliest occurrence of each interior segment.
    let segments: Vec<&str> = pattern.split('*').collect();
    let last = segments.len() - 1;
    let mut pos = 0usize;
    for (i, seg) in segments.iter().copied().enumerate() {
        if seg.is_empty() {
            continue; // leading/trailing/adjacent '*' — matches anything
        }
        if i == 0 {
            // Pattern does not start with '*': segment is anchored at the front.
            if !name.starts_with(seg) {
                return false;
            }
            pos = seg.len();
        } else if i == last {
            // Pattern does not end with '*': segment is anchored at the back
            // and must not overlap text already consumed.
            return name[pos..].ends_with(seg);
        } else {
            // Interior segment: find its first occurrence at or after `pos`.
            match name[pos..].find(seg) {
                Some(offset) => pos = pos + offset + seg.len(),
                None => return false,
            }
        }
    }
    true
}
132
133pub fn register_tool_hook(hook: ToolHook) {
134    TOOL_HOOKS.with(|hooks| hooks.borrow_mut().push(hook));
135}
136
137pub fn clear_tool_hooks() {
138    TOOL_HOOKS.with(|hooks| hooks.borrow_mut().clear());
139}
140
141/// Run all matching PreToolUse hooks. Returns the final action.
142pub fn run_pre_tool_hooks(tool_name: &str, args: &serde_json::Value) -> PreToolAction {
143    TOOL_HOOKS.with(|hooks| {
144        let hooks = hooks.borrow();
145        let mut current_args = args.clone();
146        for hook in hooks.iter() {
147            if !glob_match(&hook.pattern, tool_name) {
148                continue;
149            }
150            if let Some(ref pre) = hook.pre {
151                match pre(tool_name, &current_args) {
152                    PreToolAction::Allow => {}
153                    PreToolAction::Deny(reason) => return PreToolAction::Deny(reason),
154                    PreToolAction::Modify(new_args) => {
155                        current_args = new_args;
156                    }
157                }
158            }
159        }
160        if current_args != *args {
161            PreToolAction::Modify(current_args)
162        } else {
163            PreToolAction::Allow
164        }
165    })
166}
167
168/// Run all matching PostToolUse hooks. Returns the (possibly modified) result.
169pub fn run_post_tool_hooks(tool_name: &str, result: &str) -> String {
170    TOOL_HOOKS.with(|hooks| {
171        let hooks = hooks.borrow();
172        let mut current = result.to_string();
173        for hook in hooks.iter() {
174            if !glob_match(&hook.pattern, tool_name) {
175                continue;
176            }
177            if let Some(ref post) = hook.post {
178                match post(tool_name, &current) {
179                    PostToolAction::Pass => {}
180                    PostToolAction::Modify(new_result) => {
181                        current = new_result;
182                    }
183                }
184            }
185        }
186        current
187    })
188}
189
190// ── Auto-compaction ───────────────────────────────────────────────────
191
/// How archived transcript messages are condensed during auto-compaction.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum CompactStrategy {
    /// Summarize archived messages with an LLM call.
    Llm,
    /// Abbreviate each archived message mechanically (no LLM).
    Truncate,
    /// Delegate to a user-supplied Harn callback.
    Custom,
}
198
199pub fn parse_compact_strategy(value: &str) -> Result<CompactStrategy, VmError> {
200    match value {
201        "llm" => Ok(CompactStrategy::Llm),
202        "truncate" => Ok(CompactStrategy::Truncate),
203        "custom" => Ok(CompactStrategy::Custom),
204        other => Err(VmError::Runtime(format!(
205            "unknown compact_strategy '{other}' (expected 'llm', 'truncate', or 'custom')"
206        ))),
207    }
208}
209
/// Configuration for automatic transcript compaction in agent loops.
///
/// Token figures are estimates produced by `estimate_message_tokens`
/// (chars / 4 heuristic), not exact tokenizer counts.
#[derive(Clone, Debug)]
pub struct AutoCompactConfig {
    /// Maximum estimated tokens before triggering auto-compaction.
    pub token_threshold: usize,
    /// Maximum character length for a single tool result before microcompaction.
    pub tool_output_max_chars: usize,
    /// Number of recent messages to keep during auto-compaction.
    pub keep_last: usize,
    /// Strategy used to summarize archived messages.
    pub compact_strategy: CompactStrategy,
    /// Optional Harn callback used when `compact_strategy` is `custom`.
    pub custom_compactor: Option<VmValue>,
}
224
impl Default for AutoCompactConfig {
    /// Conservative defaults: compact at ~80k estimated tokens, clip single
    /// tool outputs at 20k chars, keep the 8 newest messages, and summarize
    /// archived history with the LLM strategy.
    fn default() -> Self {
        Self {
            token_threshold: 80_000,
            tool_output_max_chars: 20_000,
            keep_last: 8,
            compact_strategy: CompactStrategy::Llm,
            custom_compactor: None,
        }
    }
}
236
237/// Estimate token count from a list of JSON messages (chars / 4 heuristic).
238pub fn estimate_message_tokens(messages: &[serde_json::Value]) -> usize {
239    messages
240        .iter()
241        .map(|m| {
242            m.get("content")
243                .and_then(|c| c.as_str())
244                .map(|s| s.len())
245                .unwrap_or(0)
246        })
247        .sum::<usize>()
248        / 4
249}
250
/// Microcompact a tool result: if it exceeds `max_chars`, keep the first and
/// last portions with a snip marker in between.
///
/// Lines that look like diagnostics (compiler errors, panics, test
/// failures) are additionally preserved verbatim — up to 32 of them — so a
/// follow-on agent keeps the actionable details even after snipping.
/// Outputs at or under `max_chars`, or budgets under 200 chars, are
/// returned unchanged.
///
/// NOTE(review): relies on `str::floor_char_boundary` /
/// `ceil_char_boundary`, which are unstable std APIs — this file
/// presumably builds on nightly; confirm toolchain pinning.
pub fn microcompact_tool_output(output: &str, max_chars: usize) -> String {
    // Nothing to do for small outputs; tiny budgets aren't worth snipping.
    if output.len() <= max_chars || max_chars < 200 {
        return output.to_string();
    }
    let diagnostic_lines = output
        .lines()
        .filter(|line| {
            let trimmed = line.trim();
            let lower = trimmed.to_lowercase();
            // file:line pattern (e.g. "src/main.rs:42:" or "foo.go:10:5:")
            let has_file_line = {
                let bytes = trimmed.as_bytes();
                let mut i = 0;
                let mut found_colon = false;
                // Scan to the first ':' and require a digit right after it.
                while i < bytes.len() {
                    if bytes[i] == b':' {
                        found_colon = true;
                        break;
                    }
                    i += 1;
                }
                found_colon && i + 1 < bytes.len() && bytes[i + 1].is_ascii_digit()
            };
            // Generic keyword classification. Split into "strong" keywords
            // whose presence alone signals a diagnostic line (regardless of
            // whether the line also has a file:line reference), and "weak"
            // keywords which only count as diagnostic when paired with a
            // file:line (to avoid false positives on narrative prose that
            // happens to contain the word "error" or "expected").
            //
            // These are deliberately generic — not tied to any specific
            // language's test runner output format. Language-specific
            // patterns (Go's "--- FAIL:", pytest's "FAILED tests/", Rust's
            // "thread 'X' panicked at") should be supplied by the pipeline
            // via the `extra_diagnostic_patterns` auto_compact option,
            // which is where language/runner awareness belongs.
            let has_strong_keyword =
                trimmed.contains("FAIL") || trimmed.contains("panic") || trimmed.contains("Panic");
            let has_weak_keyword = trimmed.contains("error")
                || trimmed.contains("undefined")
                || trimmed.contains("expected")
                || trimmed.contains("got")
                || lower.contains("cannot find")
                || lower.contains("not found")
                || lower.contains("no such")
                || lower.contains("unresolved")
                || lower.contains("missing")
                || lower.contains("declared but not used")
                || lower.contains("unused")
                || lower.contains("mismatch");
            // Positional forms ("error:", "warning:" at line start) count on
            // their own, like strong keywords.
            let positional = lower.contains(" error ")
                || lower.starts_with("error:")
                || lower.starts_with("warning:")
                || lower.starts_with("note:")
                || lower.contains("panic:");
            has_strong_keyword || (has_file_line && has_weak_keyword) || positional
        })
        .take(32)
        .collect::<Vec<_>>();
    if !diagnostic_lines.is_empty() {
        let diagnostics = diagnostic_lines.join("\n");
        // Reserve room for the diagnostics and marker text, then split the
        // remainder evenly between head and tail.
        let budget = max_chars.saturating_sub(diagnostics.len() + 64);
        let keep = budget / 2;
        // Only use this layout when there is a meaningful head/tail left and
        // head and tail cannot overlap.
        if keep >= 80 && output.len() > keep * 2 {
            let head_end = output.floor_char_boundary(keep);
            let tail_start = output.ceil_char_boundary(output.len() - keep);
            let head = &output[..head_end];
            let tail = &output[tail_start..];
            return format!(
                "{head}\n\n[diagnostic lines preserved]\n{diagnostics}\n\n[... output compacted ...]\n\n{tail}"
            );
        }
    }
    // Fallback: plain head/tail snip with no diagnostics section.
    let keep = max_chars / 2;
    let head_end = output.floor_char_boundary(keep);
    let tail_start = output.ceil_char_boundary(output.len() - keep);
    let head = &output[..head_end];
    let tail = &output[tail_start..];
    // Approximate count: actual kept length is head + tail, which may differ
    // from max_chars by a few bytes due to char-boundary rounding.
    let snipped = output.len() - max_chars;
    format!("{head}\n\n[... {snipped} characters snipped ...]\n\n{tail}")
}
334
335fn format_compaction_messages(messages: &[serde_json::Value]) -> String {
336    messages
337        .iter()
338        .map(|msg| {
339            let role = msg
340                .get("role")
341                .and_then(|v| v.as_str())
342                .unwrap_or("user")
343                .to_uppercase();
344            let content = msg
345                .get("content")
346                .and_then(|v| v.as_str())
347                .unwrap_or_default();
348            format!("{role}: {content}")
349        })
350        .collect::<Vec<_>>()
351        .join("\n")
352}
353
354fn truncate_compaction_summary(
355    old_messages: &[serde_json::Value],
356    archived_count: usize,
357) -> String {
358    truncate_compaction_summary_with_context(old_messages, archived_count, false)
359}
360
/// Build a mechanical summary of archived messages: up to 15 entries of the
/// form `[role] content`, each clipped to ~500 chars, under a header that
/// explains why the compaction happened.
///
/// `is_llm_fallback` selects the header wording: `true` means the LLM
/// summarizer returned empty text and this is the fallback path.
///
/// NOTE(review): the trailing "... and N more" uses `archived_count - 15`,
/// but `summary_parts` also drops messages with missing/empty content, so N
/// can overcount what was actually listed — confirm whether that is
/// acceptable.
fn truncate_compaction_summary_with_context(
    old_messages: &[serde_json::Value],
    archived_count: usize,
    is_llm_fallback: bool,
) -> String {
    let per_msg_limit = 500_usize;
    let summary_parts: Vec<String> = old_messages
        .iter()
        .filter_map(|m| {
            // Skip messages without a string role/content, or empty content.
            let role = m.get("role")?.as_str()?;
            let content = m.get("content")?.as_str()?;
            if content.is_empty() {
                return None;
            }
            let truncated = if content.len() > per_msg_limit {
                format!(
                    "{}... [truncated from {} chars]",
                    // floor_char_boundary keeps the cut on a UTF-8 boundary.
                    &content[..content.floor_char_boundary(per_msg_limit)],
                    content.len()
                )
            } else {
                content.to_string()
            };
            Some(format!("[{role}] {truncated}"))
        })
        .take(15)
        .collect();
    let header = if is_llm_fallback {
        format!(
            "[auto-compact fallback: LLM summarizer returned empty; {archived_count} older messages abbreviated to ~{per_msg_limit} chars each]"
        )
    } else {
        format!("[auto-compacted {archived_count} older messages via truncate strategy]")
    };
    format!(
        "{header}\n{}{}",
        summary_parts.join("\n"),
        if archived_count > 15 {
            format!("\n... and {} more", archived_count - 15)
        } else {
            String::new()
        }
    )
}
405
406fn compact_summary_text_from_value(value: &VmValue) -> Result<String, VmError> {
407    if let Some(map) = value.as_dict() {
408        if let Some(summary) = map.get("summary").or_else(|| map.get("text")) {
409            return Ok(summary.display());
410        }
411    }
412    match value {
413        VmValue::String(text) => Ok(text.to_string()),
414        VmValue::Nil => Ok(String::new()),
415        _ => serde_json::to_string_pretty(&vm_value_to_json(value))
416            .map_err(|e| VmError::Runtime(format!("custom compactor encode error: {e}"))),
417    }
418}
419
/// Summarize archived messages with a one-shot LLM call.
///
/// Clones the caller's LLM options but strips everything that would bleed
/// state into the summarization call (system prompt, transcript linkage,
/// tools, response formatting), then sends the formatted transcript as a
/// single user message. If the model returns empty text, falls back to the
/// mechanical truncate summary so compaction never produces an empty note.
async fn llm_compaction_summary(
    old_messages: &[serde_json::Value],
    archived_count: usize,
    llm_opts: &crate::llm::api::LlmCallOptions,
) -> Result<String, VmError> {
    let mut compact_opts = llm_opts.clone();
    let formatted = format_compaction_messages(old_messages);
    // Reset call state: the summarizer must not inherit the agent's system
    // prompt, transcript threading, tool definitions, or output schema.
    compact_opts.system = None;
    compact_opts.transcript_id = None;
    compact_opts.transcript_summary = None;
    compact_opts.transcript_metadata = None;
    compact_opts.native_tools = None;
    compact_opts.tool_choice = None;
    compact_opts.response_format = None;
    compact_opts.json_schema = None;
    compact_opts.messages = vec![serde_json::json!({
        "role": "user",
        "content": format!(
            "Summarize these archived conversation messages for a follow-on coding agent. Preserve goals, constraints, decisions, completed tool work, unresolved issues, and next actions. Output only the summary text.\n\nArchived message count: {archived_count}\n\nConversation:\n{formatted}"
        ),
    })];
    let result = vm_call_llm_full(&compact_opts).await?;
    let summary = result.text.trim();
    if summary.is_empty() {
        // Model produced nothing usable — degrade to the truncate strategy.
        Ok(truncate_compaction_summary_with_context(
            old_messages,
            archived_count,
            true,
        ))
    } else {
        Ok(format!(
            "[auto-compacted {archived_count} older messages]\n{summary}"
        ))
    }
}
455
456async fn custom_compaction_summary(
457    old_messages: &[serde_json::Value],
458    archived_count: usize,
459    callback: &VmValue,
460) -> Result<String, VmError> {
461    let Some(VmValue::Closure(closure)) = Some(callback.clone()) else {
462        return Err(VmError::Runtime(
463            "compact_callback must be a closure when compact_strategy is 'custom'".to_string(),
464        ));
465    };
466    let mut vm = crate::vm::clone_async_builtin_child_vm().ok_or_else(|| {
467        VmError::Runtime(
468            "custom transcript compaction requires an async builtin VM context".to_string(),
469        )
470    })?;
471    let messages_vm = VmValue::List(Rc::new(
472        old_messages
473            .iter()
474            .map(crate::stdlib::json_to_vm_value)
475            .collect(),
476    ));
477    let result = vm.call_closure_pub(&closure, &[messages_vm], &[]).await;
478    let summary = compact_summary_text_from_value(&result?)?;
479    if summary.trim().is_empty() {
480        Ok(truncate_compaction_summary(old_messages, archived_count))
481    } else {
482        Ok(format!(
483            "[auto-compacted {archived_count} older messages]\n{summary}"
484        ))
485    }
486}
487
488/// Auto-compact a message list in place: summarize older messages into a
489/// note and keep the most recent `keep_last` messages.
490pub(crate) async fn auto_compact_messages(
491    messages: &mut Vec<serde_json::Value>,
492    config: &AutoCompactConfig,
493    llm_opts: Option<&crate::llm::api::LlmCallOptions>,
494) -> Result<Option<String>, VmError> {
495    if messages.len() <= config.keep_last {
496        return Ok(None);
497    }
498    let split_at = messages.len().saturating_sub(config.keep_last);
499    let old_messages: Vec<_> = messages.drain(..split_at).collect();
500    let archived_count = old_messages.len();
501    let summary = match config.compact_strategy {
502        CompactStrategy::Truncate => truncate_compaction_summary(&old_messages, archived_count),
503        CompactStrategy::Llm => {
504            llm_compaction_summary(
505                &old_messages,
506                archived_count,
507                llm_opts.ok_or_else(|| {
508                    VmError::Runtime(
509                        "LLM transcript compaction requires active LLM call options".to_string(),
510                    )
511                })?,
512            )
513            .await?
514        }
515        CompactStrategy::Custom => {
516            custom_compaction_summary(
517                &old_messages,
518                archived_count,
519                config.custom_compactor.as_ref().ok_or_else(|| {
520                    VmError::Runtime(
521                        "compact_callback is required when compact_strategy is 'custom'"
522                            .to_string(),
523                    )
524                })?,
525            )
526            .await?
527        }
528    };
529    messages.insert(
530        0,
531        serde_json::json!({
532            "role": "user",
533            "content": summary,
534        }),
535    );
536    Ok(Some(summary))
537}
538
539// ── Adaptive context assembly ─────────────────────────────────────────
540
541/// Snip an artifact's text to fit within a token budget.
542pub fn microcompact_artifact(artifact: &mut ArtifactRecord, max_tokens: usize) {
543    let max_chars = max_tokens * 4;
544    if let Some(ref text) = artifact.text {
545        if text.len() > max_chars && max_chars >= 200 {
546            artifact.text = Some(microcompact_tool_output(text, max_chars));
547            artifact.estimated_tokens = Some(max_tokens);
548        }
549    }
550}
551
/// Deduplicate artifacts by removing those with identical text content.
///
/// Keeps the FIRST occurrence of each distinct text — `retain` scans front
/// to back, so list order (not priority) decides the survivor. The previous
/// doc claim that the higher-priority duplicate is kept was wrong.
/// Artifacts with empty or absent text are always kept.
pub fn dedup_artifacts(artifacts: &mut Vec<ArtifactRecord>) {
    let mut seen_hashes: BTreeSet<u64> = BTreeSet::new();
    artifacts.retain(|artifact| {
        let text = artifact.text.as_deref().unwrap_or("");
        if text.is_empty() {
            return true;
        }
        // Simple hash for dedup (64-bit DefaultHasher; collisions are
        // theoretically possible but vanishingly unlikely here).
        let hash = {
            use std::hash::{Hash, Hasher};
            let mut hasher = std::collections::hash_map::DefaultHasher::new();
            text.hash(&mut hasher);
            hasher.finish()
        };
        // `insert` returns false when the hash was already seen → drop it.
        seen_hashes.insert(hash)
    });
}
571
572/// Enhanced artifact selection: dedup, microcompact oversized artifacts,
573/// then delegate to the standard `select_artifacts`.
574pub fn select_artifacts_adaptive(
575    mut artifacts: Vec<ArtifactRecord>,
576    policy: &ContextPolicy,
577) -> Vec<ArtifactRecord> {
578    // Phase 1: deduplicate
579    dedup_artifacts(&mut artifacts);
580
581    // Phase 2: microcompact oversized artifacts relative to budget.
582    // Cap individual artifacts to a fraction of the total budget, but don't
583    // let the per-artifact cap exceed the total budget (avoid overrun).
584    if let Some(max_tokens) = policy.max_tokens {
585        let count = artifacts.len().max(1);
586        let per_artifact_budget = max_tokens / count;
587        // Floor of 500 tokens, but never more than total budget
588        let cap = per_artifact_budget.max(500).min(max_tokens);
589        for artifact in &mut artifacts {
590            let est = artifact.estimated_tokens.unwrap_or(0);
591            if est > cap * 2 {
592                microcompact_artifact(artifact, cap);
593            }
594        }
595    }
596
597    // Phase 3: standard selection with budget
598    select_artifacts(artifacts, policy)
599}
600
601// ── Per-agent policy with argument patterns ───────────────────────────
602
/// Extended policy that supports argument-level constraints.
///
/// Enforced by `enforce_tool_arg_constraints`: only the first string-valued
/// argument of a matching tool call is checked, using `glob_match` syntax.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ToolArgConstraint {
    /// Tool name to constrain.
    pub tool: String,
    /// Glob patterns that the first string argument must match.
    /// If empty, no argument constraint is applied.
    pub arg_patterns: Vec<String>,
}
613
614/// Check if a tool call satisfies argument constraints in the policy.
615pub fn enforce_tool_arg_constraints(
616    policy: &CapabilityPolicy,
617    tool_name: &str,
618    args: &serde_json::Value,
619) -> Result<(), VmError> {
620    for constraint in &policy.tool_arg_constraints {
621        if !glob_match(&constraint.tool, tool_name) {
622            continue;
623        }
624        if constraint.arg_patterns.is_empty() {
625            continue;
626        }
627        // Extract the first string-like argument for pattern matching
628        let first_arg = args
629            .as_object()
630            .and_then(|o| o.values().next())
631            .and_then(|v| v.as_str())
632            .or_else(|| args.as_str())
633            .unwrap_or("");
634        let matches = constraint
635            .arg_patterns
636            .iter()
637            .any(|pattern| glob_match(pattern, first_arg));
638        if !matches {
639            return reject_policy(format!(
640                "tool '{tool_name}' argument '{first_arg}' does not match allowed patterns: {:?}",
641                constraint.arg_patterns
642            ));
643        }
644    }
645    Ok(())
646}
647
/// Map a loose artifact-kind string onto the canonical kind vocabulary.
///
/// Known canonical kinds pass through unchanged; a few shorthand aliases
/// ("file", "transcript", "verification", "test") expand to their canonical
/// forms; blank/whitespace input becomes "artifact"; any other string is
/// preserved as-is.
fn normalize_artifact_kind(kind: &str) -> String {
    const CANONICAL: &[&str] = &[
        "resource",
        "workspace_file",
        "editor_selection",
        "workspace_snapshot",
        "transcript_summary",
        "summary",
        "plan",
        "diff",
        "git_diff",
        "patch",
        "patch_set",
        "patch_proposal",
        "diff_review",
        "review_decision",
        "verification_bundle",
        "apply_intent",
        "verification_result",
        "test_result",
        "command_result",
        "provider_payload",
        "worker_result",
        "worker_notification",
        "artifact",
    ];
    if CANONICAL.contains(&kind) {
        return kind.to_string();
    }
    let mapped = match kind {
        "file" => "workspace_file",
        "transcript" => "transcript_summary",
        "verification" => "verification_result",
        "test" => "test_result",
        blank if blank.trim().is_empty() => "artifact",
        other => other,
    };
    mapped.to_string()
}
681
/// Default context-assembly priority for an artifact kind (higher ranks
/// first): verification/test outcomes, then diff/review artifacts, plans,
/// workspace context, summaries, command output, and everything else last.
fn default_artifact_priority(kind: &str) -> i64 {
    const DIFF_LIKE: &[&str] = &[
        "diff",
        "git_diff",
        "patch",
        "patch_set",
        "patch_proposal",
        "diff_review",
        "review_decision",
        "apply_intent",
    ];
    const WORKSPACE_LIKE: &[&str] = &[
        "workspace_file",
        "workspace_snapshot",
        "editor_selection",
        "resource",
    ];
    if matches!(kind, "verification_result" | "test_result") {
        100
    } else if kind == "verification_bundle" {
        95
    } else if DIFF_LIKE.contains(&kind) {
        90
    } else if kind == "plan" {
        80
    } else if WORKSPACE_LIKE.contains(&kind) {
        70
    } else if matches!(kind, "summary" | "transcript_summary") {
        60
    } else if kind == "command_result" {
        50
    } else {
        40
    }
}
695
/// Rank a freshness label for artifact ordering: "fresh"/"live" (3) beat
/// "recent" (2); unknown or absent labels rank 1; "stale" (0) sorts last.
fn freshness_rank(value: Option<&str>) -> i64 {
    match value {
        Some("fresh") | Some("live") => 3,
        Some("recent") => 2,
        Some("stale") => 0,
        _ => 1,
    }
}
704
/// Per-tool runtime metadata carried alongside a capability policy.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ToolRuntimePolicyMetadata {
    // Capability name → allowed operations for that capability.
    pub capabilities: BTreeMap<String, Vec<String>>,
    // Declared side-effect level (see `min_side_effect` for known values).
    pub side_effect_level: Option<String>,
    // Names of tool parameters that hold filesystem paths.
    // NOTE(review): inferred from the field name — confirm against callers.
    pub path_params: Vec<String>,
    // Mutation classification label, if declared.
    pub mutation_classification: Option<String>,
}
713
/// Capability policy for an execution: allowed tools, capability
/// operations, workspace roots, and side-effect/recursion limits.
///
/// Empty collections mean "unrestricted" in
/// [`CapabilityPolicy::intersect`], which combines a host ceiling with a
/// requested policy.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct CapabilityPolicy {
    // Allowed tool names; empty = no restriction.
    pub tools: Vec<String>,
    // Capability name → allowed operations; empty = no restriction.
    pub capabilities: BTreeMap<String, Vec<String>>,
    // Workspace roots in scope; empty = no restriction.
    pub workspace_roots: Vec<String>,
    // Side-effect level; `intersect` keeps the less-privileged of two.
    pub side_effect_level: Option<String>,
    // Recursion limit; `intersect` takes the minimum of both sides.
    pub recursion_limit: Option<usize>,
    /// Argument-level constraints for specific tools.
    #[serde(default)]
    pub tool_arg_constraints: Vec<ToolArgConstraint>,
    // Per-tool runtime metadata, keyed by tool name.
    #[serde(default)]
    pub tool_metadata: BTreeMap<String, ToolRuntimePolicyMetadata>,
}
728
impl CapabilityPolicy {
    /// Intersect `self` (the host ceiling) with a `requested` policy.
    ///
    /// Returns `Err` with a human-readable message when the request names a
    /// tool or capability operation the non-empty ceiling does not allow.
    /// Otherwise returns the combined policy: empty collections on either
    /// side mean "unrestricted" and adopt the other side's values, while
    /// two non-empty collections intersect. Side-effect level takes the
    /// less-privileged label; recursion limit takes the minimum; arg
    /// constraints from both sides are concatenated (all must hold);
    /// tool metadata prefers the requested side's entry per tool.
    pub fn intersect(&self, requested: &CapabilityPolicy) -> Result<CapabilityPolicy, String> {
        // Less-privileged side-effect level wins when both are set.
        let side_effect_level = match (&self.side_effect_level, &requested.side_effect_level) {
            (Some(a), Some(b)) => Some(min_side_effect(a, b).to_string()),
            (Some(a), None) => Some(a.clone()),
            (None, Some(b)) => Some(b.clone()),
            (None, None) => None,
        };

        // A non-empty ceiling rejects any requested tool outside it.
        if !self.tools.is_empty() {
            let denied: Vec<String> = requested
                .tools
                .iter()
                .filter(|tool| !self.tools.contains(*tool))
                .cloned()
                .collect();
            if !denied.is_empty() {
                return Err(format!(
                    "requested tools exceed host ceiling: {}",
                    denied.join(", ")
                ));
            }
        }

        // Same for capability operations: each requested op must be allowed
        // under the ceiling's entry for that capability (an empty ceiling
        // map means unrestricted).
        for (capability, requested_ops) in &requested.capabilities {
            if let Some(allowed_ops) = self.capabilities.get(capability) {
                let denied: Vec<String> = requested_ops
                    .iter()
                    .filter(|op| !allowed_ops.contains(*op))
                    .cloned()
                    .collect();
                if !denied.is_empty() {
                    return Err(format!(
                        "requested capability operations exceed host ceiling: {}.{}",
                        capability,
                        denied.join(",")
                    ));
                }
            } else if !self.capabilities.is_empty() {
                return Err(format!(
                    "requested capability exceeds host ceiling: {capability}"
                ));
            }
        }

        // Tools: empty side defers to the other; both non-empty intersect.
        let tools = if self.tools.is_empty() {
            requested.tools.clone()
        } else if requested.tools.is_empty() {
            self.tools.clone()
        } else {
            requested
                .tools
                .iter()
                .filter(|tool| self.tools.contains(*tool))
                .cloned()
                .collect()
        };

        // Capabilities: same empty-defers rule; intersection keeps only
        // capabilities present on both sides, with op lists intersected.
        let capabilities = if self.capabilities.is_empty() {
            requested.capabilities.clone()
        } else if requested.capabilities.is_empty() {
            self.capabilities.clone()
        } else {
            requested
                .capabilities
                .iter()
                .filter_map(|(capability, requested_ops)| {
                    self.capabilities.get(capability).map(|allowed_ops| {
                        (
                            capability.clone(),
                            requested_ops
                                .iter()
                                .filter(|op| allowed_ops.contains(*op))
                                .cloned()
                                .collect::<Vec<_>>(),
                        )
                    })
                })
                .collect()
        };

        // Workspace roots: same empty-defers / intersect rule.
        let workspace_roots = if self.workspace_roots.is_empty() {
            requested.workspace_roots.clone()
        } else if requested.workspace_roots.is_empty() {
            self.workspace_roots.clone()
        } else {
            requested
                .workspace_roots
                .iter()
                .filter(|root| self.workspace_roots.contains(*root))
                .cloned()
                .collect()
        };

        // Recursion limit: tighter (smaller) bound wins.
        let recursion_limit = match (self.recursion_limit, requested.recursion_limit) {
            (Some(a), Some(b)) => Some(a.min(b)),
            (Some(a), None) => Some(a),
            (None, Some(b)) => Some(b),
            (None, None) => None,
        };

        // Merge arg constraints from both sides
        let mut tool_arg_constraints = self.tool_arg_constraints.clone();
        tool_arg_constraints.extend(requested.tool_arg_constraints.clone());

        // Metadata only for tools that survived; requested side wins when
        // both define an entry for the same tool.
        let tool_metadata = tools
            .iter()
            .filter_map(|tool| {
                requested
                    .tool_metadata
                    .get(tool)
                    .or_else(|| self.tool_metadata.get(tool))
                    .cloned()
                    .map(|metadata| (tool.clone(), metadata))
            })
            .collect();

        Ok(CapabilityPolicy {
            tools,
            capabilities,
            workspace_roots,
            side_effect_level,
            recursion_limit,
            tool_arg_constraints,
            tool_metadata,
        })
    }
}
857
/// Returns whichever of the two side-effect levels is less privileged.
///
/// Known levels, least to most privileged: "none" < "read_only" <
/// "workspace_write" < "process_exec" < "network". Any unrecognized
/// string ranks above every known level, so a known level always wins
/// against an unknown one. Ties resolve to `a`.
fn min_side_effect<'a>(a: &'a str, b: &'a str) -> &'a str {
    // Position in this table is the privilege rank; absent => rank 5.
    const ORDER: [&str; 5] = [
        "none",
        "read_only",
        "workspace_write",
        "process_exec",
        "network",
    ];
    let rank = |level: &str| {
        ORDER
            .iter()
            .position(|known| *known == level)
            .unwrap_or(ORDER.len())
    };
    if rank(a) <= rank(b) {
        a
    } else {
        b
    }
}
875
/// Per-node LLM model selection overrides (provider, model, tier, and
/// sampling limits). Every field is optional; `#[serde(default)]` lets
/// workflow JSON supply any subset.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct ModelPolicy {
    pub provider: Option<String>,
    pub model: Option<String>,
    pub model_tier: Option<String>,
    pub temperature: Option<f64>,
    pub max_tokens: Option<i64>,
}
885
/// Controls how a stage's conversation transcript is recorded and trimmed.
/// All fields optional/defaulted so partial JSON configs parse.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct TranscriptPolicy {
    pub mode: Option<String>,
    pub visibility: Option<String>,
    pub summarize: bool,
    pub compact: bool,
    // Number of most-recent entries to retain when trimming — presumably
    // applied during compaction; confirm against the transcript handling code.
    pub keep_last: Option<usize>,
}
895
/// Artifact-selection policy for assembling a stage's context window:
/// size caps, kind include/exclude/prioritize lists, pinning, and
/// ordering preferences.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ContextPolicy {
    pub max_artifacts: Option<usize>,
    pub max_tokens: Option<usize>,
    // Tokens held back from the budget — NOTE(review): assumed reserved for
    // the model response; confirm against the context assembly code.
    pub reserve_tokens: Option<usize>,
    pub include_kinds: Vec<String>,
    pub exclude_kinds: Vec<String>,
    pub prioritize_kinds: Vec<String>,
    // Artifact ids that are always included regardless of other filters.
    pub pinned_ids: Vec<String>,
    pub include_stages: Vec<String>,
    pub prefer_recent: bool,
    pub prefer_fresh: bool,
    pub render: Option<String>,
}
911
/// Stage retry configuration. `max_attempts == 0` is normalized to 1 in
/// `normalize_workflow_value`, so a defaulted policy still runs once.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RetryPolicy {
    pub max_attempts: usize,
    pub verify: bool,
    pub repair: bool,
}
919
/// Input/output contract for a workflow node: which artifact kinds flow
/// in/out, how many inputs are acceptable, and an optional JSON schema
/// for structured outputs. `min_inputs > max_inputs` is rejected by
/// `validate_workflow`.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct StageContract {
    pub input_kinds: Vec<String>,
    pub output_kinds: Vec<String>,
    pub min_inputs: Option<usize>,
    pub max_inputs: Option<usize>,
    pub require_transcript: bool,
    pub schema: Option<serde_json::Value>,
}
930
/// Maps a node's logical outcomes (success/failure, verify pass/fail,
/// condition truth, loop continue/exit, escalation) to edge branch
/// labels. Each field is the branch name to follow for that outcome.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct BranchSemantics {
    pub success: Option<String>,
    pub failure: Option<String>,
    pub verify_pass: Option<String>,
    pub verify_fail: Option<String>,
    pub condition_true: Option<String>,
    pub condition_false: Option<String>,
    pub loop_continue: Option<String>,
    pub loop_exit: Option<String>,
    pub escalation: Option<String>,
}
944
/// Configuration for "map" nodes: the items to fan out over, the artifact
/// kinds used for per-item inputs/outputs, and an optional item cap.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct MapPolicy {
    pub items: Vec<serde_json::Value>,
    pub item_artifact_kind: Option<String>,
    pub output_kind: Option<String>,
    pub max_items: Option<usize>,
}
953
/// Configuration for "join" nodes. An empty `strategy` is defaulted to
/// "all" by `normalize_workflow_value`.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct JoinPolicy {
    pub strategy: String,
    pub require_all_inputs: bool,
    pub min_completed: Option<usize>,
}
961
/// Configuration for "reduce" nodes. An empty `strategy` is defaulted to
/// "concat" by `normalize_workflow_value`; `output_kind` also seeds the
/// node's default output contract.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReducePolicy {
    pub strategy: String,
    pub separator: Option<String>,
    pub output_kind: Option<String>,
}
969
/// Configuration for "escalation" nodes: severity level, target queue,
/// and a human-readable reason.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct EscalationPolicy {
    pub level: Option<String>,
    pub queue: Option<String>,
    pub reason: Option<String>,
}
977
/// A unit of data produced/consumed by workflow stages: freeform text
/// and/or structured JSON, plus provenance and context-selection hints.
/// Parsed records should be passed through `normalize()` to back-fill
/// identity fields and the token estimate.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct ArtifactRecord {
    // Serialized as "_type"; normalized to "artifact" when empty.
    #[serde(rename = "_type")]
    pub type_name: String,
    pub id: String,
    pub kind: String,
    pub title: Option<String>,
    pub text: Option<String>,
    pub data: Option<serde_json::Value>,
    pub source: Option<String>,
    pub created_at: String,
    pub freshness: Option<String>,
    // Context-selection weight; defaulted per kind during normalize().
    pub priority: Option<i64>,
    // Ids of the artifacts this one was derived from.
    pub lineage: Vec<String>,
    pub relevance: Option<f64>,
    // Rough size for context budgeting; estimated from text length if unset.
    pub estimated_tokens: Option<usize>,
    pub stage: Option<String>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
998
999impl ArtifactRecord {
1000    pub fn normalize(mut self) -> Self {
1001        if self.type_name.is_empty() {
1002            self.type_name = "artifact".to_string();
1003        }
1004        if self.id.is_empty() {
1005            self.id = new_id("artifact");
1006        }
1007        if self.created_at.is_empty() {
1008            self.created_at = now_rfc3339();
1009        }
1010        if self.kind.is_empty() {
1011            self.kind = "artifact".to_string();
1012        }
1013        self.kind = normalize_artifact_kind(&self.kind);
1014        if self.estimated_tokens.is_none() {
1015            self.estimated_tokens = self
1016                .text
1017                .as_ref()
1018                .map(|text| ((text.len() as f64) / 4.0).ceil() as usize);
1019        }
1020        if self.priority.is_none() {
1021            self.priority = Some(default_artifact_priority(&self.kind));
1022        }
1023        self
1024    }
1025}
1026
/// A single node in a workflow graph: its kind ("stage", "verify", "map",
/// "reduce", "condition", "escalation", ...), prompt material, and the
/// full set of per-node policies and contracts. Missing fields are filled
/// by `normalize_workflow_value`.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct WorkflowNode {
    // Set to the node's map key during normalization when absent.
    pub id: Option<String>,
    pub kind: String,
    pub mode: Option<String>,
    pub prompt: Option<String>,
    pub system: Option<String>,
    pub task_label: Option<String>,
    pub done_sentinel: Option<String>,
    // Raw tools value; interpreted by workflow_tool_names/-_metadata
    // (array of tool objects, single object, or tool_registry wrapper).
    pub tools: serde_json::Value,
    pub model_policy: ModelPolicy,
    pub transcript_policy: TranscriptPolicy,
    pub context_policy: ContextPolicy,
    pub retry_policy: RetryPolicy,
    pub capability_policy: CapabilityPolicy,
    pub input_contract: StageContract,
    pub output_contract: StageContract,
    pub branch_semantics: BranchSemantics,
    pub map_policy: MapPolicy,
    pub join_policy: JoinPolicy,
    pub reduce_policy: ReducePolicy,
    pub escalation_policy: EscalationPolicy,
    // Verification spec (assert_text/command/expect_status/expect_text);
    // synthesized for flat-form "verify" nodes during normalization.
    pub verify: Option<serde_json::Value>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
1053
1054pub fn workflow_tool_names(value: &serde_json::Value) -> Vec<String> {
1055    match value {
1056        serde_json::Value::Null => Vec::new(),
1057        serde_json::Value::Array(items) => items
1058            .iter()
1059            .filter_map(|item| match item {
1060                serde_json::Value::Object(map) => map
1061                    .get("name")
1062                    .and_then(|value| value.as_str())
1063                    .filter(|name| !name.is_empty())
1064                    .map(|name| name.to_string()),
1065                _ => None,
1066            })
1067            .collect(),
1068        serde_json::Value::Object(map) => {
1069            if map.get("_type").and_then(|value| value.as_str()) == Some("tool_registry") {
1070                return map
1071                    .get("tools")
1072                    .map(workflow_tool_names)
1073                    .unwrap_or_default();
1074            }
1075            map.get("name")
1076                .and_then(|value| value.as_str())
1077                .filter(|name| !name.is_empty())
1078                .map(|name| vec![name.to_string()])
1079                .unwrap_or_default()
1080        }
1081        _ => Vec::new(),
1082    }
1083}
1084
/// Picks the most privileged level from an iterator of side-effect level
/// strings, or `None` if the iterator is empty.
///
/// Ranking mirrors `min_side_effect`: "none" < "read_only" <
/// "workspace_write" < "process_exec" < "network", with unrecognized
/// strings ranking above every known level. On rank ties the later
/// element wins (matching `Iterator::max_by_key` semantics).
fn max_side_effect_level(levels: impl Iterator<Item = String>) -> Option<String> {
    const ORDER: [&str; 5] = [
        "none",
        "read_only",
        "workspace_write",
        "process_exec",
        "network",
    ];
    let rank = |level: &str| {
        ORDER
            .iter()
            .position(|known| *known == level)
            .unwrap_or(ORDER.len())
    };
    levels.fold(None, |best: Option<String>, candidate| match best {
        // Keep the current best only when it strictly outranks the
        // candidate; otherwise the candidate (later element) takes over.
        Some(current) if rank(&candidate) < rank(&current) => Some(current),
        _ => Some(candidate),
    })
}
1098
1099fn parse_tool_runtime_policy(
1100    map: &serde_json::Map<String, serde_json::Value>,
1101) -> ToolRuntimePolicyMetadata {
1102    let Some(policy) = map.get("policy").and_then(|value| value.as_object()) else {
1103        return ToolRuntimePolicyMetadata::default();
1104    };
1105
1106    let capabilities = policy
1107        .get("capabilities")
1108        .and_then(|value| value.as_object())
1109        .map(|caps| {
1110            caps.iter()
1111                .map(|(capability, ops)| {
1112                    let values = ops
1113                        .as_array()
1114                        .map(|items| {
1115                            items
1116                                .iter()
1117                                .filter_map(|item| item.as_str().map(|s| s.to_string()))
1118                                .collect::<Vec<_>>()
1119                        })
1120                        .unwrap_or_default();
1121                    (capability.clone(), values)
1122                })
1123                .collect::<BTreeMap<_, _>>()
1124        })
1125        .unwrap_or_default();
1126
1127    let path_params = policy
1128        .get("path_params")
1129        .and_then(|value| value.as_array())
1130        .map(|items| {
1131            items
1132                .iter()
1133                .filter_map(|item| item.as_str().map(|s| s.to_string()))
1134                .collect::<Vec<_>>()
1135        })
1136        .unwrap_or_default();
1137
1138    ToolRuntimePolicyMetadata {
1139        capabilities,
1140        side_effect_level: policy
1141            .get("side_effect_level")
1142            .and_then(|value| value.as_str())
1143            .map(|s| s.to_string()),
1144        path_params,
1145        mutation_classification: policy
1146            .get("mutation_classification")
1147            .and_then(|value| value.as_str())
1148            .map(|s| s.to_string()),
1149    }
1150}
1151
1152pub fn workflow_tool_metadata(
1153    value: &serde_json::Value,
1154) -> BTreeMap<String, ToolRuntimePolicyMetadata> {
1155    match value {
1156        serde_json::Value::Null => BTreeMap::new(),
1157        serde_json::Value::Array(items) => items
1158            .iter()
1159            .filter_map(|item| match item {
1160                serde_json::Value::Object(map) => map
1161                    .get("name")
1162                    .and_then(|value| value.as_str())
1163                    .filter(|name| !name.is_empty())
1164                    .map(|name| (name.to_string(), parse_tool_runtime_policy(map))),
1165                _ => None,
1166            })
1167            .collect(),
1168        serde_json::Value::Object(map) => {
1169            if map.get("_type").and_then(|value| value.as_str()) == Some("tool_registry") {
1170                return map
1171                    .get("tools")
1172                    .map(workflow_tool_metadata)
1173                    .unwrap_or_default();
1174            }
1175            map.get("name")
1176                .and_then(|value| value.as_str())
1177                .filter(|name| !name.is_empty())
1178                .map(|name| {
1179                    let mut metadata = BTreeMap::new();
1180                    metadata.insert(name.to_string(), parse_tool_runtime_policy(map));
1181                    metadata
1182                })
1183                .unwrap_or_default()
1184        }
1185        _ => BTreeMap::new(),
1186    }
1187}
1188
1189pub fn workflow_tool_policy_from_tools(value: &serde_json::Value) -> CapabilityPolicy {
1190    let tools = workflow_tool_names(value);
1191    let tool_metadata = workflow_tool_metadata(value);
1192    let mut capabilities: BTreeMap<String, Vec<String>> = BTreeMap::new();
1193    for metadata in tool_metadata.values() {
1194        for (capability, ops) in &metadata.capabilities {
1195            let entry = capabilities.entry(capability.clone()).or_default();
1196            for op in ops {
1197                if !entry.contains(op) {
1198                    entry.push(op.clone());
1199                }
1200            }
1201            entry.sort();
1202        }
1203    }
1204    let side_effect_level = max_side_effect_level(
1205        tool_metadata
1206            .values()
1207            .filter_map(|metadata| metadata.side_effect_level.clone()),
1208    );
1209    CapabilityPolicy {
1210        tools,
1211        capabilities,
1212        workspace_roots: Vec::new(),
1213        side_effect_level,
1214        recursion_limit: None,
1215        tool_arg_constraints: Vec::new(),
1216        tool_metadata,
1217    }
1218}
1219
/// A directed edge between two workflow nodes. `branch` restricts the
/// edge to a specific outcome label (e.g. "true"/"false" for condition
/// nodes); `None` means the edge is unconditional.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct WorkflowEdge {
    pub from: String,
    pub to: String,
    pub branch: Option<String>,
    pub label: Option<String>,
}
1228
/// A complete workflow definition: node map, edge list, entry node, and a
/// graph-wide capability ceiling. Produced/defaulted by
/// `normalize_workflow_value` and checked by `validate_workflow`.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct WorkflowGraph {
    // Serialized as "_type"; normalized to "workflow_graph" when empty.
    #[serde(rename = "_type")]
    pub type_name: String,
    pub id: String,
    pub name: Option<String>,
    // Normalized to 1 when 0.
    pub version: usize,
    // Id of the starting node; defaulted to the first node key when empty.
    pub entry: String,
    pub nodes: BTreeMap<String, WorkflowNode>,
    pub edges: Vec<WorkflowEdge>,
    pub capability_policy: CapabilityPolicy,
    pub metadata: BTreeMap<String, serde_json::Value>,
    pub audit_log: Vec<WorkflowAuditEntry>,
}
1244
/// One entry in a workflow's audit log: what operation happened, on which
/// node (if any), when, and why.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct WorkflowAuditEntry {
    pub id: String,
    pub op: String,
    pub node_id: Option<String>,
    pub timestamp: String,
    pub reason: Option<String>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
1255
/// Aggregated LLM usage for a stage or run: token counts, wall-clock
/// duration, call count, cost, and the model names involved.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct LlmUsageRecord {
    pub input_tokens: i64,
    pub output_tokens: i64,
    pub total_duration_ms: i64,
    pub call_count: i64,
    pub total_cost: f64,
    pub models: Vec<String>,
}
1266
/// The recorded execution of one workflow node within a run: status and
/// outcome, timing, transcript and verification payloads, usage, and the
/// artifacts consumed/produced. `attempts` holds one entry per retry.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct RunStageRecord {
    pub id: String,
    pub node_id: String,
    pub kind: String,
    pub status: String,
    pub outcome: String,
    // Branch label taken out of this stage, if any.
    pub branch: Option<String>,
    pub started_at: String,
    pub finished_at: Option<String>,
    pub visible_text: Option<String>,
    pub private_reasoning: Option<String>,
    pub transcript: Option<serde_json::Value>,
    pub verification: Option<serde_json::Value>,
    pub usage: Option<LlmUsageRecord>,
    pub artifacts: Vec<ArtifactRecord>,
    pub consumed_artifact_ids: Vec<String>,
    pub produced_artifact_ids: Vec<String>,
    pub attempts: Vec<RunStageAttemptRecord>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
1289
/// One attempt of a stage (retries produce multiple records): its
/// ordinal, result, optional error/verification detail, and timing.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct RunStageAttemptRecord {
    pub attempt: usize,
    pub status: String,
    pub outcome: String,
    pub branch: Option<String>,
    pub error: Option<String>,
    pub verification: Option<serde_json::Value>,
    pub started_at: String,
    pub finished_at: Option<String>,
}
1302
/// A recorded traversal of one workflow edge during a run: where control
/// came from, where it went, which branch was taken, and which artifacts
/// crossed the edge. `from_*` is `None` for the entry transition.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunTransitionRecord {
    pub id: String,
    pub from_stage_id: Option<String>,
    pub from_node_id: Option<String>,
    pub to_node_id: String,
    pub branch: Option<String>,
    pub timestamp: String,
    pub consumed_artifact_ids: Vec<String>,
    pub produced_artifact_ids: Vec<String>,
}
1315
/// A persisted point-in-time snapshot of run progress (ready vs completed
/// nodes), used to resume an interrupted run. `reason` records why the
/// checkpoint was taken.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunCheckpointRecord {
    pub id: String,
    pub ready_nodes: Vec<String>,
    pub completed_nodes: Vec<String>,
    pub last_stage_id: Option<String>,
    pub persisted_at: String,
    pub reason: String,
}
1326
/// A replay/regression fixture captured from a run: the expected final
/// status plus per-stage assertions to check when the workflow is
/// re-executed or replayed.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayFixture {
    // Serialized as "_type".
    #[serde(rename = "_type")]
    pub type_name: String,
    pub id: String,
    pub source_run_id: String,
    pub workflow_id: String,
    pub workflow_name: Option<String>,
    pub created_at: String,
    pub expected_status: String,
    pub stage_assertions: Vec<ReplayStageAssertion>,
}
1340
/// Expected result of one stage in a replay fixture: status, outcome,
/// branch, the artifact kinds it must produce, and an optional substring
/// check on its visible text.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayStageAssertion {
    pub node_id: String,
    pub expected_status: String,
    pub expected_outcome: String,
    pub expected_branch: Option<String>,
    pub required_artifact_kinds: Vec<String>,
    pub visible_text_contains: Option<String>,
}
1351
/// Result of evaluating one run against a replay fixture: overall
/// pass/fail, the individual failure messages, and how many stages were
/// checked.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayEvalReport {
    pub pass: bool,
    pub failures: Vec<String>,
    pub stage_count: usize,
}
1359
/// Per-case result inside an eval suite: which run was evaluated, its
/// pass/fail verdict with failure detail, and an optional run-vs-run
/// comparison when the case declared a baseline.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayEvalCaseReport {
    pub run_id: String,
    pub workflow_id: String,
    pub label: Option<String>,
    pub pass: bool,
    pub failures: Vec<String>,
    pub stage_count: usize,
    pub source_path: Option<String>,
    pub comparison: Option<RunDiffReport>,
}
1372
/// Aggregate result of an eval suite: overall verdict, pass/fail counts,
/// and the individual case reports.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayEvalSuiteReport {
    pub pass: bool,
    pub total: usize,
    pub passed: usize,
    pub failed: usize,
    pub cases: Vec<ReplayEvalCaseReport>,
}
1382
/// One stage-level difference between two runs: the node it concerns, the
/// kind of change, and human-readable detail lines.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunStageDiffRecord {
    pub node_id: String,
    pub change: String,
    pub details: Vec<String>,
}
1390
/// Structured comparison of two runs: status change, per-stage diffs, and
/// signed deltas (right minus left — NOTE(review): direction assumed from
/// field naming; confirm against the diff implementation) in transition,
/// artifact, and checkpoint counts.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunDiffReport {
    pub left_run_id: String,
    pub right_run_id: String,
    pub identical: bool,
    pub status_changed: bool,
    pub left_status: String,
    pub right_status: String,
    pub stage_diffs: Vec<RunStageDiffRecord>,
    pub transition_count_delta: isize,
    pub artifact_count_delta: isize,
    pub checkpoint_count_delta: isize,
}
1405
/// Manifest describing an eval suite: its identity, an optional base
/// directory for resolving case paths, and the list of cases to run.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct EvalSuiteManifest {
    // Serialized as "_type".
    #[serde(rename = "_type")]
    pub type_name: String,
    pub id: String,
    pub name: Option<String>,
    pub base_dir: Option<String>,
    pub cases: Vec<EvalSuiteCase>,
}
1416
/// One eval-suite case: the run to evaluate, an optional explicit fixture
/// path, and an optional baseline run to diff against.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct EvalSuiteCase {
    pub label: Option<String>,
    pub run_path: String,
    pub fixture_path: Option<String>,
    pub compare_to: Option<String>,
}
1425
/// The full record of one workflow run: identity and lineage (parent/root
/// run ids), per-stage records, edge transitions, checkpoints, spawned
/// child runs, artifacts, the effective capability policy, usage, and an
/// optional captured replay fixture.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct RunRecord {
    // Serialized as "_type".
    #[serde(rename = "_type")]
    pub type_name: String,
    pub id: String,
    pub workflow_id: String,
    pub workflow_name: Option<String>,
    pub task: String,
    pub status: String,
    pub started_at: String,
    pub finished_at: Option<String>,
    // Lineage for nested runs spawned by worker stages.
    pub parent_run_id: Option<String>,
    pub root_run_id: Option<String>,
    pub stages: Vec<RunStageRecord>,
    pub transitions: Vec<RunTransitionRecord>,
    pub checkpoints: Vec<RunCheckpointRecord>,
    pub pending_nodes: Vec<String>,
    pub completed_nodes: Vec<String>,
    pub child_runs: Vec<RunChildRecord>,
    pub artifacts: Vec<ArtifactRecord>,
    pub policy: CapabilityPolicy,
    pub execution: Option<RunExecutionRecord>,
    pub transcript: Option<serde_json::Value>,
    pub usage: Option<LlmUsageRecord>,
    pub replay_fixture: Option<ReplayFixture>,
    pub trace_spans: Vec<RunTraceSpanRecord>,
    pub metadata: BTreeMap<String, serde_json::Value>,
    // Filesystem location this record was saved to, when persisted.
    pub persisted_path: Option<String>,
}
1456
/// A timing span captured during a run, forming a tree via `parent_id`.
/// `start_ms`/`duration_ms` are milliseconds — NOTE(review): assumed
/// relative to run start; confirm against the tracing code.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunTraceSpanRecord {
    pub span_id: u64,
    pub parent_id: Option<u64>,
    pub kind: String,
    pub name: String,
    pub start_ms: u64,
    pub duration_ms: u64,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
1468
/// Bookkeeping for a child run spawned by a worker stage: which worker
/// launched it, the mutation-session linkage (session ids, scope,
/// approval mode), its task/status/timing, and where its run data and
/// snapshot were persisted.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunChildRecord {
    pub worker_id: String,
    pub worker_name: String,
    pub parent_stage_id: Option<String>,
    pub session_id: Option<String>,
    pub parent_session_id: Option<String>,
    pub mutation_scope: Option<String>,
    pub approval_mode: Option<String>,
    pub task: String,
    pub status: String,
    pub started_at: String,
    pub finished_at: Option<String>,
    pub run_id: Option<String>,
    pub run_path: Option<String>,
    pub snapshot_path: Option<String>,
    pub execution: Option<RunExecutionRecord>,
}
1488
/// Where and how a run executed: working directory, environment, and the
/// git worktree details (repo, worktree path, branch, base ref) when an
/// isolated-checkout adapter is used. `cleanup` records the teardown
/// mode.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunExecutionRecord {
    pub cwd: Option<String>,
    pub source_dir: Option<String>,
    pub env: BTreeMap<String, String>,
    pub adapter: Option<String>,
    pub repo_path: Option<String>,
    pub worktree_path: Option<String>,
    pub branch: Option<String>,
    pub base_ref: Option<String>,
    pub cleanup: Option<String>,
}
1502
/// Output of `validate_workflow`: hard errors (graph unusable), soft
/// warnings (e.g. unreachable nodes), and the nodes reachable from entry.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct WorkflowValidationReport {
    pub valid: bool,
    pub errors: Vec<String>,
    pub warnings: Vec<String>,
    pub reachable_nodes: Vec<String>,
}
1511
1512fn parse_json_payload<T: for<'de> Deserialize<'de>>(
1513    json: serde_json::Value,
1514    label: &str,
1515) -> Result<T, VmError> {
1516    let payload = json.to_string();
1517    let mut deserializer = serde_json::Deserializer::from_str(&payload);
1518    let mut tracker = serde_path_to_error::Track::new();
1519    let path_deserializer = serde_path_to_error::Deserializer::new(&mut deserializer, &mut tracker);
1520    T::deserialize(path_deserializer).map_err(|error| {
1521        let snippet = if payload.len() > 600 {
1522            format!("{}...", &payload[..600])
1523        } else {
1524            payload.clone()
1525        };
1526        VmError::Runtime(format!(
1527            "{label} parse error at {}: {} | payload={}",
1528            tracker.path(),
1529            error,
1530            snippet
1531        ))
1532    })
1533}
1534
1535fn parse_json_value<T: for<'de> Deserialize<'de>>(value: &VmValue) -> Result<T, VmError> {
1536    parse_json_payload(vm_value_to_json(value), "orchestration")
1537}
1538
1539pub fn parse_workflow_node_value(value: &VmValue, label: &str) -> Result<WorkflowNode, VmError> {
1540    parse_json_payload(vm_value_to_json(value), label)
1541}
1542
1543pub fn parse_workflow_node_json(
1544    json: serde_json::Value,
1545    label: &str,
1546) -> Result<WorkflowNode, VmError> {
1547    parse_json_payload(json, label)
1548}
1549
1550pub fn parse_workflow_edge_json(
1551    json: serde_json::Value,
1552    label: &str,
1553) -> Result<WorkflowEdge, VmError> {
1554    parse_json_payload(json, label)
1555}
1556
/// Parses a `VmValue` into a `WorkflowGraph` and fills in defaults so the
/// rest of the orchestrator can rely on a well-formed graph.
///
/// Two input shapes are accepted:
/// - a full graph (with `nodes` populated), used as-is, or
/// - a flat act/verify/repair dict, expanded here into a small graph with
///   the conventional act -> verify -> repair edges.
///
/// Afterwards, graph-wide defaults are applied: `_type`, a generated id,
/// `version = 1`, the first node as `entry`, and per-node defaults for
/// kind, join/reduce strategies, output kinds, and retry attempts.
pub fn normalize_workflow_value(value: &VmValue) -> Result<WorkflowGraph, VmError> {
    let mut graph: WorkflowGraph = parse_json_value(value)?;
    let as_dict = value.as_dict().cloned().unwrap_or_default();

    // Flat form: no explicit nodes, so lift act/verify/repair entries
    // into graph nodes, pulling model settings down from the top level.
    if graph.nodes.is_empty() {
        for key in ["act", "verify", "repair"] {
            if let Some(node_value) = as_dict.get(key) {
                let mut node: WorkflowNode = parse_json_value(node_value)?;
                let raw_node = node_value.as_dict().cloned().unwrap_or_default();
                node.id = Some(key.to_string());
                if node.kind.is_empty() {
                    node.kind = if key == "verify" {
                        "verify".to_string()
                    } else {
                        "stage".to_string()
                    };
                }
                // Top-level model settings act as per-node fallbacks;
                // empty display strings are treated as absent.
                if node.model_policy.provider.is_none() {
                    node.model_policy.provider = as_dict
                        .get("provider")
                        .map(|value| value.display())
                        .filter(|value| !value.is_empty());
                }
                if node.model_policy.model.is_none() {
                    node.model_policy.model = as_dict
                        .get("model")
                        .map(|value| value.display())
                        .filter(|value| !value.is_empty());
                }
                if node.model_policy.model_tier.is_none() {
                    // "tier" is accepted as a shorthand for "model_tier".
                    node.model_policy.model_tier = as_dict
                        .get("model_tier")
                        .or_else(|| as_dict.get("tier"))
                        .map(|value| value.display())
                        .filter(|value| !value.is_empty());
                }
                if node.model_policy.temperature.is_none() {
                    // Accept either a float or an int for temperature.
                    node.model_policy.temperature = as_dict.get("temperature").and_then(|value| {
                        if let VmValue::Float(number) = value {
                            Some(*number)
                        } else {
                            value.as_int().map(|number| number as f64)
                        }
                    });
                }
                if node.model_policy.max_tokens.is_none() {
                    node.model_policy.max_tokens =
                        as_dict.get("max_tokens").and_then(|value| value.as_int());
                }
                if node.mode.is_none() {
                    node.mode = as_dict
                        .get("mode")
                        .map(|value| value.display())
                        .filter(|value| !value.is_empty());
                }
                if node.done_sentinel.is_none() {
                    node.done_sentinel = as_dict
                        .get("done_sentinel")
                        .map(|value| value.display())
                        .filter(|value| !value.is_empty());
                }
                // Flat "verify" nodes may express the verification spec as
                // loose keys; fold them into a single `verify` JSON object.
                if key == "verify"
                    && node.verify.is_none()
                    && (raw_node.contains_key("assert_text")
                        || raw_node.contains_key("command")
                        || raw_node.contains_key("expect_status")
                        || raw_node.contains_key("expect_text"))
                {
                    node.verify = Some(serde_json::json!({
                        "assert_text": raw_node.get("assert_text").map(vm_value_to_json),
                        "command": raw_node.get("command").map(vm_value_to_json),
                        "expect_status": raw_node.get("expect_status").map(vm_value_to_json),
                        "expect_text": raw_node.get("expect_text").map(vm_value_to_json),
                    }));
                }
                graph.nodes.insert(key.to_string(), node);
            }
        }
        if graph.entry.is_empty() && graph.nodes.contains_key("act") {
            graph.entry = "act".to_string();
        }
        // Synthesize the conventional edges for the flat form.
        // NOTE(review): the repair edges reference "verify" even when no
        // "verify" node was supplied; validate_workflow would then report
        // unknown-node errors — confirm whether act+repair without verify
        // is a supported input.
        if graph.edges.is_empty() && graph.nodes.contains_key("act") {
            if graph.nodes.contains_key("verify") {
                graph.edges.push(WorkflowEdge {
                    from: "act".to_string(),
                    to: "verify".to_string(),
                    branch: None,
                    label: None,
                });
            }
            if graph.nodes.contains_key("repair") {
                graph.edges.push(WorkflowEdge {
                    from: "verify".to_string(),
                    to: "repair".to_string(),
                    branch: Some("failed".to_string()),
                    label: None,
                });
                graph.edges.push(WorkflowEdge {
                    from: "repair".to_string(),
                    to: "verify".to_string(),
                    branch: Some("retry".to_string()),
                    label: None,
                });
            }
        }
    }

    // Graph-wide identity defaults.
    if graph.type_name.is_empty() {
        graph.type_name = "workflow_graph".to_string();
    }
    if graph.id.is_empty() {
        graph.id = new_id("workflow");
    }
    if graph.version == 0 {
        graph.version = 1;
    }
    // Fall back to the first node key (BTreeMap order) as the entry.
    if graph.entry.is_empty() {
        graph.entry = graph
            .nodes
            .keys()
            .next()
            .cloned()
            .unwrap_or_else(|| "act".to_string());
    }
    // Per-node defaults, applied to both flat-form and full-form graphs.
    for (node_id, node) in &mut graph.nodes {
        if node.id.is_none() {
            node.id = Some(node_id.clone());
        }
        if node.kind.is_empty() {
            node.kind = "stage".to_string();
        }
        if node.join_policy.strategy.is_empty() {
            node.join_policy.strategy = "all".to_string();
        }
        if node.reduce_policy.strategy.is_empty() {
            node.reduce_policy.strategy = "concat".to_string();
        }
        // Default output kind is derived from the node kind; reduce/map
        // nodes prefer their policy's explicit output_kind.
        if node.output_contract.output_kinds.is_empty() {
            node.output_contract.output_kinds = vec![match node.kind.as_str() {
                "verify" => "verification_result".to_string(),
                "reduce" => node
                    .reduce_policy
                    .output_kind
                    .clone()
                    .unwrap_or_else(|| "summary".to_string()),
                "map" => node
                    .map_policy
                    .output_kind
                    .clone()
                    .unwrap_or_else(|| "artifact".to_string()),
                "escalation" => "plan".to_string(),
                _ => "artifact".to_string(),
            }];
        }
        // Every node runs at least once.
        if node.retry_policy.max_attempts == 0 {
            node.retry_policy.max_attempts = 1;
        }
    }
    Ok(graph)
}
1717
/// Statically checks a workflow graph and reports problems without executing it.
///
/// Structural problems that make the graph unrunnable are collected as
/// `errors`; suspicious-but-runnable shapes become `warnings`. The report's
/// `valid` flag is true only when no errors were found. When a capability
/// `ceiling` is supplied, the graph-level policy and every node-level policy
/// must successfully intersect with it.
pub fn validate_workflow(
    graph: &WorkflowGraph,
    ceiling: Option<&CapabilityPolicy>,
) -> WorkflowValidationReport {
    let mut errors = Vec::new();
    let mut warnings = Vec::new();

    // The entry node must be a declared node.
    if !graph.nodes.contains_key(&graph.entry) {
        errors.push(format!("entry node does not exist: {}", graph.entry));
    }

    // Every edge endpoint must reference a declared node.
    let node_ids: BTreeSet<String> = graph.nodes.keys().cloned().collect();
    for edge in &graph.edges {
        if !node_ids.contains(&edge.from) {
            errors.push(format!("edge.from references unknown node: {}", edge.from));
        }
        if !node_ids.contains(&edge.to) {
            errors.push(format!("edge.to references unknown node: {}", edge.to));
        }
    }

    // Unreachable nodes are allowed but flagged as warnings.
    let reachable_nodes = reachable_nodes(graph);
    for node_id in &node_ids {
        if !reachable_nodes.contains(node_id) {
            warnings.push(format!("node is unreachable: {node_id}"));
        }
    }

    // Per-node checks: degree counts, input contract sanity, and
    // kind-specific edge requirements.
    for (node_id, node) in &graph.nodes {
        let incoming = graph
            .edges
            .iter()
            .filter(|edge| edge.to == *node_id)
            .count();
        let outgoing: Vec<&WorkflowEdge> = graph
            .edges
            .iter()
            .filter(|edge| edge.from == *node_id)
            .collect();
        // min/max are only checked against each other when both are set.
        if let Some(min_inputs) = node.input_contract.min_inputs {
            if let Some(max_inputs) = node.input_contract.max_inputs {
                if min_inputs > max_inputs {
                    errors.push(format!(
                        "node {node_id}: input contract min_inputs exceeds max_inputs"
                    ));
                }
            }
        }
        match node.kind.as_str() {
            // Condition nodes must have both branch edges labeled "true"/"false".
            "condition" => {
                let has_true = outgoing
                    .iter()
                    .any(|edge| edge.branch.as_deref() == Some("true"));
                let has_false = outgoing
                    .iter()
                    .any(|edge| edge.branch.as_deref() == Some("false"));
                if !has_true || !has_false {
                    errors.push(format!(
                        "node {node_id}: condition nodes require both 'true' and 'false' branch edges"
                    ));
                }
            }
            // A fork with fewer than two outgoing edges cannot fan out.
            "fork" => {
                if outgoing.len() < 2 {
                    errors.push(format!(
                        "node {node_id}: fork nodes require at least two outgoing edges"
                    ));
                }
            }
            // A join with a single incoming edge is legal but pointless.
            "join" => {
                if incoming < 2 {
                    warnings.push(format!(
                        "node {node_id}: join node has fewer than two incoming edges"
                    ));
                }
            }
            // Map nodes need some source of items to map over.
            "map" => {
                if node.map_policy.items.is_empty()
                    && node.map_policy.item_artifact_kind.is_none()
                    && node.input_contract.input_kinds.is_empty()
                {
                    errors.push(format!(
                        "node {node_id}: map nodes require items, item_artifact_kind, or input_contract.input_kinds"
                    ));
                }
            }
            // Reduce without input kinds consumes everything — warn, don't fail.
            "reduce" => {
                if node.input_contract.input_kinds.is_empty() {
                    warnings.push(format!(
                        "node {node_id}: reduce node has no input_contract.input_kinds; it will consume all available artifacts"
                    ));
                }
            }
            _ => {}
        }
    }

    // Capability ceiling: both the graph policy and each node policy must
    // intersect with the host-provided ceiling.
    if let Some(ceiling) = ceiling {
        if let Err(error) = ceiling.intersect(&graph.capability_policy) {
            errors.push(error);
        }
        for (node_id, node) in &graph.nodes {
            if let Err(error) = ceiling.intersect(&node.capability_policy) {
                errors.push(format!("node {node_id}: {error}"));
            }
        }
    }

    WorkflowValidationReport {
        valid: errors.is_empty(),
        errors,
        warnings,
        reachable_nodes: reachable_nodes.into_iter().collect(),
    }
}
1833
1834fn reachable_nodes(graph: &WorkflowGraph) -> BTreeSet<String> {
1835    let mut seen = BTreeSet::new();
1836    let mut stack = vec![graph.entry.clone()];
1837    while let Some(node_id) = stack.pop() {
1838        if !seen.insert(node_id.clone()) {
1839            continue;
1840        }
1841        for edge in graph.edges.iter().filter(|edge| edge.from == node_id) {
1842            stack.push(edge.to.clone());
1843        }
1844    }
1845    seen
1846}
1847
/// Filters, orders, and budget-truncates artifacts for inclusion in a context.
///
/// Selection happens in three phases:
/// 1. retain only artifacts passing the policy's include/exclude kind filters
///    and stage filter;
/// 2. sort so the best candidates come first: pinned, prioritized kind,
///    higher priority, fresher (when `prefer_fresh`), newer (when
///    `prefer_recent`), more relevant — with estimated token cost (ascending)
///    as the final tiebreak;
/// 3. greedily take artifacts while respecting `max_artifacts` and the token
///    budget (`max_tokens` minus `reserve_tokens`).
pub fn select_artifacts(
    mut artifacts: Vec<ArtifactRecord>,
    policy: &ContextPolicy,
) -> Vec<ArtifactRecord> {
    // Phase 1: empty include lists mean "no restriction".
    artifacts.retain(|artifact| {
        (policy.include_kinds.is_empty() || policy.include_kinds.contains(&artifact.kind))
            && !policy.exclude_kinds.contains(&artifact.kind)
            && (policy.include_stages.is_empty()
                || artifact
                    .stage
                    .as_ref()
                    .is_some_and(|stage| policy.include_stages.contains(stage)))
    });
    // Phase 2: comparisons are written b-vs-a so `true`/larger values sort first.
    artifacts.sort_by(|a, b| {
        let b_pinned = policy.pinned_ids.contains(&b.id);
        let a_pinned = policy.pinned_ids.contains(&a.id);
        b_pinned
            .cmp(&a_pinned)
            .then_with(|| {
                let b_prio_kind = policy.prioritize_kinds.contains(&b.kind);
                let a_prio_kind = policy.prioritize_kinds.contains(&a.kind);
                b_prio_kind.cmp(&a_prio_kind)
            })
            .then_with(|| {
                b.priority
                    .unwrap_or_default()
                    .cmp(&a.priority.unwrap_or_default())
            })
            .then_with(|| {
                if policy.prefer_fresh {
                    freshness_rank(b.freshness.as_deref())
                        .cmp(&freshness_rank(a.freshness.as_deref()))
                } else {
                    std::cmp::Ordering::Equal
                }
            })
            .then_with(|| {
                if policy.prefer_recent {
                    b.created_at.cmp(&a.created_at)
                } else {
                    std::cmp::Ordering::Equal
                }
            })
            .then_with(|| {
                // Incomparable relevance values (e.g. NaN) are treated as a tie.
                b.relevance
                    .partial_cmp(&a.relevance)
                    .unwrap_or(std::cmp::Ordering::Equal)
            })
            .then_with(|| {
                // Cheapest first; artifacts with unknown cost sort last.
                a.estimated_tokens
                    .unwrap_or(usize::MAX)
                    .cmp(&b.estimated_tokens.unwrap_or(usize::MAX))
            })
    });

    // Phase 3: greedy selection under count and token budgets.
    let mut selected = Vec::new();
    let mut used_tokens = 0usize;
    let reserve_tokens = policy.reserve_tokens.unwrap_or(0);
    let effective_max_tokens = policy
        .max_tokens
        .map(|max| max.saturating_sub(reserve_tokens));
    for artifact in artifacts {
        if let Some(max_artifacts) = policy.max_artifacts {
            if selected.len() >= max_artifacts {
                break;
            }
        }
        let next_tokens = artifact.estimated_tokens.unwrap_or(0);
        if let Some(max_tokens) = effective_max_tokens {
            if used_tokens + next_tokens > max_tokens {
                // Skip (not stop): a smaller artifact later in the order may still fit.
                continue;
            }
        }
        used_tokens += next_tokens;
        selected.push(artifact);
    }
    selected
}
1926
1927pub fn render_artifacts_context(artifacts: &[ArtifactRecord], policy: &ContextPolicy) -> String {
1928    let mut parts = Vec::new();
1929    for artifact in artifacts {
1930        let title = artifact
1931            .title
1932            .clone()
1933            .unwrap_or_else(|| format!("{} {}", artifact.kind, artifact.id));
1934        let body = artifact
1935            .text
1936            .clone()
1937            .or_else(|| artifact.data.as_ref().map(|v| v.to_string()))
1938            .unwrap_or_default();
1939        match policy.render.as_deref() {
1940            Some("json") => {
1941                parts.push(
1942                    serde_json::json!({
1943                        "id": artifact.id,
1944                        "kind": artifact.kind,
1945                        "title": title,
1946                        "source": artifact.source,
1947                        "freshness": artifact.freshness,
1948                        "priority": artifact.priority,
1949                        "text": body,
1950                    })
1951                    .to_string(),
1952                );
1953            }
1954            _ => parts.push(format!(
1955                "[{title}] kind={} source={} freshness={} priority={}\n{}",
1956                artifact.kind,
1957                artifact
1958                    .source
1959                    .clone()
1960                    .unwrap_or_else(|| "unknown".to_string()),
1961                artifact
1962                    .freshness
1963                    .clone()
1964                    .unwrap_or_else(|| "normal".to_string()),
1965                artifact.priority.unwrap_or_default(),
1966                body
1967            )),
1968        }
1969    }
1970    parts.join("\n\n")
1971}
1972
1973pub fn normalize_artifact(value: &VmValue) -> Result<ArtifactRecord, VmError> {
1974    let artifact: ArtifactRecord = parse_json_value(value)?;
1975    Ok(artifact.normalize())
1976}
1977
1978pub fn normalize_run_record(value: &VmValue) -> Result<RunRecord, VmError> {
1979    let json = vm_value_to_json(value);
1980    let payload = json.to_string();
1981    let mut deserializer = serde_json::Deserializer::from_str(&payload);
1982    let mut tracker = serde_path_to_error::Track::new();
1983    let path_deserializer = serde_path_to_error::Deserializer::new(&mut deserializer, &mut tracker);
1984    let mut run: RunRecord = RunRecord::deserialize(path_deserializer).map_err(|error| {
1985        let snippet = if payload.len() > 600 {
1986            format!("{}...", &payload[..600])
1987        } else {
1988            payload.clone()
1989        };
1990        VmError::Runtime(format!(
1991            "orchestration parse error at {}: {} | payload={}",
1992            tracker.path(),
1993            error,
1994            snippet
1995        ))
1996    })?;
1997    if run.type_name.is_empty() {
1998        run.type_name = "run_record".to_string();
1999    }
2000    if run.id.is_empty() {
2001        run.id = new_id("run");
2002    }
2003    if run.started_at.is_empty() {
2004        run.started_at = now_rfc3339();
2005    }
2006    if run.status.is_empty() {
2007        run.status = "running".to_string();
2008    }
2009    if run.root_run_id.is_none() {
2010        run.root_run_id = Some(run.id.clone());
2011    }
2012    if run.replay_fixture.is_none() {
2013        run.replay_fixture = Some(replay_fixture_from_run(&run));
2014    }
2015    Ok(run)
2016}
2017
2018pub fn normalize_eval_suite_manifest(value: &VmValue) -> Result<EvalSuiteManifest, VmError> {
2019    let mut manifest: EvalSuiteManifest = parse_json_value(value)?;
2020    if manifest.type_name.is_empty() {
2021        manifest.type_name = "eval_suite_manifest".to_string();
2022    }
2023    if manifest.id.is_empty() {
2024        manifest.id = new_id("eval_suite");
2025    }
2026    Ok(manifest)
2027}
2028
2029fn load_replay_fixture(path: &Path) -> Result<ReplayFixture, VmError> {
2030    let content = std::fs::read_to_string(path)
2031        .map_err(|e| VmError::Runtime(format!("failed to read replay fixture: {e}")))?;
2032    serde_json::from_str(&content)
2033        .map_err(|e| VmError::Runtime(format!("failed to parse replay fixture: {e}")))
2034}
2035
/// Resolves a manifest-relative path string: absolute paths pass through
/// unchanged, relative paths are joined onto `base_dir` when one is given.
fn resolve_manifest_path(base_dir: Option<&Path>, path: &str) -> PathBuf {
    let candidate = PathBuf::from(path);
    if candidate.is_absolute() {
        return candidate;
    }
    match base_dir {
        Some(base) => base.join(candidate),
        None => candidate,
    }
}
2046
/// Runs every case in an eval-suite manifest and aggregates the results.
///
/// For each case the run record is loaded from disk, replayed against its
/// fixture, and optionally diffed against a baseline run; any fixture failure
/// or baseline divergence fails the case. I/O errors (unreadable run,
/// fixture, or baseline file) abort the whole suite with `Err`.
pub fn evaluate_run_suite_manifest(
    manifest: &EvalSuiteManifest,
) -> Result<ReplayEvalSuiteReport, VmError> {
    // Relative case paths resolve against the manifest's base_dir, if set.
    let base_dir = manifest.base_dir.as_deref().map(Path::new);
    let mut reports = Vec::new();
    for case in &manifest.cases {
        let run_path = resolve_manifest_path(base_dir, &case.run_path);
        let run = load_run_record(&run_path)?;
        // Fixture precedence: explicit fixture file > fixture embedded in the
        // run record > fixture synthesized from the run itself.
        let fixture = match &case.fixture_path {
            Some(path) => load_replay_fixture(&resolve_manifest_path(base_dir, path))?,
            None => run
                .replay_fixture
                .clone()
                .unwrap_or_else(|| replay_fixture_from_run(&run)),
        };
        let eval = evaluate_run_against_fixture(&run, &fixture);
        let mut pass = eval.pass;
        let mut failures = eval.failures;
        // Optional baseline comparison: any structural difference fails the case.
        let comparison = match &case.compare_to {
            Some(path) => {
                let baseline_path = resolve_manifest_path(base_dir, path);
                let baseline = load_run_record(&baseline_path)?;
                let diff = diff_run_records(&baseline, &run);
                if !diff.identical {
                    pass = false;
                    failures.push(format!(
                        "run differs from baseline {} with {} stage changes",
                        baseline_path.display(),
                        diff.stage_diffs.len()
                    ));
                }
                Some(diff)
            }
            None => None,
        };
        reports.push(ReplayEvalCaseReport {
            run_id: run.id.clone(),
            workflow_id: run.workflow_id.clone(),
            label: case.label.clone(),
            pass,
            failures,
            stage_count: eval.stage_count,
            source_path: Some(run_path.display().to_string()),
            comparison,
        });
    }
    // Suite passes only when every case passed.
    let total = reports.len();
    let passed = reports.iter().filter(|report| report.pass).count();
    let failed = total.saturating_sub(passed);
    Ok(ReplayEvalSuiteReport {
        pass: failed == 0,
        total,
        passed,
        failed,
        cases: reports,
    })
}
2104
/// Renders a minimal unified-style diff between `before` and `after`.
///
/// Emits `--- a/<path>` / `+++ b/<path>` headers (using "artifact" when no
/// path is given) followed by ` `, `-`, and `+` prefixed lines driven by a
/// longest-common-subsequence alignment. No `@@` hunk headers are produced.
pub fn render_unified_diff(path: Option<&str>, before: &str, after: &str) -> String {
    let old: Vec<&str> = before.lines().collect();
    let new: Vec<&str> = after.lines().collect();

    // lcs[i][j] = length of the longest common subsequence of old[i..] and new[j..].
    let mut lcs = vec![vec![0usize; new.len() + 1]; old.len() + 1];
    for i in (0..old.len()).rev() {
        for j in (0..new.len()).rev() {
            lcs[i][j] = if old[i] == new[j] {
                lcs[i + 1][j + 1] + 1
            } else {
                lcs[i + 1][j].max(lcs[i][j + 1])
            };
        }
    }

    let label = path.unwrap_or("artifact");
    let mut out = format!("--- a/{label}\n+++ b/{label}\n");
    let (mut i, mut j) = (0, 0);
    while i < old.len() && j < new.len() {
        if old[i] == new[j] {
            // Common line: emit as context and advance both cursors.
            out.push_str(&format!(" {}\n", old[i]));
            i += 1;
            j += 1;
        } else if lcs[i + 1][j] >= lcs[i][j + 1] {
            // Dropping the old line keeps the longer common tail.
            out.push_str(&format!("-{}\n", old[i]));
            i += 1;
        } else {
            out.push_str(&format!("+{}\n", new[j]));
            j += 1;
        }
    }
    // One side is exhausted: the remainder is pure removals or additions.
    for line in &old[i..] {
        out.push_str(&format!("-{line}\n"));
    }
    for line in &new[j..] {
        out.push_str(&format!("+{line}\n"));
    }
    out
}
2147
2148pub fn save_run_record(run: &RunRecord, path: Option<&str>) -> Result<String, VmError> {
2149    let path = path
2150        .map(PathBuf::from)
2151        .unwrap_or_else(|| default_run_dir().join(format!("{}.json", run.id)));
2152    if let Some(parent) = path.parent() {
2153        std::fs::create_dir_all(parent)
2154            .map_err(|e| VmError::Runtime(format!("failed to create run directory: {e}")))?;
2155    }
2156    let json = serde_json::to_string_pretty(run)
2157        .map_err(|e| VmError::Runtime(format!("failed to encode run record: {e}")))?;
2158    // Atomic write: write to .tmp then rename to prevent corruption on kill.
2159    let tmp_path = path.with_extension("json.tmp");
2160    std::fs::write(&tmp_path, &json)
2161        .map_err(|e| VmError::Runtime(format!("failed to persist run record: {e}")))?;
2162    std::fs::rename(&tmp_path, &path).map_err(|e| {
2163        // Fallback: if rename fails (cross-device), write directly.
2164        let _ = std::fs::write(&path, &json);
2165        VmError::Runtime(format!("failed to finalize run record: {e}"))
2166    })?;
2167    Ok(path.to_string_lossy().to_string())
2168}
2169
2170pub fn load_run_record(path: &Path) -> Result<RunRecord, VmError> {
2171    let content = std::fs::read_to_string(path)
2172        .map_err(|e| VmError::Runtime(format!("failed to read run record: {e}")))?;
2173    serde_json::from_str(&content)
2174        .map_err(|e| VmError::Runtime(format!("failed to parse run record: {e}")))
2175}
2176
/// Derives a replay fixture from a completed run: one stage assertion per
/// recorded stage, pinning that stage's status, outcome, branch, produced
/// artifact kinds, and (when present and non-empty) the first 80 characters
/// of its visible text.
pub fn replay_fixture_from_run(run: &RunRecord) -> ReplayFixture {
    ReplayFixture {
        type_name: "replay_fixture".to_string(),
        id: new_id("fixture"),
        source_run_id: run.id.clone(),
        workflow_id: run.workflow_id.clone(),
        workflow_name: run.workflow_name.clone(),
        created_at: now_rfc3339(),
        expected_status: run.status.clone(),
        stage_assertions: run
            .stages
            .iter()
            .map(|stage| ReplayStageAssertion {
                node_id: stage.node_id.clone(),
                expected_status: stage.status.clone(),
                expected_outcome: stage.outcome.clone(),
                expected_branch: stage.branch.clone(),
                required_artifact_kinds: stage
                    .artifacts
                    .iter()
                    .map(|artifact| artifact.kind.clone())
                    .collect(),
                // Only assert on visible text when the stage produced some;
                // truncate to 80 chars to keep fixtures small.
                visible_text_contains: stage
                    .visible_text
                    .as_ref()
                    .filter(|text| !text.is_empty())
                    .map(|text| text.chars().take(80).collect()),
            })
            .collect(),
    }
}
2208
/// Replays a run against a fixture, collecting every mismatch as a failure
/// message; the report passes only when no failures were recorded.
///
/// Checks: overall run status, then per stage-assertion — stage presence,
/// status, outcome, branch, required artifact kinds, and (if the fixture
/// pinned one) a visible-text snippet the stage output must contain.
pub fn evaluate_run_against_fixture(run: &RunRecord, fixture: &ReplayFixture) -> ReplayEvalReport {
    let mut failures = Vec::new();
    if run.status != fixture.expected_status {
        failures.push(format!(
            "run status mismatch: expected {}, got {}",
            fixture.expected_status, run.status
        ));
    }
    for assertion in &fixture.stage_assertions {
        // A missing stage is one failure; remaining checks for it are skipped.
        let Some(stage) = run
            .stages
            .iter()
            .find(|stage| stage.node_id == assertion.node_id)
        else {
            failures.push(format!("missing stage {}", assertion.node_id));
            continue;
        };
        if stage.status != assertion.expected_status {
            failures.push(format!(
                "stage {} status mismatch: expected {}, got {}",
                assertion.node_id, assertion.expected_status, stage.status
            ));
        }
        if stage.outcome != assertion.expected_outcome {
            failures.push(format!(
                "stage {} outcome mismatch: expected {}, got {}",
                assertion.node_id, assertion.expected_outcome, stage.outcome
            ));
        }
        if stage.branch != assertion.expected_branch {
            failures.push(format!(
                "stage {} branch mismatch: expected {:?}, got {:?}",
                assertion.node_id, assertion.expected_branch, stage.branch
            ));
        }
        // Every required artifact kind must appear at least once.
        for required_kind in &assertion.required_artifact_kinds {
            if !stage
                .artifacts
                .iter()
                .any(|artifact| &artifact.kind == required_kind)
            {
                failures.push(format!(
                    "stage {} missing artifact kind {}",
                    assertion.node_id, required_kind
                ));
            }
        }
        // Substring containment check; an absent visible_text counts as "".
        if let Some(snippet) = &assertion.visible_text_contains {
            let actual = stage.visible_text.clone().unwrap_or_default();
            if !actual.contains(snippet) {
                failures.push(format!(
                    "stage {} visible text does not contain expected snippet {:?}",
                    assertion.node_id, snippet
                ));
            }
        }
    }

    ReplayEvalReport {
        pass: failures.is_empty(),
        failures,
        stage_count: run.stages.len(),
    }
}
2273
2274pub fn evaluate_run_suite(
2275    cases: Vec<(RunRecord, ReplayFixture, Option<String>)>,
2276) -> ReplayEvalSuiteReport {
2277    let mut reports = Vec::new();
2278    for (run, fixture, source_path) in cases {
2279        let report = evaluate_run_against_fixture(&run, &fixture);
2280        reports.push(ReplayEvalCaseReport {
2281            run_id: run.id.clone(),
2282            workflow_id: run.workflow_id.clone(),
2283            label: None,
2284            pass: report.pass,
2285            failures: report.failures,
2286            stage_count: report.stage_count,
2287            source_path,
2288            comparison: None,
2289        });
2290    }
2291    let total = reports.len();
2292    let passed = reports.iter().filter(|report| report.pass).count();
2293    let failed = total.saturating_sub(passed);
2294    ReplayEvalSuiteReport {
2295        pass: failed == 0,
2296        total,
2297        passed,
2298        failed,
2299        cases: reports,
2300    }
2301}
2302
/// Structurally compares two run records stage-by-stage.
///
/// Each node id appearing in either run yields at most one stage diff:
/// "removed" (only in left), "added" (only in right), or "changed" with a
/// detail line per differing field (status, outcome, branch, artifact
/// counts). `identical` additionally requires matching run status and equal
/// transition/artifact/checkpoint counts. The `*_delta` fields are
/// right-minus-left counts.
pub fn diff_run_records(left: &RunRecord, right: &RunRecord) -> RunDiffReport {
    let mut stage_diffs = Vec::new();
    // Union of node ids from both runs, in sorted order.
    let mut all_node_ids = BTreeSet::new();
    all_node_ids.extend(left.stages.iter().map(|stage| stage.node_id.clone()));
    all_node_ids.extend(right.stages.iter().map(|stage| stage.node_id.clone()));

    for node_id in all_node_ids {
        let left_stage = left.stages.iter().find(|stage| stage.node_id == node_id);
        let right_stage = right.stages.iter().find(|stage| stage.node_id == node_id);
        match (left_stage, right_stage) {
            (Some(_), None) => stage_diffs.push(RunStageDiffRecord {
                node_id,
                change: "removed".to_string(),
                details: vec!["stage missing from right run".to_string()],
            }),
            (None, Some(_)) => stage_diffs.push(RunStageDiffRecord {
                node_id,
                change: "added".to_string(),
                details: vec!["stage missing from left run".to_string()],
            }),
            (Some(left_stage), Some(right_stage)) => {
                // Present in both: record one detail line per changed field.
                let mut details = Vec::new();
                if left_stage.status != right_stage.status {
                    details.push(format!(
                        "status: {} -> {}",
                        left_stage.status, right_stage.status
                    ));
                }
                if left_stage.outcome != right_stage.outcome {
                    details.push(format!(
                        "outcome: {} -> {}",
                        left_stage.outcome, right_stage.outcome
                    ));
                }
                if left_stage.branch != right_stage.branch {
                    details.push(format!(
                        "branch: {:?} -> {:?}",
                        left_stage.branch, right_stage.branch
                    ));
                }
                // Artifact comparisons are count-based only, not content-based.
                if left_stage.produced_artifact_ids.len() != right_stage.produced_artifact_ids.len()
                {
                    details.push(format!(
                        "produced_artifacts: {} -> {}",
                        left_stage.produced_artifact_ids.len(),
                        right_stage.produced_artifact_ids.len()
                    ));
                }
                if left_stage.artifacts.len() != right_stage.artifacts.len() {
                    details.push(format!(
                        "artifact_records: {} -> {}",
                        left_stage.artifacts.len(),
                        right_stage.artifacts.len()
                    ));
                }
                if !details.is_empty() {
                    stage_diffs.push(RunStageDiffRecord {
                        node_id,
                        change: "changed".to_string(),
                        details,
                    });
                }
            }
            (None, None) => {}
        }
    }

    let status_changed = left.status != right.status;
    let identical = !status_changed
        && stage_diffs.is_empty()
        && left.transitions.len() == right.transitions.len()
        && left.artifacts.len() == right.artifacts.len()
        && left.checkpoints.len() == right.checkpoints.len();

    RunDiffReport {
        left_run_id: left.id.clone(),
        right_run_id: right.id.clone(),
        identical,
        status_changed,
        left_status: left.status.clone(),
        right_status: right.status.clone(),
        stage_diffs,
        transition_count_delta: right.transitions.len() as isize - left.transitions.len() as isize,
        artifact_count_delta: right.artifacts.len() as isize - left.artifacts.len() as isize,
        checkpoint_count_delta: right.checkpoints.len() as isize - left.checkpoints.len() as isize,
    }
}
2390
/// Pushes `policy` onto this thread's execution-policy stack; the most
/// recently pushed policy is the one returned by [`current_execution_policy`].
pub fn push_execution_policy(policy: CapabilityPolicy) {
    EXECUTION_POLICY_STACK.with(|stack| stack.borrow_mut().push(policy));
}
2394
/// Pops the most recently pushed policy from this thread's stack
/// (a no-op when the stack is empty).
pub fn pop_execution_policy() {
    EXECUTION_POLICY_STACK.with(|stack| {
        stack.borrow_mut().pop();
    });
}
2400
/// Returns a clone of the top-of-stack policy for this thread, or `None`
/// when no policy has been pushed.
pub fn current_execution_policy() -> Option<CapabilityPolicy> {
    EXECUTION_POLICY_STACK.with(|stack| stack.borrow().last().cloned())
}
2404
/// Looks up per-tool runtime metadata declared on the current execution
/// policy, if both the policy and the entry for `tool` exist.
pub fn current_tool_metadata(tool: &str) -> Option<ToolRuntimePolicyMetadata> {
    current_execution_policy().and_then(|policy| policy.tool_metadata.get(tool).cloned())
}
2408
/// An empty tool allow-list means "all tools allowed"; otherwise `tool`
/// must appear in the list by exact name.
fn policy_allows_tool(policy: &CapabilityPolicy, tool: &str) -> bool {
    policy.tools.is_empty() || policy.tools.iter().any(|allowed| allowed == tool)
}
2412
/// An empty capability map allows everything. Otherwise `capability` must be
/// present, and its op list must be empty (meaning all ops) or contain `op`.
fn policy_allows_capability(policy: &CapabilityPolicy, capability: &str, op: &str) -> bool {
    policy.capabilities.is_empty()
        || policy
            .capabilities
            .get(capability)
            .is_some_and(|ops| ops.is_empty() || ops.iter().any(|allowed| allowed == op))
}
2420
2421fn policy_allows_side_effect(policy: &CapabilityPolicy, requested: &str) -> bool {
2422    fn rank(v: &str) -> usize {
2423        match v {
2424            "none" => 0,
2425            "read_only" => 1,
2426            "workspace_write" => 2,
2427            "process_exec" => 3,
2428            "network" => 4,
2429            _ => 5,
2430        }
2431    }
2432    policy
2433        .side_effect_level
2434        .as_ref()
2435        .map(|allowed| rank(allowed) >= rank(requested))
2436        .unwrap_or(true)
2437}
2438
/// Builds the standard policy-rejection error, categorized as
/// `ToolRejected` (always returns `Err`).
fn reject_policy(reason: String) -> Result<(), VmError> {
    Err(VmError::CategorizedError {
        message: reason,
        category: crate::value::ErrorCategory::ToolRejected,
    })
}
2445
/// Heuristically classifies a tool's mutation impact from its name alone,
/// used when the policy metadata declares no explicit classification.
///
/// Checks run in priority order on the lowercased name: MCP tools are
/// host-defined; shell/exec-style tools are ambient side effects; delete/
/// remove/move/rename prefixes are destructive; write/edit/create-style
/// names apply workspace changes; everything else is read-only.
fn fallback_mutation_classification(tool_name: &str) -> String {
    let name = tool_name.to_ascii_lowercase();
    let classification = if name.starts_with("mcp_") {
        "host_defined"
    } else if matches!(name.as_str(), "exec" | "shell" | "exec_at" | "shell_at" | "run")
        || name.starts_with("run_")
    {
        "ambient_side_effect"
    } else if ["delete", "remove", "move", "rename"]
        .iter()
        .any(|prefix| name.starts_with(*prefix))
    {
        "destructive"
    } else if ["write", "edit", "patch", "create", "scaffold"]
        .iter()
        .any(|needle| name.contains(*needle))
        || name.starts_with("insert")
        || name.starts_with("replace")
        || name == "add_import"
    {
        "apply_workspace"
    } else {
        "read_only"
    };
    classification.to_string()
}
2480
/// Mutation classification for `tool_name`: the value declared in the
/// current policy's tool metadata when present, otherwise the name-based
/// heuristic from `fallback_mutation_classification`.
pub fn current_tool_mutation_classification(tool_name: &str) -> String {
    current_tool_metadata(tool_name)
        .and_then(|metadata| metadata.mutation_classification)
        .unwrap_or_else(|| fallback_mutation_classification(tool_name))
}
2486
2487pub fn current_tool_declared_paths(tool_name: &str, args: &serde_json::Value) -> Vec<String> {
2488    let Some(map) = args.as_object() else {
2489        return Vec::new();
2490    };
2491    let path_keys = current_tool_metadata(tool_name)
2492        .map(|metadata| metadata.path_params)
2493        .filter(|keys| !keys.is_empty())
2494        .unwrap_or_else(|| {
2495            vec![
2496                "path".to_string(),
2497                "file".to_string(),
2498                "cwd".to_string(),
2499                "repo".to_string(),
2500                "target".to_string(),
2501                "destination".to_string(),
2502            ]
2503        });
2504    let mut paths = Vec::new();
2505    for key in path_keys {
2506        if let Some(value) = map.get(&key).and_then(|value| value.as_str()) {
2507            if !value.is_empty() {
2508                paths.push(value.to_string());
2509            }
2510        }
2511    }
2512    if let Some(items) = map.get("paths").and_then(|value| value.as_array()) {
2513        for item in items {
2514            if let Some(value) = item.as_str() {
2515                if !value.is_empty() {
2516                    paths.push(value.to_string());
2517                }
2518            }
2519        }
2520    }
2521    paths.sort();
2522    paths.dedup();
2523    paths
2524}
2525
/// Check a builtin invocation against the currently active execution policy.
///
/// Returns `Ok(())` when no policy is installed or the builtin falls within
/// the policy's tool/capability/side-effect ceilings; otherwise returns the
/// categorized rejection produced by `reject_policy`. Builtins not matched by
/// any arm below pass through unchecked.
pub fn enforce_current_policy_for_builtin(name: &str, args: &[VmValue]) -> Result<(), VmError> {
    let Some(policy) = current_execution_policy() else {
        return Ok(());
    };
    match name {
        // Read-style builtins need both the tool grant and workspace read.
        "read" | "read_file" => {
            if !policy_allows_tool(&policy, name)
                || !policy_allows_capability(&policy, "workspace", "read_text")
            {
                return reject_policy(format!(
                    "builtin '{name}' exceeds workspace.read_text ceiling"
                ));
            }
        }
        "search" | "list_dir" => {
            if !policy_allows_tool(&policy, name)
                || !policy_allows_capability(&policy, "workspace", "list")
            {
                return reject_policy(format!("builtin '{name}' exceeds workspace.list ceiling"));
            }
        }
        // Existence probes only need the capability, not a tool grant.
        "file_exists" | "stat" => {
            if !policy_allows_capability(&policy, "workspace", "exists") {
                return reject_policy(format!("builtin '{name}' exceeds workspace.exists ceiling"));
            }
        }
        // All write-style builtins are gated behind the 'edit' tool grant
        // plus the workspace_write side-effect ceiling.
        "edit" | "write_file" | "append_file" | "mkdir" | "copy_file" => {
            if !policy_allows_tool(&policy, "edit")
                || !policy_allows_capability(&policy, "workspace", "write_text")
                || !policy_allows_side_effect(&policy, "workspace_write")
            {
                return reject_policy(format!("builtin '{name}' exceeds workspace write ceiling"));
            }
        }
        "delete_file" => {
            if !policy_allows_capability(&policy, "workspace", "delete")
                || !policy_allows_side_effect(&policy, "workspace_write")
            {
                return reject_policy(
                    "builtin 'delete_file' exceeds workspace.delete ceiling".to_string(),
                );
            }
        }
        "apply_edit" => {
            if !policy_allows_capability(&policy, "workspace", "apply_edit")
                || !policy_allows_side_effect(&policy, "workspace_write")
            {
                return reject_policy(
                    "builtin 'apply_edit' exceeds workspace.apply_edit ceiling".to_string(),
                );
            }
        }
        // Process execution requires the 'run' tool grant, process.exec
        // capability, and the process_exec side-effect ceiling.
        "exec" | "exec_at" | "shell" | "shell_at" | "run_command" => {
            if !policy_allows_tool(&policy, "run")
                || !policy_allows_capability(&policy, "process", "exec")
                || !policy_allows_side_effect(&policy, "process_exec")
            {
                return reject_policy(format!("builtin '{name}' exceeds process.exec ceiling"));
            }
        }
        // Network builtins are gated only by the side-effect ceiling.
        "http_get" | "http_post" | "http_put" | "http_patch" | "http_delete" | "http_request" => {
            if !policy_allows_side_effect(&policy, "network") {
                return reject_policy(format!("builtin '{name}' exceeds network ceiling"));
            }
        }
        // MCP builtins spawn server processes, so they are treated exactly
        // like process execution — this closes an escape hatch where MCP
        // tools could otherwise bypass the process ceiling.
        "mcp_connect"
        | "mcp_call"
        | "mcp_list_tools"
        | "mcp_list_resources"
        | "mcp_list_resource_templates"
        | "mcp_read_resource"
        | "mcp_list_prompts"
        | "mcp_get_prompt"
        | "mcp_server_info"
        | "mcp_disconnect" => {
            if !policy_allows_tool(&policy, "run")
                || !policy_allows_capability(&policy, "process", "exec")
                || !policy_allows_side_effect(&policy, "process_exec")
            {
                return reject_policy(format!("builtin '{name}' exceeds process.exec ceiling"));
            }
        }
        "host_call" => {
            // NOTE: this deliberately shadows the outer `name` with the
            // host-call target (first argument), so the error messages below
            // report the capability name, not "host_call".
            let name = args.first().map(|v| v.display()).unwrap_or_default();
            let Some((capability, op)) = name.split_once('.') else {
                return reject_policy(format!(
                    "host_call '{name}' must use capability.operation naming"
                ));
            };
            if !policy_allows_capability(&policy, capability, op) {
                return reject_policy(format!(
                    "host_call {capability}.{op} exceeds capability ceiling"
                ));
            }
            // Map the capability/operation pair onto the side-effect level
            // it implies; anything unrecognized is treated as read-only.
            let requested_side_effect = match (capability, op) {
                ("workspace", "write_text" | "apply_edit" | "delete") => "workspace_write",
                ("process", "exec") => "process_exec",
                _ => "read_only",
            };
            if !policy_allows_side_effect(&policy, requested_side_effect) {
                return reject_policy(format!(
                    "host_call {capability}.{op} exceeds side-effect ceiling"
                ));
            }
        }
        _ => {}
    }
    Ok(())
}
2635
2636pub fn enforce_current_policy_for_bridge_builtin(name: &str) -> Result<(), VmError> {
2637    if current_execution_policy().is_some() {
2638        return reject_policy(format!(
2639            "bridged builtin '{name}' exceeds execution policy; declare an explicit capability/tool surface instead"
2640        ));
2641    }
2642    Ok(())
2643}
2644
2645pub fn enforce_current_policy_for_tool(tool_name: &str) -> Result<(), VmError> {
2646    let Some(policy) = current_execution_policy() else {
2647        return Ok(());
2648    };
2649    if !policy_allows_tool(&policy, tool_name) {
2650        return reject_policy(format!("tool '{tool_name}' exceeds tool ceiling"));
2651    }
2652    if let Some(metadata) = policy.tool_metadata.get(tool_name) {
2653        for (capability, ops) in &metadata.capabilities {
2654            for op in ops {
2655                if !policy_allows_capability(&policy, capability, op) {
2656                    return reject_policy(format!(
2657                        "tool '{tool_name}' exceeds capability ceiling: {capability}.{op}"
2658                    ));
2659                }
2660            }
2661        }
2662        if let Some(side_effect_level) = metadata.side_effect_level.as_deref() {
2663            if !policy_allows_side_effect(&policy, side_effect_level) {
2664                return reject_policy(format!(
2665                    "tool '{tool_name}' exceeds side-effect ceiling: {side_effect_level}"
2666                ));
2667            }
2668        }
2669    }
2670    Ok(())
2671}
2672
2673fn compact_transcript(transcript: &VmValue, keep_last: usize) -> Option<VmValue> {
2674    let dict = transcript.as_dict()?;
2675    let messages = match dict.get("messages") {
2676        Some(VmValue::List(list)) => list.iter().cloned().collect::<Vec<_>>(),
2677        _ => Vec::new(),
2678    };
2679    let retained = messages
2680        .into_iter()
2681        .rev()
2682        .take(keep_last)
2683        .collect::<Vec<_>>()
2684        .into_iter()
2685        .rev()
2686        .collect::<Vec<_>>();
2687    let mut compacted = dict.clone();
2688    compacted.insert(
2689        "messages".to_string(),
2690        VmValue::List(Rc::new(retained.clone())),
2691    );
2692    compacted.insert(
2693        "events".to_string(),
2694        VmValue::List(Rc::new(
2695            crate::llm::helpers::transcript_events_from_messages(&retained),
2696        )),
2697    );
2698    Some(VmValue::Dict(Rc::new(compacted)))
2699}
2700
2701fn redact_transcript_visibility(transcript: &VmValue, visibility: Option<&str>) -> Option<VmValue> {
2702    let Some(visibility) = visibility else {
2703        return Some(transcript.clone());
2704    };
2705    if visibility != "public" && visibility != "public_only" {
2706        return Some(transcript.clone());
2707    }
2708    let dict = transcript.as_dict()?;
2709    let public_messages = match dict.get("messages") {
2710        Some(VmValue::List(list)) => list
2711            .iter()
2712            .filter(|message| {
2713                message
2714                    .as_dict()
2715                    .and_then(|d| d.get("role"))
2716                    .map(|v| v.display())
2717                    .map(|role| role != "tool_result")
2718                    .unwrap_or(true)
2719            })
2720            .cloned()
2721            .collect::<Vec<_>>(),
2722        _ => Vec::new(),
2723    };
2724    let public_events = match dict.get("events") {
2725        Some(VmValue::List(list)) => list
2726            .iter()
2727            .filter(|event| {
2728                event
2729                    .as_dict()
2730                    .and_then(|d| d.get("visibility"))
2731                    .map(|v| v.display())
2732                    .map(|value| value == "public")
2733                    .unwrap_or(true)
2734            })
2735            .cloned()
2736            .collect::<Vec<_>>(),
2737        _ => Vec::new(),
2738    };
2739    let mut redacted = dict.clone();
2740    redacted.insert(
2741        "messages".to_string(),
2742        VmValue::List(Rc::new(public_messages)),
2743    );
2744    redacted.insert("events".to_string(), VmValue::List(Rc::new(public_events)));
2745    Some(VmValue::Dict(Rc::new(redacted)))
2746}
2747
2748pub(crate) fn apply_input_transcript_policy(
2749    transcript: Option<VmValue>,
2750    policy: &TranscriptPolicy,
2751) -> Option<VmValue> {
2752    let mut transcript = transcript;
2753    match policy.mode.as_deref() {
2754        Some("reset") => return None,
2755        Some("fork") => {
2756            if let Some(VmValue::Dict(dict)) = transcript.as_ref() {
2757                let mut forked = dict.as_ref().clone();
2758                forked.insert(
2759                    "id".to_string(),
2760                    VmValue::String(Rc::from(new_id("transcript"))),
2761                );
2762                transcript = Some(VmValue::Dict(Rc::new(forked)));
2763            }
2764        }
2765        _ => {}
2766    }
2767    if policy.compact {
2768        let keep_last = policy.keep_last.unwrap_or(6);
2769        transcript = transcript.and_then(|value| compact_transcript(&value, keep_last));
2770    }
2771    transcript
2772}
2773
2774fn apply_output_transcript_policy(
2775    transcript: Option<VmValue>,
2776    policy: &TranscriptPolicy,
2777) -> Option<VmValue> {
2778    let mut transcript = transcript;
2779    if policy.compact {
2780        let keep_last = policy.keep_last.unwrap_or(6);
2781        transcript = transcript.and_then(|value| compact_transcript(&value, keep_last));
2782    }
2783    transcript.and_then(|value| redact_transcript_visibility(&value, policy.visibility.as_deref()))
2784}
2785
/// Execute a single workflow node and return its result payload, the
/// artifact it produced, and the (policy-filtered) output transcript.
///
/// Flow: select input artifacts per the node's context policy, enforce the
/// input contract (transcript requirement, min/max artifact counts), build
/// the prompt, then either run the node's `verify` shell command or invoke
/// the LLM (single call or agent loop), and finally wrap the result into a
/// normalized `ArtifactRecord`.
///
/// # Errors
/// Returns `VmError::Runtime` for contract violations and verify-exec
/// failures; LLM errors propagate from the underlying call.
pub async fn execute_stage_node(
    node_id: &str,
    node: &WorkflowNode,
    task: &str,
    artifacts: &[ArtifactRecord],
    transcript: Option<VmValue>,
) -> Result<(serde_json::Value, Vec<ArtifactRecord>, Option<VmValue>), VmError> {
    // When the node declares input kinds but its context policy doesn't,
    // borrow the contract's kinds for artifact selection.
    let mut selection_policy = node.context_policy.clone();
    if selection_policy.include_kinds.is_empty() && !node.input_contract.input_kinds.is_empty() {
        selection_policy.include_kinds = node.input_contract.input_kinds.clone();
    }
    let selected = select_artifacts_adaptive(artifacts.to_vec(), &selection_policy);
    let rendered_context = render_artifacts_context(&selected, &node.context_policy);
    // Input transcript policy (reset/fork/compact) runs BEFORE the contract
    // check, so a "reset" node legitimately fails require_transcript.
    let transcript = apply_input_transcript_policy(transcript, &node.transcript_policy);
    if node.input_contract.require_transcript && transcript.is_none() {
        return Err(VmError::Runtime(format!(
            "workflow stage {node_id} requires transcript input"
        )));
    }
    if let Some(min_inputs) = node.input_contract.min_inputs {
        if selected.len() < min_inputs {
            return Err(VmError::Runtime(format!(
                "workflow stage {node_id} requires at least {min_inputs} input artifacts"
            )));
        }
    }
    if let Some(max_inputs) = node.input_contract.max_inputs {
        if selected.len() > max_inputs {
            return Err(VmError::Runtime(format!(
                "workflow stage {node_id} accepts at most {max_inputs} input artifacts"
            )));
        }
    }
    // Prefix the task with the rendered artifact context when present.
    let prompt = if rendered_context.is_empty() {
        task.to_string()
    } else {
        format!(
            "{rendered_context}\n\n{}:\n{task}",
            node.task_label
                .clone()
                .unwrap_or_else(|| "Task".to_string())
        )
    };

    // Tool-calling wire format; defaults to "text" when the env var is
    // unset or blank.
    let tool_format = std::env::var("HARN_AGENT_TOOL_FORMAT")
        .ok()
        .filter(|value| !value.trim().is_empty())
        .unwrap_or_else(|| "text".to_string());
    let mut llm_result = if node.kind == "verify" {
        // Verify nodes run a shell command instead of calling the LLM.
        if let Some(command) = node
            .verify
            .as_ref()
            .and_then(|verify| verify.as_object())
            .and_then(|verify| verify.get("command"))
            .and_then(|value| value.as_str())
            .map(str::trim)
            .filter(|value| !value.is_empty())
        {
            // Platform-appropriate shell: cmd /C on Windows, sh -lc elsewhere.
            let mut process = if cfg!(target_os = "windows") {
                let mut cmd = tokio::process::Command::new("cmd");
                cmd.arg("/C").arg(command);
                cmd
            } else {
                let mut cmd = tokio::process::Command::new("/bin/sh");
                cmd.arg("-lc").arg(command);
                cmd
            };
            // No stdin: verify commands must not block waiting for input.
            process.stdin(std::process::Stdio::null());
            // Inherit cwd/env from the VM's execution context when set.
            if let Some(context) = crate::stdlib::process::current_execution_context() {
                if let Some(cwd) = context.cwd.filter(|cwd| !cwd.is_empty()) {
                    process.current_dir(cwd);
                }
                if !context.env.is_empty() {
                    process.envs(context.env);
                }
            }
            let output = process
                .output()
                .await
                .map_err(|e| VmError::Runtime(format!("workflow verify exec failed: {e}")))?;
            let stdout = String::from_utf8_lossy(&output.stdout).to_string();
            let stderr = String::from_utf8_lossy(&output.stderr).to_string();
            // Merge streams for the visible text, stderr after stdout.
            let combined = if stderr.is_empty() {
                stdout.clone()
            } else if stdout.is_empty() {
                stderr.clone()
            } else {
                format!("{stdout}\n{stderr}")
            };
            serde_json::json!({
                "status": "completed",
                "text": combined,
                "visible_text": combined,
                "command": command,
                "stdout": stdout,
                "stderr": stderr,
                // -1 when the process was terminated by a signal (no code).
                "exit_status": output.status.code().unwrap_or(-1),
                "success": output.status.success(),
            })
        } else {
            // Verify node without a command: trivially completed, empty text.
            serde_json::json!({
                "status": "completed",
                "text": "",
                "visible_text": "",
            })
        }
    } else {
        // Build LLM call options from the node's model policy; only fields
        // that are actually set are forwarded.
        let mut options = BTreeMap::new();
        if let Some(provider) = &node.model_policy.provider {
            options.insert(
                "provider".to_string(),
                VmValue::String(Rc::from(provider.clone())),
            );
        }
        if let Some(model) = &node.model_policy.model {
            options.insert(
                "model".to_string(),
                VmValue::String(Rc::from(model.clone())),
            );
        }
        if let Some(model_tier) = &node.model_policy.model_tier {
            options.insert(
                "model_tier".to_string(),
                VmValue::String(Rc::from(model_tier.clone())),
            );
        }
        if let Some(temperature) = node.model_policy.temperature {
            options.insert("temperature".to_string(), VmValue::Float(temperature));
        }
        if let Some(max_tokens) = node.model_policy.max_tokens {
            options.insert("max_tokens".to_string(), VmValue::Int(max_tokens));
        }
        let tool_names = workflow_tool_names(&node.tools);
        // Only forward tools when the spec is non-null AND names resolved.
        if !matches!(node.tools, serde_json::Value::Null) && !tool_names.is_empty() {
            options.insert(
                "tools".to_string(),
                crate::stdlib::json_to_vm_value(&node.tools),
            );
        }
        if let Some(transcript) = transcript.clone() {
            options.insert("transcript".to_string(), transcript);
        }

        // Positional args expected by extract_llm_options:
        // (prompt, system-or-nil, options dict).
        let args = vec![
            VmValue::String(Rc::from(prompt.clone())),
            node.system
                .clone()
                .map(|s| VmValue::String(Rc::from(s)))
                .unwrap_or(VmValue::Nil),
            VmValue::Dict(Rc::new(options)),
        ];
        let mut opts = extract_llm_options(&args)?;

        // Agent mode (explicit, or implied by having tools) runs the full
        // agent loop; otherwise a single LLM round trip.
        if node.mode.as_deref() == Some("agent") || !tool_names.is_empty() {
            crate::llm::run_agent_loop_internal(
                &mut opts,
                crate::llm::AgentLoopConfig {
                    persistent: true,
                    max_iterations: 12,
                    max_nudges: 3,
                    nudge: None,
                    done_sentinel: node.done_sentinel.clone(),
                    break_unless_phase: None,
                    tool_retries: 0,
                    tool_backoff_ms: 1000,
                    tool_format: tool_format.clone(),
                    auto_compact: None,
                    context_callback: None,
                    policy: None,
                    daemon: false,
                    llm_retries: 2,
                    llm_backoff_ms: 2000,
                },
            )
            .await?
        } else {
            let result = vm_call_llm_full(&opts).await?;
            crate::llm::agent_loop_result_from_llm(&result, opts)
        }
    };
    // Annotate the result payload with provenance for auditing/debugging.
    if let Some(payload) = llm_result.as_object_mut() {
        payload.insert("prompt".to_string(), serde_json::json!(prompt));
        payload.insert(
            "system_prompt".to_string(),
            serde_json::json!(node.system.clone().unwrap_or_default()),
        );
        payload.insert(
            "rendered_context".to_string(),
            serde_json::json!(rendered_context),
        );
        payload.insert(
            "selected_artifact_ids".to_string(),
            serde_json::json!(selected
                .iter()
                .map(|artifact| artifact.id.clone())
                .collect::<Vec<_>>()),
        );
        payload.insert(
            "selected_artifact_titles".to_string(),
            serde_json::json!(selected
                .iter()
                .map(|artifact| artifact.title.clone())
                .collect::<Vec<_>>()),
        );
        payload.insert(
            "tool_calling_mode".to_string(),
            serde_json::json!(tool_format.clone()),
        );
    }

    let visible_text = llm_result["text"].as_str().unwrap_or_default().to_string();
    // Output transcript comes back as JSON; convert and apply the policy.
    let transcript = llm_result
        .get("transcript")
        .cloned()
        .map(|value| crate::stdlib::json_to_vm_value(&value));
    let transcript = apply_output_transcript_policy(transcript, &node.transcript_policy);
    // Output kind: first contract kind, else a default based on node kind.
    let output_kind = node
        .output_contract
        .output_kinds
        .first()
        .cloned()
        .unwrap_or_else(|| {
            if node.kind == "verify" {
                "verification_result".to_string()
            } else {
                "artifact".to_string()
            }
        });
    let mut metadata = BTreeMap::new();
    metadata.insert(
        "input_artifact_ids".to_string(),
        serde_json::json!(selected
            .iter()
            .map(|artifact| artifact.id.clone())
            .collect::<Vec<_>>()),
    );
    metadata.insert("node_kind".to_string(), serde_json::json!(node.kind));
    // Wrap the stage output as an artifact; lineage records the inputs that
    // fed this stage.
    let artifact = ArtifactRecord {
        type_name: "artifact".to_string(),
        id: new_id("artifact"),
        kind: output_kind,
        title: Some(format!("stage {node_id} output")),
        text: Some(visible_text),
        data: Some(llm_result.clone()),
        source: Some(node_id.to_string()),
        created_at: now_rfc3339(),
        freshness: Some("fresh".to_string()),
        priority: None,
        lineage: selected
            .iter()
            .map(|artifact| artifact.id.clone())
            .collect(),
        relevance: Some(1.0),
        estimated_tokens: None,
        stage: Some(node_id.to_string()),
        metadata,
    }
    .normalize();

    Ok((llm_result, vec![artifact], transcript))
}
3047
3048pub fn next_nodes_for(
3049    graph: &WorkflowGraph,
3050    current: &str,
3051    branch: Option<&str>,
3052) -> Vec<WorkflowEdge> {
3053    let mut matching: Vec<WorkflowEdge> = graph
3054        .edges
3055        .iter()
3056        .filter(|edge| edge.from == current && edge.branch.as_deref() == branch)
3057        .cloned()
3058        .collect();
3059    if matching.is_empty() {
3060        matching = graph
3061            .edges
3062            .iter()
3063            .filter(|edge| edge.from == current && edge.branch.is_none())
3064            .cloned()
3065            .collect();
3066    }
3067    matching
3068}
3069
3070pub fn next_node_for(graph: &WorkflowGraph, current: &str, branch: &str) -> Option<String> {
3071    next_nodes_for(graph, current, Some(branch))
3072        .into_iter()
3073        .next()
3074        .map(|edge| edge.to)
3075}
3076
3077pub fn append_audit_entry(
3078    graph: &mut WorkflowGraph,
3079    op: &str,
3080    node_id: Option<String>,
3081    reason: Option<String>,
3082    metadata: BTreeMap<String, serde_json::Value>,
3083) {
3084    graph.audit_log.push(WorkflowAuditEntry {
3085        id: new_id("audit"),
3086        op: op.to_string(),
3087        node_id,
3088        timestamp: now_rfc3339(),
3089        reason,
3090        metadata,
3091    });
3092}
3093
3094pub fn builtin_ceiling() -> CapabilityPolicy {
3095    CapabilityPolicy {
3096        // Runtime-owned ceiling is capability-based, not product-tool-based.
3097        // Integrators define concrete tool surfaces in workflow graphs / registries.
3098        tools: Vec::new(),
3099        capabilities: BTreeMap::from([
3100            (
3101                "workspace".to_string(),
3102                vec![
3103                    "read_text".to_string(),
3104                    "write_text".to_string(),
3105                    "apply_edit".to_string(),
3106                    "delete".to_string(),
3107                    "exists".to_string(),
3108                    "list".to_string(),
3109                ],
3110            ),
3111            ("process".to_string(), vec!["exec".to_string()]),
3112        ]),
3113        workspace_roots: Vec::new(),
3114        side_effect_level: Some("network".to_string()),
3115        recursion_limit: Some(8),
3116        tool_arg_constraints: Vec::new(),
3117        tool_metadata: BTreeMap::new(),
3118    }
3119}
3120
3121#[cfg(test)]
3122mod tests {
3123    use super::*;
3124
    // Requesting a tool ('edit') that the host ceiling does not grant must
    // fail the intersection, with an error mentioning the host ceiling.
    #[test]
    fn capability_intersection_rejects_privilege_expansion() {
        let ceiling = CapabilityPolicy {
            tools: vec!["read".to_string()],
            side_effect_level: Some("read_only".to_string()),
            recursion_limit: Some(2),
            ..Default::default()
        };
        let requested = CapabilityPolicy {
            tools: vec!["read".to_string(), "edit".to_string()],
            ..Default::default()
        };
        let error = ceiling.intersect(&requested).unwrap_err();
        assert!(error.contains("host ceiling"));
    }
3140
    // normalize() on an empty record must synthesize a session id and fill
    // the safe defaults (read_only scope, host-enforced approval).
    #[test]
    fn mutation_session_normalize_fills_defaults() {
        let normalized = MutationSessionRecord::default().normalize();
        assert!(normalized.session_id.starts_with("session_"));
        assert_eq!(normalized.mutation_scope, "read_only");
        assert_eq!(normalized.approval_mode, "host_enforced");
    }
3148
    // Installing a session must make it readable via the thread-local
    // accessor, and installing None must clear it again.
    #[test]
    fn install_current_mutation_session_round_trips() {
        install_current_mutation_session(Some(MutationSessionRecord {
            session_id: "session_test".to_string(),
            mutation_scope: "apply_workspace".to_string(),
            approval_mode: "explicit".to_string(),
            ..Default::default()
        }));
        let current = current_mutation_session().expect("session installed");
        assert_eq!(current.session_id, "session_test");
        assert_eq!(current.mutation_scope, "apply_workspace");
        assert_eq!(current.approval_mode, "explicit");

        install_current_mutation_session(None);
        assert!(current_mutation_session().is_none());
    }
3165
    // With any policy active, bridged host builtins must be rejected with a
    // ToolRejected categorized error (they bypass the declared surface).
    #[test]
    fn active_execution_policy_rejects_unknown_bridge_builtin() {
        push_execution_policy(CapabilityPolicy {
            tools: vec!["read".to_string()],
            capabilities: BTreeMap::from([(
                "workspace".to_string(),
                vec!["read_text".to_string()],
            )]),
            side_effect_level: Some("read_only".to_string()),
            recursion_limit: Some(1),
            ..Default::default()
        });
        let error = enforce_current_policy_for_bridge_builtin("custom_host_builtin").unwrap_err();
        // Pop before asserting so the thread-local stack is restored even if
        // later assertions change.
        pop_execution_policy();
        assert!(matches!(
            error,
            VmError::CategorizedError {
                category: crate::value::ErrorCategory::ToolRejected,
                ..
            }
        ));
    }
3188
    // MCP builtins are gated like process exec; a read-only policy without
    // process.exec must reject mcp_connect with ToolRejected.
    #[test]
    fn active_execution_policy_rejects_mcp_escape_hatch() {
        push_execution_policy(CapabilityPolicy {
            tools: vec!["read".to_string()],
            capabilities: BTreeMap::from([(
                "workspace".to_string(),
                vec!["read_text".to_string()],
            )]),
            side_effect_level: Some("read_only".to_string()),
            recursion_limit: Some(1),
            ..Default::default()
        });
        let error = enforce_current_policy_for_builtin("mcp_connect", &[]).unwrap_err();
        pop_execution_policy();
        assert!(matches!(
            error,
            VmError::CategorizedError {
                category: crate::value::ErrorCategory::ToolRejected,
                ..
            }
        ));
    }
3211
    // Legacy act/verify/repair top-level keys must normalize into a full
    // workflow graph with those three nodes and 'act' as the entry point.
    #[test]
    fn workflow_normalization_upgrades_legacy_act_verify_repair_shape() {
        let value = crate::stdlib::json_to_vm_value(&serde_json::json!({
            "name": "legacy",
            "act": {"mode": "llm"},
            "verify": {"kind": "verify"},
            "repair": {"mode": "agent"},
        }));
        let graph = normalize_workflow_value(&value).unwrap();
        assert_eq!(graph.type_name, "workflow_graph");
        assert!(graph.nodes.contains_key("act"));
        assert!(graph.nodes.contains_key("verify"));
        assert!(graph.nodes.contains_key("repair"));
        assert_eq!(graph.entry, "act");
    }
3227
    // A node's `tools` field may carry a tool_registry object; the tool
    // names inside it must be extractable via workflow_tool_names.
    #[test]
    fn workflow_normalization_accepts_tool_registry_nodes() {
        let value = crate::stdlib::json_to_vm_value(&serde_json::json!({
            "name": "registry_tools",
            "entry": "implement",
            "nodes": {
                "implement": {
                    "kind": "stage",
                    "mode": "agent",
                    "tools": {
                        "_type": "tool_registry",
                        "tools": [
                            {"name": "read", "description": "Read files"},
                            {"name": "run", "description": "Run commands"}
                        ]
                    }
                }
            },
            "edges": []
        }));
        let graph = normalize_workflow_value(&value).unwrap();
        let node = graph.nodes.get("implement").unwrap();
        assert_eq!(workflow_tool_names(&node.tools), vec!["read", "run"]);
    }
3252
    // With max_artifacts=2 and a tight token budget, selection must keep
    // exactly two artifacts and respect the policy's kind constraints.
    #[test]
    fn artifact_selection_honors_budget_and_priority() {
        let policy = ContextPolicy {
            max_artifacts: Some(2),
            max_tokens: Some(30),
            prefer_recent: true,
            prefer_fresh: true,
            prioritize_kinds: vec!["verification_result".to_string()],
            ..Default::default()
        };
        // Three summaries with varying relevance and body size; the token
        // budget should squeeze out the oversized one.
        let artifacts = vec![
            ArtifactRecord {
                type_name: "artifact".to_string(),
                id: "a".to_string(),
                kind: "summary".to_string(),
                text: Some("short".to_string()),
                relevance: Some(0.9),
                created_at: now_rfc3339(),
                ..Default::default()
            }
            .normalize(),
            ArtifactRecord {
                type_name: "artifact".to_string(),
                id: "b".to_string(),
                kind: "summary".to_string(),
                text: Some("this is a much larger artifact body".to_string()),
                relevance: Some(1.0),
                created_at: now_rfc3339(),
                ..Default::default()
            }
            .normalize(),
            ArtifactRecord {
                type_name: "artifact".to_string(),
                id: "c".to_string(),
                kind: "summary".to_string(),
                text: Some("tiny".to_string()),
                relevance: Some(0.5),
                created_at: now_rfc3339(),
                ..Default::default()
            }
            .normalize(),
        ];
        let selected = select_artifacts(artifacts, &policy);
        assert_eq!(selected.len(), 2);
        assert!(selected.iter().all(|artifact| artifact.kind == "summary"));
    }
3299
    // A condition node with only a 'true' branch edge (no 'false') must be
    // flagged invalid, and the error should name both branch labels.
    #[test]
    fn workflow_validation_rejects_condition_without_true_false_edges() {
        let graph = WorkflowGraph {
            entry: "gate".to_string(),
            nodes: BTreeMap::from([(
                "gate".to_string(),
                WorkflowNode {
                    id: Some("gate".to_string()),
                    kind: "condition".to_string(),
                    ..Default::default()
                },
            )]),
            edges: vec![WorkflowEdge {
                from: "gate".to_string(),
                to: "next".to_string(),
                branch: Some("true".to_string()),
                label: None,
            }],
            ..Default::default()
        };
        let report = validate_workflow(&graph, None);
        assert!(!report.valid);
        assert!(report
            .errors
            .iter()
            .any(|error| error.contains("true") && error.contains("false")));
    }
3327
    #[test]
    fn replay_fixture_round_trip_passes() {
        // Builds a fully-populated, single-stage completed run (every field
        // spelled out on purpose — this literal doubles as a reference for
        // the full RunRecord shape), derives a replay fixture from it, then
        // evaluates the same run against that fixture. A run must always
        // pass against a fixture generated from itself.
        let run = RunRecord {
            type_name: "run_record".to_string(),
            id: "run_1".to_string(),
            workflow_id: "wf".to_string(),
            workflow_name: Some("demo".to_string()),
            task: "demo".to_string(),
            status: "completed".to_string(),
            started_at: "1".to_string(),
            finished_at: Some("2".to_string()),
            parent_run_id: None,
            root_run_id: Some("run_1".to_string()),
            // One completed stage that produced a single summary artifact.
            stages: vec![RunStageRecord {
                id: "stage_1".to_string(),
                node_id: "act".to_string(),
                kind: "stage".to_string(),
                status: "completed".to_string(),
                outcome: "success".to_string(),
                branch: Some("success".to_string()),
                started_at: "1".to_string(),
                finished_at: Some("2".to_string()),
                visible_text: Some("done".to_string()),
                private_reasoning: None,
                transcript: None,
                verification: None,
                usage: None,
                artifacts: vec![ArtifactRecord {
                    type_name: "artifact".to_string(),
                    id: "a1".to_string(),
                    kind: "summary".to_string(),
                    text: Some("done".to_string()),
                    created_at: "1".to_string(),
                    ..Default::default()
                }
                .normalize()],
                consumed_artifact_ids: vec![],
                produced_artifact_ids: vec!["a1".to_string()],
                attempts: vec![],
                metadata: BTreeMap::new(),
            }],
            transitions: vec![],
            checkpoints: vec![],
            pending_nodes: vec![],
            completed_nodes: vec!["act".to_string()],
            child_runs: vec![],
            artifacts: vec![],
            policy: CapabilityPolicy::default(),
            execution: None,
            transcript: None,
            usage: None,
            replay_fixture: None,
            trace_spans: vec![],
            metadata: BTreeMap::new(),
            persisted_path: None,
        };
        let fixture = replay_fixture_from_run(&run);
        let report = evaluate_run_against_fixture(&run, &fixture);
        // Self-comparison: no mismatches of any kind.
        assert!(report.pass);
        assert!(report.failures.is_empty());
    }
3389
3390    #[test]
3391    fn replay_eval_suite_reports_failed_case() {
3392        let good = RunRecord {
3393            id: "run_good".to_string(),
3394            workflow_id: "wf".to_string(),
3395            status: "completed".to_string(),
3396            stages: vec![RunStageRecord {
3397                node_id: "act".to_string(),
3398                status: "completed".to_string(),
3399                outcome: "success".to_string(),
3400                ..Default::default()
3401            }],
3402            ..Default::default()
3403        };
3404        let bad = RunRecord {
3405            id: "run_bad".to_string(),
3406            workflow_id: "wf".to_string(),
3407            status: "failed".to_string(),
3408            stages: vec![RunStageRecord {
3409                node_id: "act".to_string(),
3410                status: "failed".to_string(),
3411                outcome: "error".to_string(),
3412                ..Default::default()
3413            }],
3414            ..Default::default()
3415        };
3416        let suite = evaluate_run_suite(vec![
3417            (
3418                good.clone(),
3419                replay_fixture_from_run(&good),
3420                Some("good.json".to_string()),
3421            ),
3422            (
3423                bad.clone(),
3424                replay_fixture_from_run(&good),
3425                Some("bad.json".to_string()),
3426            ),
3427        ]);
3428        assert!(!suite.pass);
3429        assert_eq!(suite.total, 2);
3430        assert_eq!(suite.failed, 1);
3431        assert!(suite.cases.iter().any(|case| !case.pass));
3432    }
3433
3434    #[test]
3435    fn run_diff_reports_changed_stage() {
3436        let left = RunRecord {
3437            id: "left".to_string(),
3438            workflow_id: "wf".to_string(),
3439            status: "completed".to_string(),
3440            stages: vec![RunStageRecord {
3441                node_id: "act".to_string(),
3442                status: "completed".to_string(),
3443                outcome: "success".to_string(),
3444                ..Default::default()
3445            }],
3446            ..Default::default()
3447        };
3448        let right = RunRecord {
3449            id: "right".to_string(),
3450            workflow_id: "wf".to_string(),
3451            status: "failed".to_string(),
3452            stages: vec![RunStageRecord {
3453                node_id: "act".to_string(),
3454                status: "failed".to_string(),
3455                outcome: "error".to_string(),
3456                ..Default::default()
3457            }],
3458            ..Default::default()
3459        };
3460        let diff = diff_run_records(&left, &right);
3461        assert!(diff.status_changed);
3462        assert!(!diff.identical);
3463        assert_eq!(diff.stage_diffs.len(), 1);
3464    }
3465
3466    #[test]
3467    fn eval_suite_manifest_can_fail_on_baseline_diff() {
3468        let temp_dir =
3469            std::env::temp_dir().join(format!("harn-eval-suite-{}", uuid::Uuid::now_v7()));
3470        std::fs::create_dir_all(&temp_dir).unwrap();
3471        let baseline_path = temp_dir.join("baseline.json");
3472        let candidate_path = temp_dir.join("candidate.json");
3473
3474        let baseline = RunRecord {
3475            id: "baseline".to_string(),
3476            workflow_id: "wf".to_string(),
3477            status: "completed".to_string(),
3478            stages: vec![RunStageRecord {
3479                node_id: "act".to_string(),
3480                status: "completed".to_string(),
3481                outcome: "success".to_string(),
3482                ..Default::default()
3483            }],
3484            ..Default::default()
3485        };
3486        let candidate = RunRecord {
3487            id: "candidate".to_string(),
3488            workflow_id: "wf".to_string(),
3489            status: "failed".to_string(),
3490            stages: vec![RunStageRecord {
3491                node_id: "act".to_string(),
3492                status: "failed".to_string(),
3493                outcome: "error".to_string(),
3494                ..Default::default()
3495            }],
3496            ..Default::default()
3497        };
3498
3499        save_run_record(&baseline, Some(baseline_path.to_str().unwrap())).unwrap();
3500        save_run_record(&candidate, Some(candidate_path.to_str().unwrap())).unwrap();
3501
3502        let manifest = EvalSuiteManifest {
3503            base_dir: Some(temp_dir.display().to_string()),
3504            cases: vec![EvalSuiteCase {
3505                label: Some("candidate".to_string()),
3506                run_path: "candidate.json".to_string(),
3507                fixture_path: None,
3508                compare_to: Some("baseline.json".to_string()),
3509            }],
3510            ..Default::default()
3511        };
3512        let suite = evaluate_run_suite_manifest(&manifest).unwrap();
3513        assert!(!suite.pass);
3514        assert_eq!(suite.failed, 1);
3515        assert!(suite.cases[0].comparison.is_some());
3516        assert!(suite.cases[0]
3517            .failures
3518            .iter()
3519            .any(|failure| failure.contains("baseline")));
3520    }
3521
3522    #[test]
3523    fn render_unified_diff_marks_removed_and_added_lines() {
3524        let diff = render_unified_diff(Some("src/main.rs"), "old\nsame", "new\nsame");
3525        assert!(diff.contains("--- a/src/main.rs"));
3526        assert!(diff.contains("+++ b/src/main.rs"));
3527        assert!(diff.contains("-old"));
3528        assert!(diff.contains("+new"));
3529        assert!(diff.contains(" same"));
3530    }
3531
3532    #[test]
3533    fn execution_policy_rejects_process_exec_when_read_only() {
3534        push_execution_policy(CapabilityPolicy {
3535            side_effect_level: Some("read_only".to_string()),
3536            capabilities: BTreeMap::from([("process".to_string(), vec!["exec".to_string()])]),
3537            ..Default::default()
3538        });
3539        let result = enforce_current_policy_for_builtin("exec", &[]);
3540        pop_execution_policy();
3541        assert!(result.is_err());
3542    }
3543
3544    #[test]
3545    fn execution_policy_rejects_unlisted_tool() {
3546        push_execution_policy(CapabilityPolicy {
3547            tools: vec!["read".to_string()],
3548            ..Default::default()
3549        });
3550        let result = enforce_current_policy_for_tool("edit");
3551        pop_execution_policy();
3552        assert!(result.is_err());
3553    }
3554
3555    #[test]
3556    fn normalize_run_record_preserves_trace_spans() {
3557        let value = crate::stdlib::json_to_vm_value(&serde_json::json!({
3558            "_type": "run_record",
3559            "id": "run_trace",
3560            "workflow_id": "wf",
3561            "status": "completed",
3562            "started_at": "1",
3563            "trace_spans": [
3564                {
3565                    "span_id": 1,
3566                    "parent_id": null,
3567                    "kind": "pipeline",
3568                    "name": "workflow",
3569                    "start_ms": 0,
3570                    "duration_ms": 42,
3571                    "metadata": {"model": "demo"}
3572                }
3573            ]
3574        }));
3575
3576        let run = normalize_run_record(&value).unwrap();
3577        assert_eq!(run.trace_spans.len(), 1);
3578        assert_eq!(run.trace_spans[0].kind, "pipeline");
3579        assert_eq!(
3580            run.trace_spans[0].metadata["model"],
3581            serde_json::json!("demo")
3582        );
3583    }
3584
3585    // ── Tool hook tests ──────────────────────────────────────────────
3586
3587    #[test]
3588    fn pre_tool_hook_deny_blocks_execution() {
3589        clear_tool_hooks();
3590        register_tool_hook(ToolHook {
3591            pattern: "dangerous_*".to_string(),
3592            pre: Some(Rc::new(|_name, _args| {
3593                PreToolAction::Deny("blocked by policy".to_string())
3594            })),
3595            post: None,
3596        });
3597        let result = run_pre_tool_hooks("dangerous_delete", &serde_json::json!({}));
3598        clear_tool_hooks();
3599        assert!(matches!(result, PreToolAction::Deny(_)));
3600    }
3601
3602    #[test]
3603    fn pre_tool_hook_allow_passes_through() {
3604        clear_tool_hooks();
3605        register_tool_hook(ToolHook {
3606            pattern: "safe_*".to_string(),
3607            pre: Some(Rc::new(|_name, _args| PreToolAction::Allow)),
3608            post: None,
3609        });
3610        let result = run_pre_tool_hooks("safe_read", &serde_json::json!({}));
3611        clear_tool_hooks();
3612        assert!(matches!(result, PreToolAction::Allow));
3613    }
3614
3615    #[test]
3616    fn pre_tool_hook_modify_rewrites_args() {
3617        clear_tool_hooks();
3618        register_tool_hook(ToolHook {
3619            pattern: "*".to_string(),
3620            pre: Some(Rc::new(|_name, _args| {
3621                PreToolAction::Modify(serde_json::json!({"path": "/sanitized"}))
3622            })),
3623            post: None,
3624        });
3625        let result = run_pre_tool_hooks("read_file", &serde_json::json!({"path": "/etc/passwd"}));
3626        clear_tool_hooks();
3627        match result {
3628            PreToolAction::Modify(args) => assert_eq!(args["path"], "/sanitized"),
3629            _ => panic!("expected Modify"),
3630        }
3631    }
3632
3633    #[test]
3634    fn post_tool_hook_modifies_result() {
3635        clear_tool_hooks();
3636        register_tool_hook(ToolHook {
3637            pattern: "exec".to_string(),
3638            pre: None,
3639            post: Some(Rc::new(|_name, result| {
3640                if result.contains("SECRET") {
3641                    PostToolAction::Modify("[REDACTED]".to_string())
3642                } else {
3643                    PostToolAction::Pass
3644                }
3645            })),
3646        });
3647        let result = run_post_tool_hooks("exec", "output with SECRET data");
3648        let clean = run_post_tool_hooks("exec", "clean output");
3649        clear_tool_hooks();
3650        assert_eq!(result, "[REDACTED]");
3651        assert_eq!(clean, "clean output");
3652    }
3653
3654    #[test]
3655    fn unmatched_hook_pattern_does_not_fire() {
3656        clear_tool_hooks();
3657        register_tool_hook(ToolHook {
3658            pattern: "exec".to_string(),
3659            pre: Some(Rc::new(|_name, _args| {
3660                PreToolAction::Deny("should not match".to_string())
3661            })),
3662            post: None,
3663        });
3664        let result = run_pre_tool_hooks("read_file", &serde_json::json!({}));
3665        clear_tool_hooks();
3666        assert!(matches!(result, PreToolAction::Allow));
3667    }
3668
3669    #[test]
3670    fn glob_match_patterns() {
3671        assert!(glob_match("*", "anything"));
3672        assert!(glob_match("exec*", "exec_at"));
3673        assert!(glob_match("*_file", "read_file"));
3674        assert!(!glob_match("exec*", "read_file"));
3675        assert!(glob_match("read_file", "read_file"));
3676        assert!(!glob_match("read_file", "write_file"));
3677    }
3678
3679    // ── Auto-compaction tests ────────────────────────────────────────
3680
3681    #[test]
3682    fn microcompact_snips_large_output() {
3683        let large = "x".repeat(50_000);
3684        let result = microcompact_tool_output(&large, 10_000);
3685        assert!(result.len() < 15_000);
3686        assert!(result.contains("snipped"));
3687    }
3688
3689    #[test]
3690    fn microcompact_preserves_small_output() {
3691        let small = "hello world";
3692        let result = microcompact_tool_output(small, 10_000);
3693        assert_eq!(result, small);
3694    }
3695
3696    #[test]
3697    fn microcompact_preserves_strong_keyword_lines_without_file_line() {
3698        // Regression: diagnostic extraction used to require both a
3699        // file:line reference AND a keyword. Strong keywords like "FAIL"
3700        // and "panic" should preserve the line on their own, because they
3701        // carry signal even when they appear on narrative lines (Go's
3702        // "--- FAIL: TestName", Rust's "thread '...' panicked at ...",
3703        // pytest's "FAILED tests/..."). The exact patterns are language-
3704        // specific and don't belong in the VM — but the generic rule
3705        // "strong keywords count even without file:line" does.
3706        let mut output = String::new();
3707        for i in 0..100 {
3708            output.push_str(&format!("verbose progress line {i}\n"));
3709        }
3710        output.push_str("--- FAIL: TestEmpty (0.00s)\n");
3711        output.push_str("thread 'tests::test_foo' panicked at src/lib.rs:42:5\n");
3712        output.push_str("FAILED tests/test_parser.py::test_empty\n");
3713        for i in 0..100 {
3714            output.push_str(&format!("more output after failures {i}\n"));
3715        }
3716        let result = microcompact_tool_output(&output, 2_000);
3717        assert!(
3718            result.contains("--- FAIL: TestEmpty"),
3719            "strong 'FAIL' keyword should preserve the line:\n{result}"
3720        );
3721        assert!(
3722            result.contains("panicked at"),
3723            "strong 'panic' keyword should preserve the line:\n{result}"
3724        );
3725        assert!(
3726            result.contains("FAILED tests/test_parser.py"),
3727            "strong 'FAIL' keyword should preserve pytest-style lines too:\n{result}"
3728        );
3729    }
3730
3731    #[test]
3732    fn auto_compact_messages_reduces_count() {
3733        let mut messages: Vec<serde_json::Value> = (0..20)
3734            .map(|i| serde_json::json!({"role": "user", "content": format!("message {i}")}))
3735            .collect();
3736        let runtime = tokio::runtime::Builder::new_current_thread()
3737            .enable_all()
3738            .build()
3739            .unwrap();
3740        let compacted = runtime.block_on(auto_compact_messages(
3741            &mut messages,
3742            &AutoCompactConfig {
3743                compact_strategy: CompactStrategy::Truncate,
3744                keep_last: 6,
3745                ..Default::default()
3746            },
3747            None,
3748        ));
3749        let summary = compacted.unwrap();
3750        assert!(summary.is_some());
3751        assert!(messages.len() <= 7); // 6 kept + 1 summary
3752        assert!(messages[0]["content"]
3753            .as_str()
3754            .unwrap()
3755            .contains("auto-compacted"));
3756    }
3757
3758    #[test]
3759    fn auto_compact_noop_when_under_threshold() {
3760        let mut messages: Vec<serde_json::Value> = (0..4)
3761            .map(|i| serde_json::json!({"role": "user", "content": format!("msg {i}")}))
3762            .collect();
3763        let runtime = tokio::runtime::Builder::new_current_thread()
3764            .enable_all()
3765            .build()
3766            .unwrap();
3767        let compacted = runtime.block_on(auto_compact_messages(
3768            &mut messages,
3769            &AutoCompactConfig {
3770                compact_strategy: CompactStrategy::Truncate,
3771                keep_last: 6,
3772                ..Default::default()
3773            },
3774            None,
3775        ));
3776        assert!(compacted.unwrap().is_none());
3777        assert_eq!(messages.len(), 4);
3778    }
3779
3780    #[test]
3781    fn estimate_message_tokens_basic() {
3782        let messages = vec![
3783            serde_json::json!({"role": "user", "content": "a".repeat(400)}),
3784            serde_json::json!({"role": "assistant", "content": "b".repeat(400)}),
3785        ];
3786        let tokens = estimate_message_tokens(&messages);
3787        assert_eq!(tokens, 200); // 800 chars / 4
3788    }
3789
3790    // ── Artifact dedup and microcompaction tests ─────────────────────
3791
3792    #[test]
3793    fn dedup_artifacts_removes_duplicates() {
3794        let mut artifacts = vec![
3795            ArtifactRecord {
3796                id: "a1".to_string(),
3797                kind: "test".to_string(),
3798                text: Some("duplicate content".to_string()),
3799                ..Default::default()
3800            },
3801            ArtifactRecord {
3802                id: "a2".to_string(),
3803                kind: "test".to_string(),
3804                text: Some("duplicate content".to_string()),
3805                ..Default::default()
3806            },
3807            ArtifactRecord {
3808                id: "a3".to_string(),
3809                kind: "test".to_string(),
3810                text: Some("unique content".to_string()),
3811                ..Default::default()
3812            },
3813        ];
3814        dedup_artifacts(&mut artifacts);
3815        assert_eq!(artifacts.len(), 2);
3816    }
3817
3818    #[test]
3819    fn microcompact_artifact_snips_oversized() {
3820        let mut artifact = ArtifactRecord {
3821            id: "a1".to_string(),
3822            kind: "test".to_string(),
3823            text: Some("x".repeat(10_000)),
3824            estimated_tokens: Some(2_500),
3825            ..Default::default()
3826        };
3827        microcompact_artifact(&mut artifact, 500);
3828        assert!(artifact.text.as_ref().unwrap().len() < 5_000);
3829        assert_eq!(artifact.estimated_tokens, Some(500));
3830    }
3831
3832    // ── Tool argument constraint tests ───────────────────────────────
3833
3834    #[test]
3835    fn arg_constraint_allows_matching_pattern() {
3836        let policy = CapabilityPolicy {
3837            tool_arg_constraints: vec![ToolArgConstraint {
3838                tool: "exec".to_string(),
3839                arg_patterns: vec!["cargo *".to_string()],
3840            }],
3841            ..Default::default()
3842        };
3843        let result = enforce_tool_arg_constraints(
3844            &policy,
3845            "exec",
3846            &serde_json::json!({"command": "cargo test"}),
3847        );
3848        assert!(result.is_ok());
3849    }
3850
3851    #[test]
3852    fn arg_constraint_rejects_non_matching_pattern() {
3853        let policy = CapabilityPolicy {
3854            tool_arg_constraints: vec![ToolArgConstraint {
3855                tool: "exec".to_string(),
3856                arg_patterns: vec!["cargo *".to_string()],
3857            }],
3858            ..Default::default()
3859        };
3860        let result = enforce_tool_arg_constraints(
3861            &policy,
3862            "exec",
3863            &serde_json::json!({"command": "rm -rf /"}),
3864        );
3865        assert!(result.is_err());
3866    }
3867
3868    #[test]
3869    fn arg_constraint_ignores_unmatched_tool() {
3870        let policy = CapabilityPolicy {
3871            tool_arg_constraints: vec![ToolArgConstraint {
3872                tool: "exec".to_string(),
3873                arg_patterns: vec!["cargo *".to_string()],
3874            }],
3875            ..Default::default()
3876        };
3877        let result = enforce_tool_arg_constraints(
3878            &policy,
3879            "read_file",
3880            &serde_json::json!({"path": "/etc/passwd"}),
3881        );
3882        assert!(result.is_ok());
3883    }
3884
3885    #[test]
3886    fn microcompact_handles_multibyte_utf8() {
3887        // Emoji are 4 bytes each — slicing at arbitrary byte offsets would panic
3888        let emoji_output = "🔥".repeat(500); // 2000 bytes, 500 chars
3889        let result = microcompact_tool_output(&emoji_output, 400);
3890        // Should not panic and should contain the snip marker
3891        assert!(result.contains("snipped"));
3892
3893        // Mixed ASCII + multi-byte
3894        let mixed = format!("{}{}{}", "a".repeat(300), "é".repeat(500), "b".repeat(300));
3895        let result2 = microcompact_tool_output(&mixed, 400);
3896        assert!(result2.contains("snipped"));
3897
3898        // CJK characters (3 bytes each)
3899        let cjk = "中文".repeat(500);
3900        let result3 = microcompact_tool_output(&cjk, 400);
3901        assert!(result3.contains("snipped"));
3902    }
3903}