Skip to main content

harn_vm/orchestration/
artifacts.rs

1//! Artifact types, normalization, selection, and context rendering.
2
3use std::collections::{BTreeMap, BTreeSet};
4
5use serde::{Deserialize, Serialize};
6
7use crate::stdlib::harn_entry::{call_harn_export_json, call_harn_export_typed};
8use crate::stdlib::xml::escape_xml_text;
9
10use super::{
11    handoff_artifact_record, handoff_from_json_value, microcompact_tool_output, new_id,
12    normalize_handoff_artifact_json, now_rfc3339, ContextPolicy, StageContract,
13    VerificationContract,
14};
15
16/// Snip an artifact's text to fit within a token budget.
17pub fn microcompact_artifact(artifact: &mut ArtifactRecord, max_tokens: usize) {
18    let max_chars = max_tokens * 4;
19    if let Some(ref text) = artifact.text {
20        if text.len() > max_chars && max_chars >= 200 {
21            artifact.text = Some(microcompact_tool_output(text, max_chars));
22            artifact.estimated_tokens = Some(max_tokens);
23        }
24    }
25}
26
27/// Deduplicate artifacts by removing those with identical text content,
28/// keeping the one with higher priority.
29pub fn dedup_artifacts(artifacts: &mut Vec<ArtifactRecord>) {
30    let mut seen_hashes: BTreeSet<u64> = BTreeSet::new();
31    artifacts.retain(|artifact| {
32        let text = artifact.text.as_deref().unwrap_or("");
33        if text.is_empty() {
34            return true;
35        }
36        let hash = {
37            use std::hash::{Hash, Hasher};
38            let mut hasher = std::collections::hash_map::DefaultHasher::new();
39            text.hash(&mut hasher);
40            hasher.finish()
41        };
42        seen_hashes.insert(hash)
43    });
44}
45
46/// Enhanced artifact selection: dedup, microcompact oversized artifacts,
47/// then delegate to the standard `select_artifacts`.
48pub fn select_artifacts_adaptive(
49    mut artifacts: Vec<ArtifactRecord>,
50    policy: &ContextPolicy,
51) -> Vec<ArtifactRecord> {
52    drop_stale_evidence_artifacts(&mut artifacts);
53    dedup_artifacts(&mut artifacts);
54
55    // Cap individual artifacts to a fraction of the total budget, with a 500-token
56    // floor but never exceeding the total (so a single artifact can't overrun).
57    if let Some(max_tokens) = policy.max_tokens {
58        let count = artifacts.len().max(1);
59        let per_artifact_budget = max_tokens / count;
60        let cap = per_artifact_budget.max(500).min(max_tokens);
61        for artifact in &mut artifacts {
62            let est = artifact.estimated_tokens.unwrap_or(0);
63            if est > cap * 2 {
64                microcompact_artifact(artifact, cap);
65            }
66        }
67    }
68
69    select_artifacts(artifacts, policy)
70}
71
72fn metadata_string_list(artifact: &ArtifactRecord, key: &str) -> Vec<String> {
73    artifact
74        .metadata
75        .get(key)
76        .and_then(|value| value.as_array())
77        .map(|items| {
78            items
79                .iter()
80                .filter_map(|item| item.as_str())
81                .map(str::trim)
82                .filter(|value| !value.is_empty())
83                .map(ToOwned::to_owned)
84                .collect::<Vec<_>>()
85        })
86        .unwrap_or_default()
87}
88
89fn drop_stale_evidence_artifacts(artifacts: &mut Vec<ArtifactRecord>) {
90    let fresh_changed_paths: BTreeSet<String> = artifacts
91        .iter()
92        .filter(|artifact| freshness_rank(artifact.freshness.as_deref()) >= 2)
93        .flat_map(|artifact| metadata_string_list(artifact, "changed_paths"))
94        .collect();
95    if fresh_changed_paths.is_empty() {
96        return;
97    }
98
99    artifacts.retain(|artifact| {
100        let evidence_paths = metadata_string_list(artifact, "evidence_paths");
101        if evidence_paths.is_empty() {
102            return true;
103        }
104        if freshness_rank(artifact.freshness.as_deref()) >= 2 {
105            return true;
106        }
107        !evidence_paths
108            .iter()
109            .any(|path| fresh_changed_paths.contains(path))
110    });
111}
112
/// Map an artifact kind to its canonical spelling.
///
/// * Blank input (empty or whitespace only) becomes `"artifact"`.
/// * The short aliases `file`, `transcript`, `verification` and `test` expand
///   to `workspace_file`, `transcript_summary`, `verification_result` and
///   `test_result`.
/// * Every other value is returned unchanged. That covers the canonical kinds
///   (`resource`, `handoff`, `workspace_file`, `editor_selection`,
///   `workspace_snapshot`, `transcript_summary`, `summary`, `plan`, `diff`,
///   `git_diff`, `patch`, `patch_set`, `patch_proposal`, `diff_review`,
///   `review_decision`, `verification_bundle`, `apply_intent`,
///   `verification_result`, `test_result`, `command_result`,
///   `provider_payload`, `worker_result`, `worker_notification`, `artifact`)
///   as well as any custom kind.
///
/// Matching is exact: no trimming or case folding is applied before the alias
/// lookup.
fn normalize_artifact_kind(kind: &str) -> String {
    if kind.trim().is_empty() {
        return "artifact".to_string();
    }
    let canonical = match kind {
        "file" => "workspace_file",
        "transcript" => "transcript_summary",
        "verification" => "verification_result",
        "test" => "test_result",
        unchanged => unchanged,
    };
    canonical.to_string()
}
147
/// Default selection priority for an artifact kind (higher is more important).
///
/// Kinds are grouped into tiers; any kind not listed gets 40.
fn default_artifact_priority(kind: &str) -> i64 {
    const FALLBACK_PRIORITY: i64 = 40;
    const TIERS: &[(i64, &[&str])] = &[
        (100, &["verification_result", "test_result"]),
        (95, &["verification_bundle"]),
        (92, &["handoff"]),
        (
            90,
            &[
                "diff",
                "git_diff",
                "patch",
                "patch_set",
                "patch_proposal",
                "diff_review",
                "review_decision",
                "apply_intent",
            ],
        ),
        (80, &["plan"]),
        (
            70,
            &[
                "workspace_file",
                "workspace_snapshot",
                "editor_selection",
                "resource",
            ],
        ),
        (60, &["summary", "transcript_summary"]),
        (50, &["command_result"]),
    ];

    TIERS
        .iter()
        .find(|(_, kinds)| kinds.contains(&kind))
        .map_or(FALLBACK_PRIORITY, |(priority, _)| *priority)
}
162
/// Rank a freshness label; larger means fresher.
///
/// `fresh` and `live` rank 3, `recent` ranks 2 and `stale` ranks 0. A missing
/// label or any other text ranks 1.
fn freshness_rank(value: Option<&str>) -> i64 {
    match value {
        Some("fresh" | "live") => 3,
        Some("recent") => 2,
        Some("stale") => 0,
        Some(_) | None => 1,
    }
}
171
/// A single artifact that can be selected into, and rendered as, model context.
///
/// Every field is optional on input (`#[serde(default)]`); `normalize` fills
/// in the ones that selection and rendering rely on.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct ArtifactRecord {
    /// Record type tag, serialized as `_type`; `normalize` sets it to
    /// `"artifact"` when empty.
    #[serde(rename = "_type")]
    pub type_name: String,
    /// Identifier; `normalize` generates one when empty. Matched against the
    /// policy's `pinned_ids` during selection.
    pub id: String,
    /// Artifact kind; `normalize` canonicalizes it via `normalize_artifact_kind`.
    pub kind: String,
    /// Display title; rendering falls back to `"{kind} {id}"` when absent.
    pub title: Option<String>,
    /// Text body. Used for deduplication, token estimation and rendering.
    pub text: Option<String>,
    /// Structured payload; rendered as its JSON string when `text` is absent.
    pub data: Option<serde_json::Value>,
    /// Origin label; the XML rendering shows `unknown` when absent.
    pub source: Option<String>,
    /// Creation timestamp; `normalize` fills it from `now_rfc3339()` when
    /// empty. Compared as a plain string when the policy prefers recent items.
    pub created_at: String,
    /// Freshness label interpreted by `freshness_rank`
    /// (`fresh`/`live`/`recent`/`stale`; anything else is neutral).
    pub freshness: Option<String>,
    /// Selection priority (higher first); `normalize` defaults it by kind.
    pub priority: Option<i64>,
    /// Lineage entries. NOTE(review): not read anywhere in this file —
    /// presumably ids of parent artifacts; confirm against producers.
    pub lineage: Vec<String>,
    /// Relevance score used as a late tiebreak during selection (higher first).
    pub relevance: Option<f64>,
    /// Estimated token cost; `normalize` derives it as `ceil(text.len() / 4)`
    /// when absent and text is present.
    pub estimated_tokens: Option<usize>,
    /// Stage name, matched against the policy's `include_stages`.
    pub stage: Option<String>,
    /// Free-form metadata. This file reads the `changed_paths` and
    /// `evidence_paths` keys as string lists.
    pub metadata: BTreeMap<String, serde_json::Value>,
}
192
193impl ArtifactRecord {
194    /// Apply the unified redaction policy in place. Hosts that opt into
195    /// redacted artifact persistence call this before writing to a run
196    /// record; the policy strips known secret patterns from `text` and
197    /// scrubs auth-shaped fields nested under `data` and `metadata`.
198    pub fn redact_in_place(&mut self, policy: &crate::redact::RedactionPolicy) {
199        if let Some(text) = self.text.as_mut() {
200            let scrubbed = policy.redact_string(text);
201            if let std::borrow::Cow::Owned(replacement) = scrubbed {
202                *text = replacement;
203            }
204        }
205        if let Some(data) = self.data.as_mut() {
206            policy.redact_json_in_place(data);
207        }
208        for (key, value) in self.metadata.iter_mut() {
209            if policy.field_is_sensitive(key) {
210                *value = serde_json::Value::String(crate::redact::REDACTED_PLACEHOLDER.to_string());
211            } else {
212                policy.redact_json_in_place(value);
213            }
214        }
215    }
216
217    pub fn normalize(mut self) -> Self {
218        if self.type_name.is_empty() {
219            self.type_name = "artifact".to_string();
220        }
221        if self.id.is_empty() {
222            self.id = new_id("artifact");
223        }
224        if self.created_at.is_empty() {
225            self.created_at = now_rfc3339();
226        }
227        if self.kind.is_empty() {
228            self.kind = "artifact".to_string();
229        }
230        self.kind = normalize_artifact_kind(&self.kind);
231        if self.estimated_tokens.is_none() {
232            self.estimated_tokens = self
233                .text
234                .as_ref()
235                .map(|text| ((text.len() as f64) / 4.0).ceil() as usize);
236        }
237        if self.priority.is_none() {
238            self.priority = Some(default_artifact_priority(&self.kind));
239        }
240        self
241    }
242}
243
244pub fn select_artifacts(
245    mut artifacts: Vec<ArtifactRecord>,
246    policy: &ContextPolicy,
247) -> Vec<ArtifactRecord> {
248    artifacts.retain(|artifact| {
249        (policy.include_kinds.is_empty() || policy.include_kinds.contains(&artifact.kind))
250            && !policy.exclude_kinds.contains(&artifact.kind)
251            && (policy.include_stages.is_empty()
252                || artifact
253                    .stage
254                    .as_ref()
255                    .is_some_and(|stage| policy.include_stages.contains(stage)))
256    });
257    artifacts.sort_by(|a, b| {
258        let b_pinned = policy.pinned_ids.contains(&b.id);
259        let a_pinned = policy.pinned_ids.contains(&a.id);
260        b_pinned
261            .cmp(&a_pinned)
262            .then_with(|| {
263                let b_prio_kind = policy.prioritize_kinds.contains(&b.kind);
264                let a_prio_kind = policy.prioritize_kinds.contains(&a.kind);
265                b_prio_kind.cmp(&a_prio_kind)
266            })
267            .then_with(|| {
268                b.priority
269                    .unwrap_or_default()
270                    .cmp(&a.priority.unwrap_or_default())
271            })
272            .then_with(|| {
273                if policy.prefer_fresh {
274                    freshness_rank(b.freshness.as_deref())
275                        .cmp(&freshness_rank(a.freshness.as_deref()))
276                } else {
277                    std::cmp::Ordering::Equal
278                }
279            })
280            .then_with(|| {
281                if policy.prefer_recent {
282                    b.created_at.cmp(&a.created_at)
283                } else {
284                    std::cmp::Ordering::Equal
285                }
286            })
287            .then_with(|| {
288                b.relevance
289                    .partial_cmp(&a.relevance)
290                    .unwrap_or(std::cmp::Ordering::Equal)
291            })
292            .then_with(|| {
293                a.estimated_tokens
294                    .unwrap_or(usize::MAX)
295                    .cmp(&b.estimated_tokens.unwrap_or(usize::MAX))
296            })
297    });
298
299    let mut selected = Vec::new();
300    let mut used_tokens = 0usize;
301    let reserve_tokens = policy.reserve_tokens.unwrap_or(0);
302    let effective_max_tokens = policy
303        .max_tokens
304        .map(|max| max.saturating_sub(reserve_tokens));
305    for artifact in artifacts {
306        if let Some(max_artifacts) = policy.max_artifacts {
307            if selected.len() >= max_artifacts {
308                break;
309            }
310        }
311        let next_tokens = artifact.estimated_tokens.unwrap_or(0);
312        if let Some(max_tokens) = effective_max_tokens {
313            if used_tokens + next_tokens > max_tokens {
314                continue;
315            }
316        }
317        used_tokens += next_tokens;
318        selected.push(artifact);
319    }
320    selected
321}
322
323pub fn render_artifacts_context(artifacts: &[ArtifactRecord], policy: &ContextPolicy) -> String {
324    let mut parts = Vec::new();
325    for artifact in artifacts {
326        let title = artifact
327            .title
328            .clone()
329            .unwrap_or_else(|| format!("{} {}", artifact.kind, artifact.id));
330        let body = artifact
331            .text
332            .clone()
333            .or_else(|| artifact.data.as_ref().map(|v| v.to_string()))
334            .unwrap_or_default();
335        match policy.render.as_deref() {
336            Some("json") => {
337                parts.push(
338                    serde_json::json!({
339                        "id": artifact.id,
340                        "kind": artifact.kind,
341                        "title": title,
342                        "source": artifact.source,
343                        "freshness": artifact.freshness,
344                        "priority": artifact.priority,
345                        "text": body,
346                    })
347                    .to_string(),
348                );
349            }
350            _ => parts.push(format!(
351                "<artifact>\n<title>{}</title>\n<kind>{}</kind>\n<source>{}</source>\n\
352<freshness>{}</freshness>\n<priority>{}</priority>\n<body>\n{}\n</body>\n</artifact>",
353                escape_xml_text(&title),
354                escape_xml_text(&artifact.kind),
355                escape_xml_text(
356                    artifact
357                        .source
358                        .clone()
359                        .unwrap_or_else(|| "unknown".to_string())
360                        .as_str(),
361                ),
362                escape_xml_text(
363                    artifact
364                        .freshness
365                        .clone()
366                        .unwrap_or_else(|| "normal".to_string())
367                        .as_str(),
368                ),
369                artifact.priority.unwrap_or_default(),
370                body
371            )),
372        }
373    }
374    parts.join("\n\n")
375}
376
/// Result of selecting artifacts for a workflow stage, as deserialized from
/// the reply to `select_workflow_stage_artifacts`.
#[derive(Clone, Debug, Deserialize, PartialEq)]
pub struct SelectedWorkflowStageArtifacts {
    /// The artifacts chosen for the stage.
    pub artifacts: Vec<ArtifactRecord>,
    /// The context policy returned alongside the selection.
    /// NOTE(review): presumably the effective policy after the stage's input
    /// contract was applied — confirm against the Harn-side implementation.
    pub context_policy: ContextPolicy,
}
382
383pub async fn select_workflow_stage_artifacts(
384    ctx: &crate::vm::AsyncBuiltinCtx,
385    artifacts: &[ArtifactRecord],
386    context_policy: &ContextPolicy,
387    input_contract: &StageContract,
388) -> Result<SelectedWorkflowStageArtifacts, crate::value::VmError> {
389    let payload = serde_json::json!({
390        "artifacts": artifacts,
391        "context_policy": context_policy,
392        "input_contract": input_contract,
393    });
394    let mut selected: SelectedWorkflowStageArtifacts = call_harn_export_typed(
395        ctx,
396        "std/workflow/context",
397        "workflow_select_stage_artifacts",
398        "workflow_select_stage_artifacts",
399        payload,
400    )
401    .await?;
402    selected.artifacts = selected
403        .artifacts
404        .into_iter()
405        .map(ArtifactRecord::normalize)
406        .collect();
407    Ok(selected)
408}
409
/// The three string fields extracted from the reply to
/// `prepare_workflow_stage_prompt`.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct PreparedWorkflowStagePrompt {
    /// Value of the reply's `prompt` field.
    pub prompt: String,
    /// Value of the reply's `rendered_context` field.
    pub rendered_context: String,
    /// Value of the reply's `rendered_verification` field.
    pub rendered_verification: String,
}
416
417pub async fn prepare_workflow_stage_prompt(
418    ctx: &crate::vm::AsyncBuiltinCtx,
419    task: &str,
420    task_label: Option<&str>,
421    artifacts: &[ArtifactRecord],
422    context_policy: &ContextPolicy,
423    rendered_context: Option<&str>,
424    verification_contracts: &[VerificationContract],
425) -> Result<PreparedWorkflowStagePrompt, crate::value::VmError> {
426    let payload = serde_json::json!({
427        "task": task,
428        "task_label": task_label,
429        "artifacts": artifacts,
430        "context_policy": context_policy,
431        "rendered_context": rendered_context,
432        "verification_contracts": verification_contracts,
433    });
434    let prepared = call_harn_export_json(
435        ctx,
436        "std/workflow/prompts",
437        "workflow_prepare_stage_prompt",
438        "workflow_prepare_stage_prompt",
439        payload,
440    )
441    .await?;
442    let prepared = prepared.as_object().ok_or_else(|| {
443        crate::value::VmError::Runtime(
444            "workflow_prepare_stage_prompt must return a dict".to_string(),
445        )
446    })?;
447    Ok(PreparedWorkflowStagePrompt {
448        prompt: workflow_prompt_string_field(prepared, "prompt")?,
449        rendered_context: workflow_prompt_string_field(prepared, "rendered_context")?,
450        rendered_verification: workflow_prompt_string_field(prepared, "rendered_verification")?,
451    })
452}
453
454fn workflow_prompt_string_field(
455    value: &serde_json::Map<String, serde_json::Value>,
456    field: &str,
457) -> Result<String, crate::value::VmError> {
458    value
459        .get(field)
460        .and_then(serde_json::Value::as_str)
461        .map(ToOwned::to_owned)
462        .ok_or_else(|| {
463            crate::value::VmError::Runtime(format!(
464                "workflow_prepare_stage_prompt must return string field '{field}'"
465            ))
466        })
467}
468
469pub fn normalize_artifact(
470    value: &crate::value::VmValue,
471) -> Result<ArtifactRecord, crate::value::VmError> {
472    let artifact: ArtifactRecord = super::parse_json_value(value)?;
473    let artifact = artifact.normalize();
474    if artifact.kind == "handoff" {
475        let json = serde_json::to_value(&artifact).map_err(|error| {
476            crate::value::VmError::Runtime(format!("artifact handoff encode error: {error}"))
477        })?;
478        let handoff = handoff_from_json_value(&json)
479            .or_else(|| {
480                artifact
481                    .data
482                    .as_ref()
483                    .and_then(|data| normalize_handoff_artifact_json(data.clone()).ok())
484            })
485            .ok_or_else(|| {
486                crate::value::VmError::Runtime(
487                    "artifact handoff data must contain a valid handoff payload".to_string(),
488                )
489            })?;
490        return Ok(handoff_artifact_record(&handoff, Some(&artifact)));
491    }
492    Ok(artifact)
493}