Skip to main content

harn_vm/orchestration/
artifacts.rs

1//! Artifact types, normalization, selection, and context rendering.
2
3use std::collections::{BTreeMap, BTreeSet};
4
5use serde::{Deserialize, Serialize};
6
7use crate::stdlib::harn_entry::{call_harn_export_json, call_harn_export_typed};
8
9use super::{
10    handoff_artifact_record, handoff_from_json_value, microcompact_tool_output, new_id,
11    normalize_handoff_artifact_json, now_rfc3339, ContextPolicy, StageContract,
12    VerificationContract,
13};
14
15/// Snip an artifact's text to fit within a token budget.
16pub fn microcompact_artifact(artifact: &mut ArtifactRecord, max_tokens: usize) {
17    let max_chars = max_tokens * 4;
18    if let Some(ref text) = artifact.text {
19        if text.len() > max_chars && max_chars >= 200 {
20            artifact.text = Some(microcompact_tool_output(text, max_chars));
21            artifact.estimated_tokens = Some(max_tokens);
22        }
23    }
24}
25
26/// Deduplicate artifacts by removing those with identical text content,
27/// keeping the one with higher priority.
28pub fn dedup_artifacts(artifacts: &mut Vec<ArtifactRecord>) {
29    let mut seen_hashes: BTreeSet<u64> = BTreeSet::new();
30    artifacts.retain(|artifact| {
31        let text = artifact.text.as_deref().unwrap_or("");
32        if text.is_empty() {
33            return true;
34        }
35        let hash = {
36            use std::hash::{Hash, Hasher};
37            let mut hasher = std::collections::hash_map::DefaultHasher::new();
38            text.hash(&mut hasher);
39            hasher.finish()
40        };
41        seen_hashes.insert(hash)
42    });
43}
44
45/// Enhanced artifact selection: dedup, microcompact oversized artifacts,
46/// then delegate to the standard `select_artifacts`.
47pub fn select_artifacts_adaptive(
48    mut artifacts: Vec<ArtifactRecord>,
49    policy: &ContextPolicy,
50) -> Vec<ArtifactRecord> {
51    drop_stale_evidence_artifacts(&mut artifacts);
52    dedup_artifacts(&mut artifacts);
53
54    // Cap individual artifacts to a fraction of the total budget, with a 500-token
55    // floor but never exceeding the total (so a single artifact can't overrun).
56    if let Some(max_tokens) = policy.max_tokens {
57        let count = artifacts.len().max(1);
58        let per_artifact_budget = max_tokens / count;
59        let cap = per_artifact_budget.max(500).min(max_tokens);
60        for artifact in &mut artifacts {
61            let est = artifact.estimated_tokens.unwrap_or(0);
62            if est > cap * 2 {
63                microcompact_artifact(artifact, cap);
64            }
65        }
66    }
67
68    select_artifacts(artifacts, policy)
69}
70
71fn metadata_string_list(artifact: &ArtifactRecord, key: &str) -> Vec<String> {
72    artifact
73        .metadata
74        .get(key)
75        .and_then(|value| value.as_array())
76        .map(|items| {
77            items
78                .iter()
79                .filter_map(|item| item.as_str())
80                .map(str::trim)
81                .filter(|value| !value.is_empty())
82                .map(ToOwned::to_owned)
83                .collect::<Vec<_>>()
84        })
85        .unwrap_or_default()
86}
87
88fn drop_stale_evidence_artifacts(artifacts: &mut Vec<ArtifactRecord>) {
89    let fresh_changed_paths: BTreeSet<String> = artifacts
90        .iter()
91        .filter(|artifact| freshness_rank(artifact.freshness.as_deref()) >= 2)
92        .flat_map(|artifact| metadata_string_list(artifact, "changed_paths"))
93        .collect();
94    if fresh_changed_paths.is_empty() {
95        return;
96    }
97
98    artifacts.retain(|artifact| {
99        let evidence_paths = metadata_string_list(artifact, "evidence_paths");
100        if evidence_paths.is_empty() {
101            return true;
102        }
103        if freshness_rank(artifact.freshness.as_deref()) >= 2 {
104            return true;
105        }
106        !evidence_paths
107            .iter()
108            .any(|path| fresh_changed_paths.contains(path))
109    });
110}
111
/// Map an artifact kind to its canonical spelling.
///
/// Short aliases are expanded (`file` → `workspace_file`, `transcript` →
/// `transcript_summary`, `verification` → `verification_result`, `test` →
/// `test_result`), and an empty or whitespace-only kind becomes `artifact`.
/// Every other value is returned unchanged. That covers the canonical kinds
/// (`resource`, `handoff`, `workspace_file`, `editor_selection`,
/// `workspace_snapshot`, `transcript_summary`, `summary`, `plan`, `diff`,
/// `git_diff`, `patch`, `patch_set`, `patch_proposal`, `diff_review`,
/// `review_decision`, `verification_bundle`, `apply_intent`,
/// `verification_result`, `test_result`, `command_result`,
/// `provider_payload`, `worker_result`, `worker_notification`, `artifact`)
/// as well as any kind not listed here.
fn normalize_artifact_kind(kind: &str) -> String {
    let canonical = match kind {
        "file" => "workspace_file",
        "transcript" => "transcript_summary",
        "verification" => "verification_result",
        "test" => "test_result",
        blank if blank.trim().is_empty() => "artifact",
        unchanged => unchanged,
    };
    canonical.to_owned()
}
146
/// Default selection priority for an artifact kind; higher values rank first.
///
/// Kinds that do not appear in the table get priority 40.
fn default_artifact_priority(kind: &str) -> i64 {
    const PRIORITY_TABLE: &[(&[&str], i64)] = &[
        (&["verification_result", "test_result"], 100),
        (&["verification_bundle"], 95),
        (&["handoff"], 92),
        (
            &[
                "diff",
                "git_diff",
                "patch",
                "patch_set",
                "patch_proposal",
                "diff_review",
                "review_decision",
                "apply_intent",
            ],
            90,
        ),
        (&["plan"], 80),
        (
            &[
                "workspace_file",
                "workspace_snapshot",
                "editor_selection",
                "resource",
            ],
            70,
        ),
        (&["summary", "transcript_summary"], 60),
        (&["command_result"], 50),
    ];

    for (kinds, priority) in PRIORITY_TABLE {
        if kinds.iter().any(|known| *known == kind) {
            return *priority;
        }
    }
    40
}
161
/// Numeric rank of a freshness label; larger means fresher.
///
/// `fresh` and `live` rank 3, `recent` ranks 2, `stale` ranks 0. A missing
/// label, or any label not listed here, ranks 1.
fn freshness_rank(value: Option<&str>) -> i64 {
    match value {
        Some("fresh") | Some("live") => 3,
        Some("recent") => 2,
        Some("stale") => 0,
        Some(_) | None => 1,
    }
}
170
/// A single artifact considered for inclusion in prompt context.
///
/// Every field is optional on input (`#[serde(default)]`); call
/// [`ArtifactRecord::normalize`] to fill in the type name, id, timestamp,
/// kind, token estimate, and priority.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct ArtifactRecord {
    /// Record type tag, serialized as `_type`; `normalize` sets it to
    /// `"artifact"` when empty.
    #[serde(rename = "_type")]
    pub type_name: String,
    /// Identifier; `normalize` generates one when empty. Matched against the
    /// policy's pinned ids during selection.
    pub id: String,
    /// Artifact kind; canonicalized by `normalize`.
    pub kind: String,
    /// Display title; rendering falls back to `"<kind> <id>"` when absent.
    pub title: Option<String>,
    /// Text body; used for deduplication, token estimation, and rendering.
    pub text: Option<String>,
    /// Structured payload; rendered as JSON text when `text` is absent.
    pub data: Option<serde_json::Value>,
    /// Origin label; rendered as `unknown` in the tagged format when absent.
    pub source: Option<String>,
    /// Creation timestamp; `normalize` fills it from `now_rfc3339()` when
    /// empty. Compared as a plain string when the policy prefers recent items.
    pub created_at: String,
    /// Freshness label, ranked by `freshness_rank`.
    pub freshness: Option<String>,
    /// Selection priority; `normalize` derives a default from the kind.
    pub priority: Option<i64>,
    /// Lineage entries; not interpreted by the code in this section.
    pub lineage: Vec<String>,
    /// Relevance score, used as a late tie-breaker during selection.
    pub relevance: Option<f64>,
    /// Estimated token count; `normalize` derives it as the text's byte
    /// length divided by four, rounded up.
    pub estimated_tokens: Option<usize>,
    /// Stage label, matched against the policy's `include_stages`.
    pub stage: Option<String>,
    /// Free-form metadata; the `changed_paths` and `evidence_paths` keys are
    /// read when dropping stale evidence artifacts.
    pub metadata: BTreeMap<String, serde_json::Value>,
}
191
192impl ArtifactRecord {
193    /// Apply the unified redaction policy in place. Hosts that opt into
194    /// redacted artifact persistence call this before writing to a run
195    /// record; the policy strips known secret patterns from `text` and
196    /// scrubs auth-shaped fields nested under `data` and `metadata`.
197    pub fn redact_in_place(&mut self, policy: &crate::redact::RedactionPolicy) {
198        if let Some(text) = self.text.as_mut() {
199            let scrubbed = policy.redact_string(text);
200            if let std::borrow::Cow::Owned(replacement) = scrubbed {
201                *text = replacement;
202            }
203        }
204        if let Some(data) = self.data.as_mut() {
205            policy.redact_json_in_place(data);
206        }
207        for (key, value) in self.metadata.iter_mut() {
208            if policy.field_is_sensitive(key) {
209                *value = serde_json::Value::String(crate::redact::REDACTED_PLACEHOLDER.to_string());
210            } else {
211                policy.redact_json_in_place(value);
212            }
213        }
214    }
215
216    pub fn normalize(mut self) -> Self {
217        if self.type_name.is_empty() {
218            self.type_name = "artifact".to_string();
219        }
220        if self.id.is_empty() {
221            self.id = new_id("artifact");
222        }
223        if self.created_at.is_empty() {
224            self.created_at = now_rfc3339();
225        }
226        if self.kind.is_empty() {
227            self.kind = "artifact".to_string();
228        }
229        self.kind = normalize_artifact_kind(&self.kind);
230        if self.estimated_tokens.is_none() {
231            self.estimated_tokens = self
232                .text
233                .as_ref()
234                .map(|text| ((text.len() as f64) / 4.0).ceil() as usize);
235        }
236        if self.priority.is_none() {
237            self.priority = Some(default_artifact_priority(&self.kind));
238        }
239        self
240    }
241}
242
243pub fn select_artifacts(
244    mut artifacts: Vec<ArtifactRecord>,
245    policy: &ContextPolicy,
246) -> Vec<ArtifactRecord> {
247    artifacts.retain(|artifact| {
248        (policy.include_kinds.is_empty() || policy.include_kinds.contains(&artifact.kind))
249            && !policy.exclude_kinds.contains(&artifact.kind)
250            && (policy.include_stages.is_empty()
251                || artifact
252                    .stage
253                    .as_ref()
254                    .is_some_and(|stage| policy.include_stages.contains(stage)))
255    });
256    artifacts.sort_by(|a, b| {
257        let b_pinned = policy.pinned_ids.contains(&b.id);
258        let a_pinned = policy.pinned_ids.contains(&a.id);
259        b_pinned
260            .cmp(&a_pinned)
261            .then_with(|| {
262                let b_prio_kind = policy.prioritize_kinds.contains(&b.kind);
263                let a_prio_kind = policy.prioritize_kinds.contains(&a.kind);
264                b_prio_kind.cmp(&a_prio_kind)
265            })
266            .then_with(|| {
267                b.priority
268                    .unwrap_or_default()
269                    .cmp(&a.priority.unwrap_or_default())
270            })
271            .then_with(|| {
272                if policy.prefer_fresh {
273                    freshness_rank(b.freshness.as_deref())
274                        .cmp(&freshness_rank(a.freshness.as_deref()))
275                } else {
276                    std::cmp::Ordering::Equal
277                }
278            })
279            .then_with(|| {
280                if policy.prefer_recent {
281                    b.created_at.cmp(&a.created_at)
282                } else {
283                    std::cmp::Ordering::Equal
284                }
285            })
286            .then_with(|| {
287                b.relevance
288                    .partial_cmp(&a.relevance)
289                    .unwrap_or(std::cmp::Ordering::Equal)
290            })
291            .then_with(|| {
292                a.estimated_tokens
293                    .unwrap_or(usize::MAX)
294                    .cmp(&b.estimated_tokens.unwrap_or(usize::MAX))
295            })
296    });
297
298    let mut selected = Vec::new();
299    let mut used_tokens = 0usize;
300    let reserve_tokens = policy.reserve_tokens.unwrap_or(0);
301    let effective_max_tokens = policy
302        .max_tokens
303        .map(|max| max.saturating_sub(reserve_tokens));
304    for artifact in artifacts {
305        if let Some(max_artifacts) = policy.max_artifacts {
306            if selected.len() >= max_artifacts {
307                break;
308            }
309        }
310        let next_tokens = artifact.estimated_tokens.unwrap_or(0);
311        if let Some(max_tokens) = effective_max_tokens {
312            if used_tokens + next_tokens > max_tokens {
313                continue;
314            }
315        }
316        used_tokens += next_tokens;
317        selected.push(artifact);
318    }
319    selected
320}
321
322pub fn render_artifacts_context(artifacts: &[ArtifactRecord], policy: &ContextPolicy) -> String {
323    let mut parts = Vec::new();
324    for artifact in artifacts {
325        let title = artifact
326            .title
327            .clone()
328            .unwrap_or_else(|| format!("{} {}", artifact.kind, artifact.id));
329        let body = artifact
330            .text
331            .clone()
332            .or_else(|| artifact.data.as_ref().map(|v| v.to_string()))
333            .unwrap_or_default();
334        match policy.render.as_deref() {
335            Some("json") => {
336                parts.push(
337                    serde_json::json!({
338                        "id": artifact.id,
339                        "kind": artifact.kind,
340                        "title": title,
341                        "source": artifact.source,
342                        "freshness": artifact.freshness,
343                        "priority": artifact.priority,
344                        "text": body,
345                    })
346                    .to_string(),
347                );
348            }
349            _ => parts.push(format!(
350                "<artifact>\n<title>{}</title>\n<kind>{}</kind>\n<source>{}</source>\n\
351<freshness>{}</freshness>\n<priority>{}</priority>\n<body>\n{}\n</body>\n</artifact>",
352                escape_prompt_text(&title),
353                escape_prompt_text(&artifact.kind),
354                escape_prompt_text(
355                    artifact
356                        .source
357                        .clone()
358                        .unwrap_or_else(|| "unknown".to_string())
359                        .as_str(),
360                ),
361                escape_prompt_text(
362                    artifact
363                        .freshness
364                        .clone()
365                        .unwrap_or_else(|| "normal".to_string())
366                        .as_str(),
367                ),
368                artifact.priority.unwrap_or_default(),
369                body
370            )),
371        }
372    }
373    parts.join("\n\n")
374}
375
/// Result of `select_workflow_stage_artifacts`, deserialized from the value
/// returned by the `workflow_select_stage_artifacts` export.
#[derive(Clone, Debug, Deserialize, PartialEq)]
pub struct SelectedWorkflowStageArtifacts {
    /// Artifacts returned by the export; `select_workflow_stage_artifacts`
    /// normalizes each one before handing the result to its caller.
    pub artifacts: Vec<ArtifactRecord>,
    /// Context policy returned by the export (presumably the effective policy
    /// for the stage — confirm against the export's implementation).
    pub context_policy: ContextPolicy,
}
381
382pub async fn select_workflow_stage_artifacts(
383    artifacts: &[ArtifactRecord],
384    context_policy: &ContextPolicy,
385    input_contract: &StageContract,
386) -> Result<SelectedWorkflowStageArtifacts, crate::value::VmError> {
387    let payload = serde_json::json!({
388        "artifacts": artifacts,
389        "context_policy": context_policy,
390        "input_contract": input_contract,
391    });
392    let mut selected: SelectedWorkflowStageArtifacts = call_harn_export_typed(
393        "std/workflow/context",
394        "workflow_select_stage_artifacts",
395        "workflow_select_stage_artifacts",
396        payload,
397    )
398    .await?;
399    selected.artifacts = selected
400        .artifacts
401        .into_iter()
402        .map(ArtifactRecord::normalize)
403        .collect();
404    Ok(selected)
405}
406
/// Result of `prepare_workflow_stage_prompt`; each field is copied from the
/// string field of the same name in the `workflow_prepare_stage_prompt`
/// export's response.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct PreparedWorkflowStagePrompt {
    /// The `prompt` field of the response.
    pub prompt: String,
    /// The `rendered_context` field of the response.
    pub rendered_context: String,
    /// The `rendered_verification` field of the response.
    pub rendered_verification: String,
}
413
414pub async fn prepare_workflow_stage_prompt(
415    task: &str,
416    task_label: Option<&str>,
417    artifacts: &[ArtifactRecord],
418    context_policy: &ContextPolicy,
419    rendered_context: Option<&str>,
420    verification_contracts: &[VerificationContract],
421) -> Result<PreparedWorkflowStagePrompt, crate::value::VmError> {
422    let payload = serde_json::json!({
423        "task": task,
424        "task_label": task_label,
425        "artifacts": artifacts,
426        "context_policy": context_policy,
427        "rendered_context": rendered_context,
428        "verification_contracts": verification_contracts,
429    });
430    let prepared = call_harn_export_json(
431        "std/workflow/prompts",
432        "workflow_prepare_stage_prompt",
433        "workflow_prepare_stage_prompt",
434        payload,
435    )
436    .await?;
437    let prepared = prepared.as_object().ok_or_else(|| {
438        crate::value::VmError::Runtime(
439            "workflow_prepare_stage_prompt must return a dict".to_string(),
440        )
441    })?;
442    Ok(PreparedWorkflowStagePrompt {
443        prompt: workflow_prompt_string_field(prepared, "prompt")?,
444        rendered_context: workflow_prompt_string_field(prepared, "rendered_context")?,
445        rendered_verification: workflow_prompt_string_field(prepared, "rendered_verification")?,
446    })
447}
448
449fn workflow_prompt_string_field(
450    value: &serde_json::Map<String, serde_json::Value>,
451    field: &str,
452) -> Result<String, crate::value::VmError> {
453    value
454        .get(field)
455        .and_then(serde_json::Value::as_str)
456        .map(ToOwned::to_owned)
457        .ok_or_else(|| {
458            crate::value::VmError::Runtime(format!(
459                "workflow_prepare_stage_prompt must return string field '{field}'"
460            ))
461        })
462}
463
/// Escape `&`, `<`, and `>` as `&amp;`, `&lt;`, and `&gt;` so the text cannot
/// be mistaken for markup inside the tagged artifact rendering.
fn escape_prompt_text(text: &str) -> String {
    let mut escaped = String::with_capacity(text.len());
    for ch in text.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            other => escaped.push(other),
        }
    }
    escaped
}
469
470pub fn normalize_artifact(
471    value: &crate::value::VmValue,
472) -> Result<ArtifactRecord, crate::value::VmError> {
473    let artifact: ArtifactRecord = super::parse_json_value(value)?;
474    let artifact = artifact.normalize();
475    if artifact.kind == "handoff" {
476        let json = serde_json::to_value(&artifact).map_err(|error| {
477            crate::value::VmError::Runtime(format!("artifact handoff encode error: {error}"))
478        })?;
479        let handoff = handoff_from_json_value(&json)
480            .or_else(|| {
481                artifact
482                    .data
483                    .as_ref()
484                    .and_then(|data| normalize_handoff_artifact_json(data.clone()).ok())
485            })
486            .ok_or_else(|| {
487                crate::value::VmError::Runtime(
488                    "artifact handoff data must contain a valid handoff payload".to_string(),
489                )
490            })?;
491        return Ok(handoff_artifact_record(&handoff, Some(&artifact)));
492    }
493    Ok(artifact)
494}