Skip to main content

harn_vm/orchestration/
artifacts.rs

1//! Artifact types, normalization, selection, and context rendering.
2
3use std::collections::{BTreeMap, BTreeSet};
4
5use serde::{Deserialize, Serialize};
6
7use crate::stdlib::harn_entry::{call_harn_export_json, call_harn_export_typed};
8
9use super::{
10    handoff_artifact_record, handoff_from_json_value, microcompact_tool_output, new_id,
11    normalize_handoff_artifact_json, now_rfc3339, ContextPolicy, StageContract,
12    VerificationContract,
13};
14
15/// Snip an artifact's text to fit within a token budget.
16pub fn microcompact_artifact(artifact: &mut ArtifactRecord, max_tokens: usize) {
17    let max_chars = max_tokens * 4;
18    if let Some(ref text) = artifact.text {
19        if text.len() > max_chars && max_chars >= 200 {
20            artifact.text = Some(microcompact_tool_output(text, max_chars));
21            artifact.estimated_tokens = Some(max_tokens);
22        }
23    }
24}
25
26/// Deduplicate artifacts by removing those with identical text content,
27/// keeping the one with higher priority.
28pub fn dedup_artifacts(artifacts: &mut Vec<ArtifactRecord>) {
29    let mut seen_hashes: BTreeSet<u64> = BTreeSet::new();
30    artifacts.retain(|artifact| {
31        let text = artifact.text.as_deref().unwrap_or("");
32        if text.is_empty() {
33            return true;
34        }
35        let hash = {
36            use std::hash::{Hash, Hasher};
37            let mut hasher = std::collections::hash_map::DefaultHasher::new();
38            text.hash(&mut hasher);
39            hasher.finish()
40        };
41        seen_hashes.insert(hash)
42    });
43}
44
45/// Enhanced artifact selection: dedup, microcompact oversized artifacts,
46/// then delegate to the standard `select_artifacts`.
47pub fn select_artifacts_adaptive(
48    mut artifacts: Vec<ArtifactRecord>,
49    policy: &ContextPolicy,
50) -> Vec<ArtifactRecord> {
51    drop_stale_evidence_artifacts(&mut artifacts);
52    dedup_artifacts(&mut artifacts);
53
54    // Cap individual artifacts to a fraction of the total budget, with a 500-token
55    // floor but never exceeding the total (so a single artifact can't overrun).
56    if let Some(max_tokens) = policy.max_tokens {
57        let count = artifacts.len().max(1);
58        let per_artifact_budget = max_tokens / count;
59        let cap = per_artifact_budget.max(500).min(max_tokens);
60        for artifact in &mut artifacts {
61            let est = artifact.estimated_tokens.unwrap_or(0);
62            if est > cap * 2 {
63                microcompact_artifact(artifact, cap);
64            }
65        }
66    }
67
68    select_artifacts(artifacts, policy)
69}
70
71fn metadata_string_list(artifact: &ArtifactRecord, key: &str) -> Vec<String> {
72    artifact
73        .metadata
74        .get(key)
75        .and_then(|value| value.as_array())
76        .map(|items| {
77            items
78                .iter()
79                .filter_map(|item| item.as_str())
80                .map(str::trim)
81                .filter(|value| !value.is_empty())
82                .map(ToOwned::to_owned)
83                .collect::<Vec<_>>()
84        })
85        .unwrap_or_default()
86}
87
88fn drop_stale_evidence_artifacts(artifacts: &mut Vec<ArtifactRecord>) {
89    let fresh_changed_paths: BTreeSet<String> = artifacts
90        .iter()
91        .filter(|artifact| freshness_rank(artifact.freshness.as_deref()) >= 2)
92        .flat_map(|artifact| metadata_string_list(artifact, "changed_paths"))
93        .collect();
94    if fresh_changed_paths.is_empty() {
95        return;
96    }
97
98    artifacts.retain(|artifact| {
99        let evidence_paths = metadata_string_list(artifact, "evidence_paths");
100        if evidence_paths.is_empty() {
101            return true;
102        }
103        if freshness_rank(artifact.freshness.as_deref()) >= 2 {
104            return true;
105        }
106        !evidence_paths
107            .iter()
108            .any(|path| fresh_changed_paths.contains(path))
109    });
110}
111
/// Map an artifact kind to its canonical spelling.
///
/// Short aliases (`file`, `transcript`, `verification`, `test`) are expanded
/// to their canonical names, a blank or whitespace-only kind becomes
/// `"artifact"`, and every other value, canonical or unknown, is returned
/// unchanged (no trimming is applied).
fn normalize_artifact_kind(kind: &str) -> String {
    /// Kinds that are already canonical and pass through untouched.
    const CANONICAL_KINDS: &[&str] = &[
        "resource",
        "handoff",
        "workspace_file",
        "editor_selection",
        "workspace_snapshot",
        "transcript_summary",
        "summary",
        "plan",
        "diff",
        "git_diff",
        "patch",
        "patch_set",
        "patch_proposal",
        "diff_review",
        "review_decision",
        "verification_bundle",
        "apply_intent",
        "verification_result",
        "test_result",
        "command_result",
        "provider_payload",
        "worker_result",
        "worker_notification",
        "artifact",
    ];

    let canonical = match kind {
        "file" => "workspace_file",
        "transcript" => "transcript_summary",
        "verification" => "verification_result",
        "test" => "test_result",
        _ if CANONICAL_KINDS.contains(&kind) => kind,
        _ if kind.trim().is_empty() => "artifact",
        // Unknown kinds are preserved verbatim.
        _ => kind,
    };
    canonical.to_owned()
}
146
/// Default sort priority for an artifact kind (higher sorts first).
///
/// Kinds not listed in any tier get a priority of 40.
fn default_artifact_priority(kind: &str) -> i64 {
    /// Priority tiers, highest first; each kind appears in exactly one tier.
    const TIERS: &[(i64, &[&str])] = &[
        (100, &["verification_result", "test_result"]),
        (95, &["verification_bundle"]),
        (92, &["handoff"]),
        (
            90,
            &[
                "diff",
                "git_diff",
                "patch",
                "patch_set",
                "patch_proposal",
                "diff_review",
                "review_decision",
                "apply_intent",
            ],
        ),
        (80, &["plan"]),
        (
            70,
            &[
                "workspace_file",
                "workspace_snapshot",
                "editor_selection",
                "resource",
            ],
        ),
        (60, &["summary", "transcript_summary"]),
        (50, &["command_result"]),
    ];

    TIERS
        .iter()
        .find(|(_, kinds)| kinds.contains(&kind))
        .map_or(40, |(priority, _)| *priority)
}
161
/// Convert a freshness label into a comparable rank (higher is fresher).
///
/// `fresh`/`live` rank 3, `recent` ranks 2, `stale` ranks 0, and anything
/// else, including a missing label, ranks 1.
fn freshness_rank(value: Option<&str>) -> i64 {
    match value {
        Some("fresh" | "live") => 3,
        Some("recent") => 2,
        Some("stale") => 0,
        Some(_) | None => 1,
    }
}
170
171#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
172#[serde(default)]
173pub struct ArtifactRecord {
174    #[serde(rename = "_type")]
175    pub type_name: String,
176    pub id: String,
177    pub kind: String,
178    pub title: Option<String>,
179    pub text: Option<String>,
180    pub data: Option<serde_json::Value>,
181    pub source: Option<String>,
182    pub created_at: String,
183    pub freshness: Option<String>,
184    pub priority: Option<i64>,
185    pub lineage: Vec<String>,
186    pub relevance: Option<f64>,
187    pub estimated_tokens: Option<usize>,
188    pub stage: Option<String>,
189    pub metadata: BTreeMap<String, serde_json::Value>,
190}
191
192impl ArtifactRecord {
193    pub fn normalize(mut self) -> Self {
194        if self.type_name.is_empty() {
195            self.type_name = "artifact".to_string();
196        }
197        if self.id.is_empty() {
198            self.id = new_id("artifact");
199        }
200        if self.created_at.is_empty() {
201            self.created_at = now_rfc3339();
202        }
203        if self.kind.is_empty() {
204            self.kind = "artifact".to_string();
205        }
206        self.kind = normalize_artifact_kind(&self.kind);
207        if self.estimated_tokens.is_none() {
208            self.estimated_tokens = self
209                .text
210                .as_ref()
211                .map(|text| ((text.len() as f64) / 4.0).ceil() as usize);
212        }
213        if self.priority.is_none() {
214            self.priority = Some(default_artifact_priority(&self.kind));
215        }
216        self
217    }
218}
219
220pub fn select_artifacts(
221    mut artifacts: Vec<ArtifactRecord>,
222    policy: &ContextPolicy,
223) -> Vec<ArtifactRecord> {
224    artifacts.retain(|artifact| {
225        (policy.include_kinds.is_empty() || policy.include_kinds.contains(&artifact.kind))
226            && !policy.exclude_kinds.contains(&artifact.kind)
227            && (policy.include_stages.is_empty()
228                || artifact
229                    .stage
230                    .as_ref()
231                    .is_some_and(|stage| policy.include_stages.contains(stage)))
232    });
233    artifacts.sort_by(|a, b| {
234        let b_pinned = policy.pinned_ids.contains(&b.id);
235        let a_pinned = policy.pinned_ids.contains(&a.id);
236        b_pinned
237            .cmp(&a_pinned)
238            .then_with(|| {
239                let b_prio_kind = policy.prioritize_kinds.contains(&b.kind);
240                let a_prio_kind = policy.prioritize_kinds.contains(&a.kind);
241                b_prio_kind.cmp(&a_prio_kind)
242            })
243            .then_with(|| {
244                b.priority
245                    .unwrap_or_default()
246                    .cmp(&a.priority.unwrap_or_default())
247            })
248            .then_with(|| {
249                if policy.prefer_fresh {
250                    freshness_rank(b.freshness.as_deref())
251                        .cmp(&freshness_rank(a.freshness.as_deref()))
252                } else {
253                    std::cmp::Ordering::Equal
254                }
255            })
256            .then_with(|| {
257                if policy.prefer_recent {
258                    b.created_at.cmp(&a.created_at)
259                } else {
260                    std::cmp::Ordering::Equal
261                }
262            })
263            .then_with(|| {
264                b.relevance
265                    .partial_cmp(&a.relevance)
266                    .unwrap_or(std::cmp::Ordering::Equal)
267            })
268            .then_with(|| {
269                a.estimated_tokens
270                    .unwrap_or(usize::MAX)
271                    .cmp(&b.estimated_tokens.unwrap_or(usize::MAX))
272            })
273    });
274
275    let mut selected = Vec::new();
276    let mut used_tokens = 0usize;
277    let reserve_tokens = policy.reserve_tokens.unwrap_or(0);
278    let effective_max_tokens = policy
279        .max_tokens
280        .map(|max| max.saturating_sub(reserve_tokens));
281    for artifact in artifacts {
282        if let Some(max_artifacts) = policy.max_artifacts {
283            if selected.len() >= max_artifacts {
284                break;
285            }
286        }
287        let next_tokens = artifact.estimated_tokens.unwrap_or(0);
288        if let Some(max_tokens) = effective_max_tokens {
289            if used_tokens + next_tokens > max_tokens {
290                continue;
291            }
292        }
293        used_tokens += next_tokens;
294        selected.push(artifact);
295    }
296    selected
297}
298
299pub fn render_artifacts_context(artifacts: &[ArtifactRecord], policy: &ContextPolicy) -> String {
300    let mut parts = Vec::new();
301    for artifact in artifacts {
302        let title = artifact
303            .title
304            .clone()
305            .unwrap_or_else(|| format!("{} {}", artifact.kind, artifact.id));
306        let body = artifact
307            .text
308            .clone()
309            .or_else(|| artifact.data.as_ref().map(|v| v.to_string()))
310            .unwrap_or_default();
311        match policy.render.as_deref() {
312            Some("json") => {
313                parts.push(
314                    serde_json::json!({
315                        "id": artifact.id,
316                        "kind": artifact.kind,
317                        "title": title,
318                        "source": artifact.source,
319                        "freshness": artifact.freshness,
320                        "priority": artifact.priority,
321                        "text": body,
322                    })
323                    .to_string(),
324                );
325            }
326            _ => parts.push(format!(
327                "<artifact>\n<title>{}</title>\n<kind>{}</kind>\n<source>{}</source>\n\
328<freshness>{}</freshness>\n<priority>{}</priority>\n<body>\n{}\n</body>\n</artifact>",
329                escape_prompt_text(&title),
330                escape_prompt_text(&artifact.kind),
331                escape_prompt_text(
332                    artifact
333                        .source
334                        .clone()
335                        .unwrap_or_else(|| "unknown".to_string())
336                        .as_str(),
337                ),
338                escape_prompt_text(
339                    artifact
340                        .freshness
341                        .clone()
342                        .unwrap_or_else(|| "normal".to_string())
343                        .as_str(),
344                ),
345                artifact.priority.unwrap_or_default(),
346                body
347            )),
348        }
349    }
350    parts.join("\n\n")
351}
352
353#[derive(Clone, Debug, Deserialize, PartialEq)]
354pub struct SelectedWorkflowStageArtifacts {
355    pub artifacts: Vec<ArtifactRecord>,
356    pub context_policy: ContextPolicy,
357}
358
359pub async fn select_workflow_stage_artifacts(
360    artifacts: &[ArtifactRecord],
361    context_policy: &ContextPolicy,
362    input_contract: &StageContract,
363) -> Result<SelectedWorkflowStageArtifacts, crate::value::VmError> {
364    let payload = serde_json::json!({
365        "artifacts": artifacts,
366        "context_policy": context_policy,
367        "input_contract": input_contract,
368    });
369    let mut selected: SelectedWorkflowStageArtifacts = call_harn_export_typed(
370        "std/workflow/context",
371        "workflow_select_stage_artifacts",
372        "workflow_select_stage_artifacts",
373        payload,
374    )
375    .await?;
376    selected.artifacts = selected
377        .artifacts
378        .into_iter()
379        .map(ArtifactRecord::normalize)
380        .collect();
381    Ok(selected)
382}
383
/// String fields extracted from the `workflow_prepare_stage_prompt` export result.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PreparedWorkflowStagePrompt {
    /// Value of the result's `prompt` field.
    pub prompt: String,
    /// Value of the result's `rendered_context` field.
    pub rendered_context: String,
    /// Value of the result's `rendered_verification` field.
    pub rendered_verification: String,
}
390
391pub async fn prepare_workflow_stage_prompt(
392    task: &str,
393    task_label: Option<&str>,
394    artifacts: &[ArtifactRecord],
395    context_policy: &ContextPolicy,
396    rendered_context: Option<&str>,
397    verification_contracts: &[VerificationContract],
398) -> Result<PreparedWorkflowStagePrompt, crate::value::VmError> {
399    let payload = serde_json::json!({
400        "task": task,
401        "task_label": task_label,
402        "artifacts": artifacts,
403        "context_policy": context_policy,
404        "rendered_context": rendered_context,
405        "verification_contracts": verification_contracts,
406    });
407    let prepared = call_harn_export_json(
408        "std/workflow/prompts",
409        "workflow_prepare_stage_prompt",
410        "workflow_prepare_stage_prompt",
411        payload,
412    )
413    .await?;
414    let prepared = prepared.as_object().ok_or_else(|| {
415        crate::value::VmError::Runtime(
416            "workflow_prepare_stage_prompt must return a dict".to_string(),
417        )
418    })?;
419    Ok(PreparedWorkflowStagePrompt {
420        prompt: workflow_prompt_string_field(prepared, "prompt")?,
421        rendered_context: workflow_prompt_string_field(prepared, "rendered_context")?,
422        rendered_verification: workflow_prompt_string_field(prepared, "rendered_verification")?,
423    })
424}
425
426fn workflow_prompt_string_field(
427    value: &serde_json::Map<String, serde_json::Value>,
428    field: &str,
429) -> Result<String, crate::value::VmError> {
430    value
431        .get(field)
432        .and_then(serde_json::Value::as_str)
433        .map(ToOwned::to_owned)
434        .ok_or_else(|| {
435            crate::value::VmError::Runtime(format!(
436                "workflow_prepare_stage_prompt must return string field '{field}'"
437            ))
438        })
439}
440
/// Escape `&`, `<` and `>` so text can sit inside the tag-delimited prompt
/// markup without being read as a tag. All other characters are copied as-is.
fn escape_prompt_text(text: &str) -> String {
    let mut escaped = String::with_capacity(text.len());
    for ch in text.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            other => escaped.push(other),
        }
    }
    escaped
}
446
447pub fn normalize_artifact(
448    value: &crate::value::VmValue,
449) -> Result<ArtifactRecord, crate::value::VmError> {
450    let artifact: ArtifactRecord = super::parse_json_value(value)?;
451    let artifact = artifact.normalize();
452    if artifact.kind == "handoff" {
453        let json = serde_json::to_value(&artifact).map_err(|error| {
454            crate::value::VmError::Runtime(format!("artifact handoff encode error: {error}"))
455        })?;
456        let handoff = handoff_from_json_value(&json)
457            .or_else(|| {
458                artifact
459                    .data
460                    .as_ref()
461                    .and_then(|data| normalize_handoff_artifact_json(data.clone()).ok())
462            })
463            .ok_or_else(|| {
464                crate::value::VmError::Runtime(
465                    "artifact handoff data must contain a valid handoff payload".to_string(),
466                )
467            })?;
468        return Ok(handoff_artifact_record(&handoff, Some(&artifact)));
469    }
470    Ok(artifact)
471}