1use std::collections::{BTreeMap, BTreeSet};
9use std::path::{Path, PathBuf};
10
11use serde::{Deserialize, Serialize};
12use serde_json::Value as JsonValue;
13use sha2::{Digest, Sha256};
14
15use super::{
16 assemble_context, estimate_chunk_tokens, render_assembled_chunks, ArtifactRecord,
17 AssembleDedup, AssembleOptions, AssembleStrategy,
18};
19use crate::value::VmError;
20
/// Schema version stamped on reports and required of manifests once normalized.
pub const CONTEXT_EVAL_SCHEMA_VERSION: u32 = 1;
/// `_type` tag a manifest must carry; filled in during normalization when left empty.
pub const CONTEXT_EVAL_MANIFEST_TYPE: &str = "harn.context_eval.manifest.v1";
/// `_type` tag written on every report produced by `evaluate_context_eval_manifest`.
pub const CONTEXT_EVAL_REPORT_TYPE: &str = "harn.context_eval.report.v1";
24
/// Top-level context-eval manifest: a list of modes evaluated against a list of tasks.
///
/// Every field falls back to its `Default` when missing from the input
/// (`#[serde(default)]`). `normalize_context_eval_manifest` then fills in or validates
/// `_type`, `version` and `id`, and rejects manifests without any mode or task.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct ContextEvalManifest {
    /// Serialized as `_type`; must equal `CONTEXT_EVAL_MANIFEST_TYPE` after normalization.
    #[serde(rename = "_type")]
    pub type_name: String,
    /// Schema version; `0` (the default) is replaced by `CONTEXT_EVAL_SCHEMA_VERSION`.
    pub version: u32,
    /// Manifest identifier; a blank value becomes `"context-eval"` during normalization.
    pub id: String,
    pub name: Option<String>,
    pub description: Option<String>,
    /// Context modes; every task is evaluated once per mode.
    pub modes: Vec<ContextEvalMode>,
    /// Tasks to evaluate.
    pub tasks: Vec<ContextEvalTask>,
    /// Free-form metadata; moved unchanged into the report's `metadata`.
    pub metadata: BTreeMap<String, JsonValue>,
}
38
/// One context-delivery mode that each task is evaluated under.
///
/// Most optional fields have per-kind defaults resolved by the helper functions in this
/// file (`mode_budget_tokens`, `projection_policy`, `tool_disclosure`, ...). Multi-word
/// fields also accept a kebab-case alias on input.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct ContextEvalMode {
    /// Mode identifier; a blank value becomes `mode_{n}` (1-based) during normalization.
    pub id: String,
    pub name: Option<String>,
    /// Mode kind; a blank value falls back to `id`. Kinds given special defaults in this
    /// file: `cold`, `scanned`, `enriched`, `hud_pack`, `projected`, `compacted`,
    /// `tool_search_limited`, `full`.
    pub kind: String,
    pub description: Option<String>,
    /// When non-empty, only artifacts with one of these ids are considered.
    #[serde(default, alias = "artifact-ids")]
    pub artifact_ids: Vec<String>,
    /// When non-empty, an artifact must match by `kind` or by its `context_tier` metadata.
    #[serde(default, alias = "include-artifact-kinds")]
    pub include_artifact_kinds: Vec<String>,
    /// Artifacts whose `kind` is listed here are always dropped.
    #[serde(default, alias = "exclude-artifact-kinds")]
    pub exclude_artifact_kinds: Vec<String>,
    /// Explicit token budget; overrides the per-kind default. A budget of `0` selects no
    /// artifacts at all.
    #[serde(default, alias = "budget-tokens")]
    pub budget_tokens: Option<usize>,
    /// Parsed with `AssembleStrategy::parse`; defaults to `AssembleStrategy::Relevance`.
    #[serde(default, alias = "assemble-strategy")]
    pub assemble_strategy: Option<String>,
    /// Parsed with `AssembleDedup::parse`; defaults to `AssembleDedup::Chunked`.
    pub dedup: Option<String>,
    /// Forwarded to `AssembleOptions::microcompact_threshold`; defaults to `2_000`.
    #[serde(default, alias = "microcompact-threshold")]
    pub microcompact_threshold: Option<usize>,
    /// Forwarded to `AssembleOptions::semantic_overlap`; defaults to `0.85`.
    #[serde(default, alias = "semantic-overlap")]
    pub semantic_overlap: Option<f64>,
    /// Transcript projection policy; overrides the per-kind default.
    #[serde(default, alias = "projection-policy")]
    pub projection_policy: Option<String>,
    /// Number of trailing transcript messages kept by the windowed projection policies.
    #[serde(default, alias = "transcript-keep-last")]
    pub transcript_keep_last: Option<usize>,
    /// Tool disclosure level; overrides the per-kind default.
    #[serde(default, alias = "tool-disclosure")]
    pub tool_disclosure: Option<String>,
    /// Tool names exposed under the `limited` / `tool_search_limited` disclosure levels.
    #[serde(default, alias = "tool-allowlist")]
    pub tool_allowlist: Vec<String>,
    /// Declared cache-hit expectation; reported in preference to the observed value.
    #[serde(default, alias = "expected-cache-hit")]
    pub expected_cache_hit: Option<bool>,
    /// Cache namespace used in the report; defaults to `"harn.context_eval"`.
    #[serde(default, alias = "cache-namespace")]
    pub cache_namespace: Option<String>,
    /// When present (any value), one compaction is added to the run's compaction count.
    #[serde(default, alias = "compaction-policy")]
    pub compaction_policy: Option<JsonValue>,
    /// Preprocessing label; defaults to `"deterministic"`. The value `"llm"` marks the
    /// mode as LLM-enabled in the report.
    pub preprocessing: Option<String>,
    pub metadata: BTreeMap<String, JsonValue>,
}
78
/// A single evaluation task: the objective plus the artifacts, transcript, tools and
/// recorded observations that the modes are evaluated against.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct ContextEvalTask {
    /// Task identifier; a blank value becomes `task_{n}` (1-based) during normalization.
    pub id: String,
    pub name: Option<String>,
    /// Required (normalization fails when blank). Also passed as the assembly query and
    /// included in the visible input.
    pub objective: String,
    /// Used to estimate output tokens when no observed output-token count is available.
    #[serde(default, alias = "reference-answer")]
    pub reference_answer: Option<String>,
    /// Candidate artifacts; each is normalized by `normalize_eval_artifact`.
    pub artifacts: Vec<ArtifactRecord>,
    /// Conversation history that the projection policy selects from.
    pub transcript: Vec<ContextEvalTranscriptMessage>,
    /// Available tools; sorted by name during normalization.
    pub tools: Vec<ContextEvalTool>,
    /// Recorded tool calls; sorted by `order` (missing orders last), then by name.
    #[serde(default, alias = "tool-events")]
    pub tool_events: Vec<ContextEvalToolEvent>,
    /// Expectations a run must meet to pass.
    pub expected: ContextEvalExpected,
    /// Observations used for any mode without an entry in `mode_observations`.
    pub observed: ContextEvalObserved,
    /// Per-mode observations, keyed by mode id.
    #[serde(default, alias = "mode-observations")]
    pub mode_observations: BTreeMap<String, ContextEvalObserved>,
    pub metadata: BTreeMap<String, JsonValue>,
}
98
/// One transcript message; rendered as `"{role}: {content}"` when projected.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ContextEvalTranscriptMessage {
    pub role: String,
    pub content: String,
    /// Explicit token count; when absent, `estimate_chunk_tokens(content)` is used.
    #[serde(default, alias = "estimated-tokens")]
    pub estimated_tokens: Option<usize>,
}
107
/// A tool declared on a task. Within this file only `name` is read (for sorting,
/// allowlist filtering and exposure).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ContextEvalTool {
    pub name: String,
    pub description: Option<String>,
    pub capability: Option<String>,
    pub deterministic: Option<bool>,
}
116
/// A recorded tool invocation used for tool-quality, read-count and recovery metrics.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ContextEvalToolEvent {
    /// Sort position; events without an order sort after all ordered ones.
    pub order: Option<usize>,
    /// Tool name; matched against exposed tools and expected tools.
    pub name: String,
    /// `read`/`scan` count as reads and `edit`/`write`/`mutation` as edits; a phase
    /// containing `recovery` or `error` counts as an error recovery.
    pub phase: Option<String>,
    pub success: Option<bool>,
    /// The value `"recovery"` counts the event as an error recovery.
    pub quality: Option<String>,
    /// `Some(true)` counts the event as an error recovery.
    pub recovery: Option<bool>,
}
127
/// Pass criteria for a task. Each unmet criterion adds an entry to the run's `failures`.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ContextEvalExpected {
    /// Terms that must occur as ASCII-case-insensitive substrings of the scored surface.
    #[serde(default, alias = "required-terms")]
    pub required_terms: Vec<String>,
    /// Artifact ids that must be among the artifacts selected by context assembly.
    #[serde(default, alias = "expected-artifact-ids")]
    pub expected_artifact_ids: Vec<String>,
    /// Tool names that must appear among the visible tool events.
    #[serde(default, alias = "expected-tools")]
    pub expected_tools: Vec<String>,
    /// Upper bound on input tokens; exceeding it fails the run.
    #[serde(default, alias = "max-input-tokens")]
    pub max_input_tokens: Option<usize>,
}
140
/// Externally recorded measurements for a task (optionally per mode). Missing values
/// are replaced by estimates or zeros in `evaluate_task_mode`.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct ContextEvalObserved {
    /// When present, correctness is scored against this text instead of the visible input.
    #[serde(default, alias = "final-response")]
    pub final_response: Option<String>,
    /// Defaults to `0` when absent.
    #[serde(default, alias = "latency-ms")]
    pub latency_ms: Option<u64>,
    /// When absent, estimated from the visible input.
    #[serde(default, alias = "input-tokens")]
    pub input_tokens: Option<usize>,
    /// When absent, estimated from the task's reference answer, else `0`.
    #[serde(default, alias = "output-tokens")]
    pub output_tokens: Option<usize>,
    /// Defaults to `0.0` when absent.
    #[serde(default, alias = "cost-usd")]
    pub cost_usd: Option<f64>,
    /// Reported only when the mode declares no `expected_cache_hit`.
    #[serde(default, alias = "cache-hit")]
    pub cache_hit: Option<bool>,
    /// Defaults to `0`; the mode's own compaction count is added on top.
    #[serde(default, alias = "compaction-count")]
    pub compaction_count: Option<usize>,
    pub metadata: BTreeMap<String, JsonValue>,
}
160
/// Result of evaluating a whole manifest: one run per (task, mode) pair plus summaries.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct ContextEvalReport {
    /// Serialized as `_type`; set to `CONTEXT_EVAL_REPORT_TYPE`.
    #[serde(rename = "_type")]
    pub type_name: String,
    /// Set to `CONTEXT_EVAL_SCHEMA_VERSION`.
    pub schema_version: u32,
    pub manifest_id: String,
    pub manifest_name: Option<String>,
    /// `true` when no run failed.
    pub pass: bool,
    pub total_runs: usize,
    pub passed_runs: usize,
    pub failed_runs: usize,
    pub total_tasks: usize,
    pub total_modes: usize,
    /// Means and totals computed over `runs`.
    pub aggregate: ContextEvalAggregate,
    /// One summary per manifest mode, in manifest order.
    pub modes: Vec<ContextEvalModeSummary>,
    /// One summary per manifest task, in manifest order.
    pub tasks: Vec<ContextEvalTaskSummary>,
    /// Runs in task-major order (all modes of the first task, then the next task, ...).
    pub runs: Vec<ContextEvalRunReport>,
    /// Copied from the manifest's `metadata`.
    pub metadata: BTreeMap<String, JsonValue>,
}
181
/// Cross-run statistics produced by `aggregate_runs`: two score means and per-metric sums.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct ContextEvalAggregate {
    /// Mean of each run's `final_correctness.score`.
    pub mean_final_correctness: f64,
    /// Mean of each run's `tool_call_quality.score`.
    pub mean_tool_call_quality: f64,
    pub total_latency_ms: u64,
    pub total_input_tokens: usize,
    pub total_output_tokens: usize,
    /// Sum of run costs, passed through `round6`.
    pub total_cost_usd: f64,
    pub total_compaction_count: usize,
    pub total_error_recovery_count: usize,
}
194
/// Resolved settings of one mode as reported: defaults have already been applied via
/// `mode_kind`, `projection_policy`, `tool_disclosure` and `preprocessing_report`.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ContextEvalModeSummary {
    pub id: String,
    pub kind: String,
    pub projection_policy: String,
    pub tool_disclosure: String,
    pub preprocessing: ContextEvalPreprocessing,
}
204
/// Preprocessing descriptor for a mode, built by `preprocessing_report`.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ContextEvalPreprocessing {
    /// The mode's preprocessing label (`"deterministic"` when the mode sets none).
    pub mode: String,
    /// `true` exactly when the label equals `"llm"`.
    pub llm_enabled: bool,
}
211
/// Per-task summary in the report; the three lists are the task's expectations passed
/// through `sorted_strings`.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ContextEvalTaskSummary {
    pub id: String,
    pub name: Option<String>,
    pub required_terms: Vec<String>,
    pub expected_artifact_ids: Vec<String>,
    pub expected_tools: Vec<String>,
}
221
/// Outcome of evaluating one task under one mode.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct ContextEvalRunReport {
    /// `"{task_id}__{mode_id}"`.
    pub run_id: String,
    pub task_id: String,
    pub mode_id: String,
    pub mode_kind: String,
    /// `"pass"` or `"fail"`, mirroring `passed`.
    pub status: String,
    /// `true` when `failures` is empty.
    pub passed: bool,
    pub final_correctness: ContextEvalCorrectness,
    /// Number of read events seen before the first edit event among visible tool events.
    pub reads_before_first_edit: usize,
    pub tool_call_quality: ContextEvalToolQuality,
    pub latency_ms: u64,
    pub input_tokens: usize,
    pub output_tokens: usize,
    pub cost_usd: f64,
    /// Observed compaction count plus the mode's own contribution (0 or 1).
    pub compaction_count: usize,
    pub projection: ContextEvalProjectionReport,
    pub context: ContextEvalContextReport,
    pub cache: ContextEvalCacheReport,
    pub error_recovery_count: usize,
    pub preprocessing: ContextEvalPreprocessing,
    /// Human-readable reasons the run failed; empty for a passing run.
    pub failures: Vec<String>,
}
246
/// Correctness verdict produced by `score_correctness`.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct ContextEvalCorrectness {
    /// `true` when no required term and no expected artifact id is missing.
    pub passed: bool,
    /// Combined term/artifact score, passed through `round4`.
    pub score: f64,
    pub required_terms_present: Vec<String>,
    pub required_terms_missing: Vec<String>,
    pub expected_artifact_ids_present: Vec<String>,
    pub expected_artifact_ids_missing: Vec<String>,
    /// Which text was scored: `"final_response"` or `"context_projection"`.
    pub source: String,
}
258
/// Tool-usage verdict produced by `score_tools`.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct ContextEvalToolQuality {
    /// `matched / (expected + unnecessary)`, or `1.0` when that denominator is zero;
    /// passed through `round4`.
    pub score: f64,
    pub expected_tools: Vec<String>,
    pub observed_tools: Vec<String>,
    /// Expected tools that were observed.
    pub matched_tools: Vec<String>,
    /// Expected tools that were not observed.
    pub missing_tools: Vec<String>,
    /// Observed tools that were neither expected nor classified as edit tools.
    pub unnecessary_tools: Vec<String>,
}
269
/// Summary of how the task transcript was projected for a mode.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ContextEvalProjectionReport {
    /// Resolved projection policy name.
    pub policy: String,
    /// Number of messages in the task transcript.
    pub source_message_count: usize,
    /// Number of messages kept after projection.
    pub retained_message_count: usize,
    /// Sum of token counts (explicit or estimated) of the kept messages.
    pub retained_tokens: usize,
}
278
/// Description of the context assembled for one run.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ContextEvalContextReport {
    pub projection_policy: String,
    pub tool_disclosure: String,
    /// Number of artifacts that survived mode filtering (before assembly).
    pub artifact_count: usize,
    /// Ids of artifacts included by assembly, passed through `sorted_strings`.
    pub selected_artifact_ids: Vec<String>,
    /// Ids of artifacts dropped by assembly, passed through `sorted_strings`.
    pub dropped_artifact_ids: Vec<String>,
    /// Byte length of the rendered context string.
    pub rendered_bytes: usize,
    /// `estimate_chunk_tokens` of the rendered context string.
    pub rendered_tokens: usize,
    pub budget_tokens: usize,
    pub assemble_strategy: String,
    pub dedup: String,
    pub exposed_tools: Vec<String>,
}
294
/// Cache identity reported for a run.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ContextEvalCacheReport {
    /// The mode's cache namespace, or `"harn.context_eval"` when unset.
    pub namespace: String,
    /// `"{namespace}:{prefix}"` where the prefix is the first 32 characters of
    /// `stable_input_hash`.
    pub key: String,
    /// Hash over task id, mode id, selected artifact ids, rendered context, projected
    /// transcript and exposed tools.
    pub stable_input_hash: String,
    /// Always set to `true` by `evaluate_task_mode`.
    pub deterministic_order: bool,
    /// The mode's `expected_cache_hit` if set, otherwise the observed `cache_hit`.
    pub hit: Option<bool>,
}
304
/// Intermediate state built by `prepare_mode_run` and consumed by `evaluate_task_mode`.
struct PreparedModeRun {
    /// Artifacts that survived mode filtering (input to context assembly).
    artifacts: Vec<ArtifactRecord>,
    /// Rendered assembled context; empty when assembly produced no chunks.
    rendered_context: String,
    selected_artifact_ids: Vec<String>,
    dropped_artifact_ids: Vec<String>,
    projection: ContextEvalProjectionReport,
    /// Projected transcript, one `"{role}: {content}"` line per retained message.
    transcript_text: String,
    /// Tool names visible under the mode's disclosure level.
    exposed_tools: Vec<String>,
    /// Tool events that remain visible given `exposed_tools`.
    visible_tool_events: Vec<ContextEvalToolEvent>,
}
315
316pub fn load_context_eval_manifest(path: &Path) -> Result<ContextEvalManifest, VmError> {
317 let content = std::fs::read_to_string(path).map_err(|error| {
318 VmError::Runtime(format!("failed to read context eval manifest: {error}"))
319 })?;
320 let mut manifest: ContextEvalManifest =
321 if path.extension().and_then(|ext| ext.to_str()) == Some("toml") {
322 toml::from_str(&content).map_err(|error| {
323 VmError::Runtime(format!("failed to parse context eval TOML: {error}"))
324 })?
325 } else {
326 serde_json::from_str(&content).map_err(|error| {
327 VmError::Runtime(format!("failed to parse context eval JSON: {error}"))
328 })?
329 };
330 normalize_context_eval_manifest(&mut manifest)?;
331 Ok(manifest)
332}
333
334pub fn evaluate_context_eval_manifest(
335 manifest: &ContextEvalManifest,
336) -> Result<ContextEvalReport, VmError> {
337 let mut manifest = manifest.clone();
338 normalize_context_eval_manifest(&mut manifest)?;
339
340 let modes = manifest
341 .modes
342 .iter()
343 .map(|mode| ContextEvalModeSummary {
344 id: mode.id.clone(),
345 kind: mode_kind(mode),
346 projection_policy: projection_policy(mode),
347 tool_disclosure: tool_disclosure(mode),
348 preprocessing: preprocessing_report(mode),
349 })
350 .collect::<Vec<_>>();
351 let tasks = manifest
352 .tasks
353 .iter()
354 .map(|task| ContextEvalTaskSummary {
355 id: task.id.clone(),
356 name: task.name.clone(),
357 required_terms: sorted_strings(&task.expected.required_terms),
358 expected_artifact_ids: sorted_strings(&task.expected.expected_artifact_ids),
359 expected_tools: sorted_strings(&task.expected.expected_tools),
360 })
361 .collect::<Vec<_>>();
362
363 let mut runs = Vec::new();
364 for task in &manifest.tasks {
365 for mode in &manifest.modes {
366 runs.push(evaluate_task_mode(task, mode)?);
367 }
368 }
369
370 let total_runs = runs.len();
371 let passed_runs = runs.iter().filter(|run| run.passed).count();
372 let failed_runs = total_runs.saturating_sub(passed_runs);
373 let aggregate = aggregate_runs(&runs);
374 Ok(ContextEvalReport {
375 type_name: CONTEXT_EVAL_REPORT_TYPE.to_string(),
376 schema_version: CONTEXT_EVAL_SCHEMA_VERSION,
377 manifest_id: manifest.id,
378 manifest_name: manifest.name,
379 pass: failed_runs == 0,
380 total_runs,
381 passed_runs,
382 failed_runs,
383 total_tasks: tasks.len(),
384 total_modes: modes.len(),
385 aggregate,
386 modes,
387 tasks,
388 runs,
389 metadata: manifest.metadata,
390 })
391}
392
393fn normalize_context_eval_manifest(manifest: &mut ContextEvalManifest) -> Result<(), VmError> {
394 if manifest.type_name.is_empty() {
395 manifest.type_name = CONTEXT_EVAL_MANIFEST_TYPE.to_string();
396 }
397 if manifest.type_name != CONTEXT_EVAL_MANIFEST_TYPE {
398 return Err(VmError::Runtime(format!(
399 "context eval manifest _type must be {CONTEXT_EVAL_MANIFEST_TYPE}"
400 )));
401 }
402 if manifest.version == 0 {
403 manifest.version = CONTEXT_EVAL_SCHEMA_VERSION;
404 }
405 if manifest.version != CONTEXT_EVAL_SCHEMA_VERSION {
406 return Err(VmError::Runtime(format!(
407 "context eval manifest version must be {CONTEXT_EVAL_SCHEMA_VERSION}"
408 )));
409 }
410 if manifest.id.trim().is_empty() {
411 manifest.id = "context-eval".to_string();
412 }
413 if manifest.modes.is_empty() {
414 return Err(VmError::Runtime(
415 "context eval manifest must declare at least one mode".to_string(),
416 ));
417 }
418 if manifest.tasks.is_empty() {
419 return Err(VmError::Runtime(
420 "context eval manifest must declare at least one task".to_string(),
421 ));
422 }
423 let mut mode_ids = BTreeSet::new();
424 for (index, mode) in manifest.modes.iter_mut().enumerate() {
425 if mode.id.trim().is_empty() {
426 mode.id = format!("mode_{}", index + 1);
427 }
428 if !mode_ids.insert(mode.id.clone()) {
429 return Err(VmError::Runtime(format!(
430 "context eval manifest has duplicate mode id '{}'",
431 mode.id
432 )));
433 }
434 if mode.kind.trim().is_empty() {
435 mode.kind = mode.id.clone();
436 }
437 }
438 let mut task_ids = BTreeSet::new();
439 for (index, task) in manifest.tasks.iter_mut().enumerate() {
440 if task.id.trim().is_empty() {
441 task.id = format!("task_{}", index + 1);
442 }
443 if !task_ids.insert(task.id.clone()) {
444 return Err(VmError::Runtime(format!(
445 "context eval manifest has duplicate task id '{}'",
446 task.id
447 )));
448 }
449 if task.objective.trim().is_empty() {
450 return Err(VmError::Runtime(format!(
451 "context eval task '{}' must declare objective",
452 task.id
453 )));
454 }
455 for (artifact_index, artifact) in task.artifacts.iter_mut().enumerate() {
456 normalize_eval_artifact(artifact, &task.id, artifact_index);
457 }
458 task.tools.sort_by(|left, right| left.name.cmp(&right.name));
459 task.tool_events.sort_by(|left, right| {
460 left.order
461 .unwrap_or(usize::MAX)
462 .cmp(&right.order.unwrap_or(usize::MAX))
463 .then_with(|| left.name.cmp(&right.name))
464 });
465 }
466 Ok(())
467}
468
469fn normalize_eval_artifact(artifact: &mut ArtifactRecord, task_id: &str, index: usize) {
470 if artifact.type_name.is_empty() {
471 artifact.type_name = "artifact".to_string();
472 }
473 if artifact.id.trim().is_empty() {
474 artifact.id = format!("{task_id}_artifact_{}", index + 1);
475 }
476 if artifact.kind.trim().is_empty() {
477 artifact.kind = "artifact".to_string();
478 }
479 if artifact.created_at.trim().is_empty() {
480 artifact.created_at = "1970-01-01T00:00:00Z".to_string();
481 }
482 if artifact.estimated_tokens.is_none() {
483 artifact.estimated_tokens = artifact
484 .text
485 .as_ref()
486 .map(|text| ((text.len() as f64) / 4.0).ceil() as usize);
487 }
488 if artifact.priority.is_none() {
489 artifact.priority = Some(40);
490 }
491}
492
493fn evaluate_task_mode(
494 task: &ContextEvalTask,
495 mode: &ContextEvalMode,
496) -> Result<ContextEvalRunReport, VmError> {
497 let prepared = prepare_mode_run(task, mode)?;
498 let mode_id = mode.id.clone();
499 let mode_kind = mode_kind(mode);
500 let observed = task
501 .mode_observations
502 .get(&mode_id)
503 .unwrap_or(&task.observed);
504 let visible_input = visible_input(task, &prepared);
505 let (final_surface, correctness_source) = observed
506 .final_response
507 .as_ref()
508 .map(|response| (response.as_str(), "final_response"))
509 .unwrap_or_else(|| (visible_input.as_str(), "context_projection"));
510 let final_correctness = score_correctness(
511 &task.expected,
512 final_surface,
513 &prepared.selected_artifact_ids,
514 correctness_source,
515 );
516 let tool_call_quality =
517 score_tools(&task.expected.expected_tools, &prepared.visible_tool_events);
518 let reads_before_first_edit = reads_before_first_edit(&prepared.visible_tool_events);
519 let error_recovery_count = error_recovery_count(&prepared.visible_tool_events);
520 let input_tokens = observed
521 .input_tokens
522 .unwrap_or_else(|| estimate_chunk_tokens(&visible_input));
523 let output_tokens = observed
524 .output_tokens
525 .or_else(|| {
526 task.reference_answer
527 .as_ref()
528 .map(|text| estimate_chunk_tokens(text))
529 })
530 .unwrap_or(0);
531 let compaction_count = observed.compaction_count.unwrap_or(0) + mode_compaction_count(mode);
532 let latency_ms = observed.latency_ms.unwrap_or(0);
533 let cost_usd = observed.cost_usd.unwrap_or(0.0);
534 let mut failures = Vec::new();
535 if !final_correctness.required_terms_missing.is_empty() {
536 failures.push(format!(
537 "missing required terms: {}",
538 final_correctness.required_terms_missing.join(", ")
539 ));
540 }
541 if !final_correctness.expected_artifact_ids_missing.is_empty() {
542 failures.push(format!(
543 "missing expected artifacts: {}",
544 final_correctness.expected_artifact_ids_missing.join(", ")
545 ));
546 }
547 if !tool_call_quality.missing_tools.is_empty() {
548 failures.push(format!(
549 "missing expected tools: {}",
550 tool_call_quality.missing_tools.join(", ")
551 ));
552 }
553 if let Some(max) = task.expected.max_input_tokens {
554 if input_tokens > max {
555 failures.push(format!("input tokens {input_tokens} exceed max {max}"));
556 }
557 }
558 let passed = failures.is_empty();
559 let stable_input_hash = stable_hash(&[
560 task.id.as_str(),
561 mode.id.as_str(),
562 &prepared.selected_artifact_ids.join("\n"),
563 prepared.rendered_context.as_str(),
564 prepared.transcript_text.as_str(),
565 &prepared.exposed_tools.join("\n"),
566 ]);
567 let cache_namespace = mode
568 .cache_namespace
569 .clone()
570 .unwrap_or_else(|| "harn.context_eval".to_string());
571 Ok(ContextEvalRunReport {
572 run_id: format!("{}__{}", task.id, mode.id),
573 task_id: task.id.clone(),
574 mode_id,
575 mode_kind,
576 status: if passed { "pass" } else { "fail" }.to_string(),
577 passed,
578 final_correctness,
579 reads_before_first_edit,
580 tool_call_quality,
581 latency_ms,
582 input_tokens,
583 output_tokens,
584 cost_usd,
585 compaction_count,
586 projection: prepared.projection,
587 context: ContextEvalContextReport {
588 projection_policy: projection_policy(mode),
589 tool_disclosure: tool_disclosure(mode),
590 artifact_count: prepared.artifacts.len(),
591 selected_artifact_ids: prepared.selected_artifact_ids,
592 dropped_artifact_ids: prepared.dropped_artifact_ids,
593 rendered_bytes: prepared.rendered_context.len(),
594 rendered_tokens: estimate_chunk_tokens(&prepared.rendered_context),
595 budget_tokens: mode_budget_tokens(mode),
596 assemble_strategy: assemble_strategy(mode)?.as_str().to_string(),
597 dedup: assemble_dedup(mode)?.as_str().to_string(),
598 exposed_tools: prepared.exposed_tools,
599 },
600 cache: ContextEvalCacheReport {
601 namespace: cache_namespace.clone(),
602 key: format!("{}:{}", cache_namespace, &stable_input_hash[..32]),
603 stable_input_hash,
604 deterministic_order: true,
605 hit: mode.expected_cache_hit.or(observed.cache_hit),
606 },
607 error_recovery_count,
608 preprocessing: preprocessing_report(mode),
609 failures,
610 })
611}
612
613fn prepare_mode_run(
614 task: &ContextEvalTask,
615 mode: &ContextEvalMode,
616) -> Result<PreparedModeRun, VmError> {
617 let filtered = filter_artifacts(task, mode);
618 let options = AssembleOptions {
619 budget_tokens: mode_budget_tokens(mode),
620 dedup: assemble_dedup(mode)?,
621 strategy: assemble_strategy(mode)?,
622 query: Some(task.objective.clone()),
623 microcompact_threshold: mode.microcompact_threshold.unwrap_or(2_000),
624 semantic_overlap: mode.semantic_overlap.unwrap_or(0.85),
625 };
626 let assembled = assemble_context(&filtered, &options, None);
627 let selected_artifact_ids = sorted_strings(
628 &assembled
629 .included
630 .iter()
631 .map(|item| item.artifact_id.clone())
632 .collect::<Vec<_>>(),
633 );
634 let dropped_artifact_ids = sorted_strings(
635 &assembled
636 .dropped
637 .iter()
638 .map(|item| item.artifact_id.clone())
639 .collect::<Vec<_>>(),
640 );
641 let rendered_context = if assembled.chunks.is_empty() {
642 String::new()
643 } else {
644 render_assembled_chunks(&assembled)
645 };
646 let (projection, transcript_text) = project_transcript(task, mode);
647 let exposed_tools = exposed_tools(task, mode);
648 let visible_tool_events = visible_tool_events(task, &exposed_tools, mode);
649 Ok(PreparedModeRun {
650 artifacts: filtered,
651 rendered_context,
652 selected_artifact_ids,
653 dropped_artifact_ids,
654 projection,
655 transcript_text,
656 exposed_tools,
657 visible_tool_events,
658 })
659}
660
661fn filter_artifacts(task: &ContextEvalTask, mode: &ContextEvalMode) -> Vec<ArtifactRecord> {
662 if mode_kind(mode) == "cold" || mode_budget_tokens(mode) == 0 {
663 return Vec::new();
664 }
665 let include_ids: BTreeSet<&str> = mode.artifact_ids.iter().map(String::as_str).collect();
666 let include_kinds: BTreeSet<&str> = mode
667 .include_artifact_kinds
668 .iter()
669 .map(String::as_str)
670 .collect();
671 let exclude_kinds: BTreeSet<&str> = mode
672 .exclude_artifact_kinds
673 .iter()
674 .map(String::as_str)
675 .collect();
676 let kind = mode_kind(mode);
677 task.artifacts
678 .iter()
679 .filter(|artifact| include_ids.is_empty() || include_ids.contains(artifact.id.as_str()))
680 .filter(|artifact| {
681 include_kinds.is_empty()
682 || include_kinds.contains(artifact.kind.as_str())
683 || include_kinds.contains(
684 artifact
685 .metadata
686 .get("context_tier")
687 .and_then(JsonValue::as_str)
688 .unwrap_or(""),
689 )
690 })
691 .filter(|artifact| !exclude_kinds.contains(artifact.kind.as_str()))
692 .filter(|artifact| {
693 default_mode_allows_artifact(
694 &kind,
695 artifact,
696 include_ids.is_empty() && include_kinds.is_empty(),
697 )
698 })
699 .cloned()
700 .collect()
701}
702
703fn default_mode_allows_artifact(
704 kind: &str,
705 artifact: &ArtifactRecord,
706 using_default_filter: bool,
707) -> bool {
708 if !using_default_filter {
709 return true;
710 }
711 match kind {
712 "cold" => false,
713 "scanned" => artifact_matches_tier(artifact, &["scan", "scanned", "tier1_scan"]),
714 "enriched" => artifact_matches_tier(
715 artifact,
716 &[
717 "scan",
718 "scanned",
719 "tier1_scan",
720 "enrichment",
721 "enriched",
722 "tier2_enrichment",
723 ],
724 ),
725 _ => true,
726 }
727}
728
729fn artifact_matches_tier(artifact: &ArtifactRecord, labels: &[&str]) -> bool {
730 labels.iter().any(|label| {
731 artifact.kind == *label
732 || artifact
733 .metadata
734 .get("context_tier")
735 .and_then(JsonValue::as_str)
736 == Some(*label)
737 })
738}
739
740fn project_transcript(
741 task: &ContextEvalTask,
742 mode: &ContextEvalMode,
743) -> (ContextEvalProjectionReport, String) {
744 let policy = projection_policy(mode);
745 let keep_last = mode.transcript_keep_last.unwrap_or(match policy.as_str() {
746 "none" => 0,
747 "summary" | "compacted" => 1,
748 "last_n" | "projected" => 2,
749 _ => task.transcript.len(),
750 });
751 let retained: Vec<&ContextEvalTranscriptMessage> = match policy.as_str() {
752 "none" => Vec::new(),
753 "full" => task.transcript.iter().collect(),
754 "summary" | "compacted" | "last_n" | "projected" => task
755 .transcript
756 .iter()
757 .rev()
758 .take(keep_last)
759 .collect::<Vec<_>>()
760 .into_iter()
761 .rev()
762 .collect(),
763 _ => task.transcript.iter().collect(),
764 };
765 let transcript_text = retained
766 .iter()
767 .map(|message| format!("{}: {}", message.role, message.content))
768 .collect::<Vec<_>>()
769 .join("\n");
770 let retained_tokens = retained
771 .iter()
772 .map(|message| {
773 message
774 .estimated_tokens
775 .unwrap_or_else(|| estimate_chunk_tokens(&message.content))
776 })
777 .sum();
778 (
779 ContextEvalProjectionReport {
780 policy,
781 source_message_count: task.transcript.len(),
782 retained_message_count: retained.len(),
783 retained_tokens,
784 },
785 transcript_text,
786 )
787}
788
789fn exposed_tools(task: &ContextEvalTask, mode: &ContextEvalMode) -> Vec<String> {
790 let disclosure = tool_disclosure(mode);
791 let allowlist: BTreeSet<&str> = mode.tool_allowlist.iter().map(String::as_str).collect();
792 let mut names = match disclosure.as_str() {
793 "none" => Vec::new(),
794 "full" => task.tools.iter().map(|tool| tool.name.clone()).collect(),
795 "limited" | "tool_search_limited" => task
796 .tools
797 .iter()
798 .filter(|tool| allowlist.contains(tool.name.as_str()))
799 .map(|tool| tool.name.clone())
800 .collect(),
801 _ => task.tools.iter().map(|tool| tool.name.clone()).collect(),
802 };
803 if disclosure == "tool_search_limited" && !names.iter().any(|name| name == "tool_search") {
804 names.push("tool_search".to_string());
805 }
806 sorted_strings(&names)
807}
808
809fn visible_tool_events(
810 task: &ContextEvalTask,
811 exposed_tools: &[String],
812 mode: &ContextEvalMode,
813) -> Vec<ContextEvalToolEvent> {
814 if tool_disclosure(mode) == "full" {
815 return task.tool_events.clone();
816 }
817 let exposed: BTreeSet<&str> = exposed_tools.iter().map(String::as_str).collect();
818 task.tool_events
819 .iter()
820 .filter(|event| exposed.contains(event.name.as_str()))
821 .cloned()
822 .collect()
823}
824
825fn visible_input(task: &ContextEvalTask, prepared: &PreparedModeRun) -> String {
826 [
827 task.objective.as_str(),
828 prepared.rendered_context.as_str(),
829 prepared.transcript_text.as_str(),
830 &prepared.exposed_tools.join("\n"),
831 ]
832 .into_iter()
833 .filter(|part| !part.trim().is_empty())
834 .collect::<Vec<_>>()
835 .join("\n\n")
836}
837
838fn score_correctness(
839 expected: &ContextEvalExpected,
840 surface: &str,
841 selected_artifact_ids: &[String],
842 source: &str,
843) -> ContextEvalCorrectness {
844 let lower_surface = surface.to_ascii_lowercase();
845 let mut present_terms = Vec::new();
846 let mut missing_terms = Vec::new();
847 for term in sorted_strings(&expected.required_terms) {
848 if lower_surface.contains(&term.to_ascii_lowercase()) {
849 present_terms.push(term);
850 } else {
851 missing_terms.push(term);
852 }
853 }
854 let selected: BTreeSet<&str> = selected_artifact_ids.iter().map(String::as_str).collect();
855 let mut present_artifacts = Vec::new();
856 let mut missing_artifacts = Vec::new();
857 for id in sorted_strings(&expected.expected_artifact_ids) {
858 if selected.contains(id.as_str()) {
859 present_artifacts.push(id);
860 } else {
861 missing_artifacts.push(id);
862 }
863 }
864 let term_score = fraction(
865 present_terms.len(),
866 present_terms.len() + missing_terms.len(),
867 );
868 let artifact_score = fraction(
869 present_artifacts.len(),
870 present_artifacts.len() + missing_artifacts.len(),
871 );
872 let score = if expected.required_terms.is_empty() && expected.expected_artifact_ids.is_empty() {
873 1.0
874 } else if expected.required_terms.is_empty() || expected.expected_artifact_ids.is_empty() {
875 term_score.max(artifact_score)
876 } else {
877 (term_score + artifact_score) / 2.0
878 };
879 ContextEvalCorrectness {
880 passed: missing_terms.is_empty() && missing_artifacts.is_empty(),
881 score: round4(score),
882 required_terms_present: present_terms,
883 required_terms_missing: missing_terms,
884 expected_artifact_ids_present: present_artifacts,
885 expected_artifact_ids_missing: missing_artifacts,
886 source: source.to_string(),
887 }
888}
889
890fn score_tools(
891 expected_tools: &[String],
892 events: &[ContextEvalToolEvent],
893) -> ContextEvalToolQuality {
894 let expected = sorted_strings(expected_tools);
895 let expected_set: BTreeSet<&str> = expected.iter().map(String::as_str).collect();
896 let observed = sorted_strings(
897 &events
898 .iter()
899 .map(|event| event.name.clone())
900 .collect::<Vec<_>>(),
901 );
902 let observed_set: BTreeSet<&str> = observed.iter().map(String::as_str).collect();
903 let matched_tools = expected
904 .iter()
905 .filter(|tool| observed_set.contains(tool.as_str()))
906 .cloned()
907 .collect::<Vec<_>>();
908 let missing_tools = expected
909 .iter()
910 .filter(|tool| !observed_set.contains(tool.as_str()))
911 .cloned()
912 .collect::<Vec<_>>();
913 let unnecessary_tools = observed
914 .iter()
915 .filter(|tool| !expected_set.contains(tool.as_str()) && !is_edit_tool(tool))
916 .cloned()
917 .collect::<Vec<_>>();
918 let denominator = expected.len() + unnecessary_tools.len();
919 let score = if denominator == 0 {
920 1.0
921 } else {
922 matched_tools.len() as f64 / denominator as f64
923 };
924 ContextEvalToolQuality {
925 score: round4(score),
926 expected_tools: expected,
927 observed_tools: observed,
928 matched_tools,
929 missing_tools,
930 unnecessary_tools,
931 }
932}
933
934fn reads_before_first_edit(events: &[ContextEvalToolEvent]) -> usize {
935 let mut reads = 0;
936 for event in events {
937 if is_edit_event(event) {
938 break;
939 }
940 if is_read_event(event) {
941 reads += 1;
942 }
943 }
944 reads
945}
946
947fn error_recovery_count(events: &[ContextEvalToolEvent]) -> usize {
948 events
949 .iter()
950 .filter(|event| {
951 event.recovery == Some(true)
952 || event
953 .phase
954 .as_deref()
955 .is_some_and(|phase| phase.contains("recovery") || phase.contains("error"))
956 || event.quality.as_deref() == Some("recovery")
957 })
958 .count()
959}
960
961fn aggregate_runs(runs: &[ContextEvalRunReport]) -> ContextEvalAggregate {
962 let total = runs.len();
963 let mean_final_correctness = mean(total, runs.iter().map(|run| run.final_correctness.score));
964 let mean_tool_call_quality = mean(total, runs.iter().map(|run| run.tool_call_quality.score));
965 ContextEvalAggregate {
966 mean_final_correctness,
967 mean_tool_call_quality,
968 total_latency_ms: runs.iter().map(|run| run.latency_ms).sum(),
969 total_input_tokens: runs.iter().map(|run| run.input_tokens).sum(),
970 total_output_tokens: runs.iter().map(|run| run.output_tokens).sum(),
971 total_cost_usd: round6(runs.iter().map(|run| run.cost_usd).sum()),
972 total_compaction_count: runs.iter().map(|run| run.compaction_count).sum(),
973 total_error_recovery_count: runs.iter().map(|run| run.error_recovery_count).sum(),
974 }
975}
976
977fn mode_kind(mode: &ContextEvalMode) -> String {
978 let value = mode.kind.trim();
979 if value.is_empty() {
980 mode.id.clone()
981 } else {
982 value.to_string()
983 }
984}
985
986fn mode_budget_tokens(mode: &ContextEvalMode) -> usize {
987 mode.budget_tokens
988 .unwrap_or_else(|| match mode_kind(mode).as_str() {
989 "cold" => 0,
990 "scanned" => 800,
991 "enriched" => 1_200,
992 "hud_pack" | "projected" | "compacted" | "tool_search_limited" => 1_600,
993 "full" => 64_000,
994 _ => 8_000,
995 })
996}
997
998fn assemble_strategy(mode: &ContextEvalMode) -> Result<AssembleStrategy, VmError> {
999 mode.assemble_strategy
1000 .as_deref()
1001 .map(AssembleStrategy::parse)
1002 .transpose()
1003 .map_err(VmError::Runtime)
1004 .map(|value| value.unwrap_or(AssembleStrategy::Relevance))
1005}
1006
1007fn assemble_dedup(mode: &ContextEvalMode) -> Result<AssembleDedup, VmError> {
1008 mode.dedup
1009 .as_deref()
1010 .map(AssembleDedup::parse)
1011 .transpose()
1012 .map_err(VmError::Runtime)
1013 .map(|value| value.unwrap_or(AssembleDedup::Chunked))
1014}
1015
1016fn projection_policy(mode: &ContextEvalMode) -> String {
1017 mode.projection_policy
1018 .clone()
1019 .unwrap_or_else(|| match mode_kind(mode).as_str() {
1020 "cold" | "scanned" | "enriched" | "hud_pack" | "tool_search_limited" => {
1021 "none".to_string()
1022 }
1023 "projected" => "last_n".to_string(),
1024 "compacted" => "compacted".to_string(),
1025 "full" => "full".to_string(),
1026 _ => "none".to_string(),
1027 })
1028}
1029
1030fn tool_disclosure(mode: &ContextEvalMode) -> String {
1031 mode.tool_disclosure
1032 .clone()
1033 .unwrap_or_else(|| match mode_kind(mode).as_str() {
1034 "cold" | "scanned" | "enriched" | "hud_pack" => "none".to_string(),
1035 "tool_search_limited" => "tool_search_limited".to_string(),
1036 "full" => "full".to_string(),
1037 _ => "limited".to_string(),
1038 })
1039}
1040
1041fn preprocessing_report(mode: &ContextEvalMode) -> ContextEvalPreprocessing {
1042 let preprocessing = mode
1043 .preprocessing
1044 .clone()
1045 .unwrap_or_else(|| "deterministic".to_string());
1046 ContextEvalPreprocessing {
1047 llm_enabled: preprocessing == "llm",
1048 mode: preprocessing,
1049 }
1050}
1051
1052fn mode_compaction_count(mode: &ContextEvalMode) -> usize {
1053 usize::from(mode_kind(mode) == "compacted" || mode.compaction_policy.is_some())
1054}
1055
1056fn is_read_event(event: &ContextEvalToolEvent) -> bool {
1057 event
1058 .phase
1059 .as_deref()
1060 .is_some_and(|phase| phase == "read" || phase == "scan")
1061 || event.name.starts_with("read")
1062 || event.name.starts_with("search")
1063 || event.name.starts_with("list")
1064}
1065
1066fn is_edit_event(event: &ContextEvalToolEvent) -> bool {
1067 event
1068 .phase
1069 .as_deref()
1070 .is_some_and(|phase| phase == "edit" || phase == "write" || phase == "mutation")
1071 || is_edit_tool(&event.name)
1072}
1073
/// Reports whether a tool name looks like an editing tool.
///
/// A name qualifies when it starts with `edit`, `write` or `apply`, or when it
/// contains `patch` anywhere. The `patch` test is a plain substring match, so it
/// also accepts names such as `dispatch`.
fn is_edit_tool(name: &str) -> bool {
    const EDIT_PREFIXES: [&str; 3] = ["edit", "write", "apply"];

    EDIT_PREFIXES
        .iter()
        .any(|prefix| name.starts_with(*prefix))
        || name.contains("patch")
}
1080
/// Divides `numerator` by `denominator` as floating point.
///
/// A zero denominator yields `1.0` rather than a division by zero.
fn fraction(numerator: usize, denominator: usize) -> f64 {
    match denominator {
        0 => 1.0,
        _ => numerator as f64 / denominator as f64,
    }
}
1088
1089fn mean(total: usize, values: impl Iterator<Item = f64>) -> f64 {
1090 if total == 0 {
1091 0.0
1092 } else {
1093 round4(values.sum::<f64>() / total as f64)
1094 }
1095}
1096
/// Returns the distinct, non-blank entries of `values` in ascending order.
///
/// Each entry is trimmed first; entries that are empty after trimming are
/// dropped, and duplicates (compared after trimming) collapse to one.
fn sorted_strings(values: &[String]) -> Vec<String> {
    let mut unique = BTreeSet::new();
    for raw in values {
        let trimmed = raw.trim();
        if !trimmed.is_empty() {
            unique.insert(trimmed.to_owned());
        }
    }
    // A BTreeSet iterates in sorted order, so no explicit sort is needed.
    unique.into_iter().collect()
}
1107
1108fn stable_hash(parts: &[&str]) -> String {
1109 let mut hasher = Sha256::new();
1110 for part in parts {
1111 hasher.update((part.len() as u64).to_le_bytes());
1112 hasher.update(part.as_bytes());
1113 }
1114 hasher
1115 .finalize()
1116 .iter()
1117 .map(|byte| format!("{byte:02x}"))
1118 .collect()
1119}
1120
/// Rounds `value` to four decimal places.
fn round4(value: f64) -> f64 {
    const SCALE: f64 = 10_000.0;
    (value * SCALE).round() / SCALE
}
1124
/// Rounds `value` to six decimal places.
fn round6(value: f64) -> f64 {
    const SCALE: f64 = 1_000_000.0;
    (value * SCALE).round() / SCALE
}
1128
/// Returns the default output directory for context-eval runs.
///
/// The path is relative: `.harn-runs/context-eval/latest`.
pub fn context_eval_default_output_dir() -> PathBuf {
    const DEFAULT_OUTPUT_DIR: &str = ".harn-runs/context-eval/latest";
    PathBuf::from(DEFAULT_OUTPUT_DIR)
}
1132
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a text artifact fixture whose title mirrors its id.
    fn artifact(id: &str, kind: &str, text: &str) -> ArtifactRecord {
        ArtifactRecord {
            type_name: "artifact".to_string(),
            id: id.to_string(),
            kind: kind.to_string(),
            title: Some(id.to_string()),
            text: Some(text.to_string()),
            data: None,
            source: Some("fixture".to_string()),
            created_at: "2026-05-23T00:00:00Z".to_string(),
            freshness: Some("fresh".to_string()),
            priority: Some(80),
            lineage: Vec::new(),
            relevance: Some(1.0),
            estimated_tokens: None,
            stage: None,
            metadata: BTreeMap::new(),
        }
    }

    /// A mode of kind `cold` with every other field left at its default.
    fn cold_mode() -> ContextEvalMode {
        ContextEvalMode {
            id: "cold".to_string(),
            kind: "cold".to_string(),
            ..Default::default()
        }
    }

    /// A `hud_pack` mode that selects the runbook artifact and allows `read_file`.
    fn pack_mode() -> ContextEvalMode {
        ContextEvalMode {
            id: "pack".to_string(),
            kind: "hud_pack".to_string(),
            artifact_ids: vec!["runbook".to_string()],
            tool_disclosure: Some("limited".to_string()),
            tool_allowlist: vec!["read_file".to_string()],
            ..Default::default()
        }
    }

    /// A single task whose answer is found in the runbook artifact.
    fn rollback_task() -> ContextEvalTask {
        let runbook = artifact("runbook", "context_pack", "Use deploy rollback now.");
        let read_file_tool = ContextEvalTool {
            name: "read_file".to_string(),
            ..Default::default()
        };
        let read_event = ContextEvalToolEvent {
            order: Some(1),
            name: "read_file".to_string(),
            phase: Some("read".to_string()),
            success: Some(true),
            quality: Some("useful".to_string()),
            recovery: None,
        };
        let expected = ContextEvalExpected {
            required_terms: vec!["deploy rollback".to_string()],
            expected_artifact_ids: vec!["runbook".to_string()],
            expected_tools: vec!["read_file".to_string()],
            ..Default::default()
        };

        ContextEvalTask {
            id: "task".to_string(),
            objective: "Find the rollback command".to_string(),
            artifacts: vec![runbook],
            tools: vec![read_file_tool],
            tool_events: vec![read_event],
            expected,
            ..Default::default()
        }
    }

    #[test]
    fn context_eval_scores_modes_deterministically() {
        let manifest = ContextEvalManifest {
            type_name: CONTEXT_EVAL_MANIFEST_TYPE.to_string(),
            version: 1,
            id: "smoke".to_string(),
            modes: vec![cold_mode(), pack_mode()],
            tasks: vec![rollback_task()],
            ..Default::default()
        };

        let report = evaluate_context_eval_manifest(&manifest).expect("context eval succeeds");

        // One task across two modes gives two runs; only the pack mode should pass.
        assert_eq!(report.total_runs, 2);
        assert_eq!(report.passed_runs, 1);

        let cold_run = &report.runs[0];
        let pack_run = &report.runs[1];
        assert!(!cold_run.passed);
        assert!(pack_run.passed);
        assert_eq!(pack_run.reads_before_first_edit, 1);
        assert_eq!(pack_run.tool_call_quality.score, 1.0);
        // 64 characters is the length of a hex-encoded SHA-256 digest.
        assert_eq!(pack_run.cache.stable_input_hash.len(), 64);
    }
}