use std::collections::{BTreeMap, BTreeSet};
use std::fs;
use std::io::Write as _;
use std::path::{Path, PathBuf};

use harn_vm::llm_config;
use harn_vm::stdlib::template::{
    render_template_to_string_with_branch_trace, BranchDecision, LlmRenderContext,
    LlmRenderContextGuard,
};
use harn_vm::value::VmValue;
use serde_json::Value as JsonValue;

use crate::cli::{EvalPromptArgs, EvalPromptMode, EvalPromptOutput};
use crate::config;

use super::eval_prompt_context::{evaluate_context_fixtures, PromptContextEvalReport};

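/// Per-model render outcome: the resolved provider/model/family, a JSON
/// snapshot of the model's capabilities, the rendered prompt text (or the
/// render error), and the template branches taken while rendering.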
#[derive(Debug, Clone, serde::Serialize)]
struct ModelRender {
    selector: String,
    provider: String,
    model: String,
    family: String,
    capabilities: JsonValue,
    rendered: Option<String>,
    error: Option<String>,
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    branches: Vec<TemplateBranch>,
    auth_available: bool,
}

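/// Outcome of sending one rendered prompt to its model: the response text,
/// an error message, or `skipped: true` when the provider had no credentials.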
#[derive(Debug, Clone, serde::Serialize, Default)]
struct ModelRunResult {
    response: Option<String>,
    error: Option<String>,
    skipped: bool,
}

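/// Top-level report serialized by every output mode: one render per fleet
/// entry, plus optional run results, judge verdict, and context-fixture eval.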
#[derive(Debug, Clone, serde::Serialize)]
struct PromptReport {
    template_path: PathBuf,
    mode: &'static str,
    renders: Vec<ModelRender>,
    #[serde(skip_serializing_if = "BTreeMap::is_empty")]
    runs: BTreeMap<String, ModelRunResult>,
    #[serde(skip_serializing_if = "Option::is_none")]
    judge: Option<JudgeReport>,
    #[serde(skip_serializing_if = "Option::is_none")]
    context_eval: Option<PromptContextEvalReport>,
}

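/// Verdict returned by the judge model in `judge` mode.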
#[derive(Debug, Clone, serde::Serialize)]
struct JudgeReport {
    judge_model: String,
    verdict: String,
}

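/// A single branch decision recorded while rendering a template, keyed by
/// source location so diverging renders can be traced back to the template.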
#[derive(Debug, Clone, serde::Serialize)]
struct TemplateBranch {
    kind: String,
    template_uri: String,
    line: usize,
    col: usize,
    branch_id: String,
    #[serde(skip_serializing_if = "Option::is_none")]
    branch_label: Option<String>,
}

impl From<&BranchDecision> for TemplateBranch {
    fn from(decision: &BranchDecision) -> Self {
        Self {
            kind: decision.kind.as_str().to_string(),
            template_uri: decision.template_uri.clone(),
            line: decision.line,
            col: decision.col,
            branch_id: decision.branch_id.clone(),
            branch_label: decision.branch_label.clone(),
        }
    }
}

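/// Entry point for `harn eval prompt`. Exit codes: 0 on success; 2 when the
/// fleet cannot be resolved or is empty; 1 for all other failures (I/O,
/// render, run, judge, or failed context gates).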
pub async fn run(args: EvalPromptArgs) -> i32 {
    let template_path = match fs::canonicalize(&args.file) {
        Ok(p) => p,
        Err(error) => {
            eprintln!(
                "error: cannot resolve template path {}: {error}",
                args.file.display()
            );
            return 1;
        }
    };
    let template_source = match fs::read_to_string(&template_path) {
        Ok(s) => s,
        Err(error) => {
            eprintln!("error: failed to read {}: {error}", template_path.display());
            return 1;
        }
    };

    let fleet = match resolve_fleet(&args, &template_path) {
        Ok(f) => f,
        Err(error) => {
            eprintln!("error: {error}");
            return 2;
        }
    };
    if fleet.is_empty() {
        eprintln!("error: fleet is empty — supply `--fleet <models>` or `--fleet-name <name>`");
        return 2;
    }

    let bindings = match load_bindings(args.bindings.as_deref()) {
        Ok(b) => b,
        Err(error) => {
            eprintln!("error: {error}");
            return 1;
        }
    };

    let renders = render_fleet(&fleet, &template_source, &template_path, bindings.as_ref());

    let mode = args.mode;
    let mut report = PromptReport {
        template_path: template_path.clone(),
        mode: mode_label(mode),
        renders,
        runs: BTreeMap::new(),
        judge: None,
        context_eval: None,
    };

    if !args.context_fixture.is_empty() {
        match evaluate_context_fixtures(
            &args.context_fixture,
            &fleet,
            &template_source,
            &template_path,
            bindings.as_ref(),
        ) {
            Ok(context_eval) => report.context_eval = Some(context_eval),
            Err(error) => {
                eprintln!("error: {error}");
                return 1;
            }
        }
    }

    if matches!(mode, EvalPromptMode::Run | EvalPromptMode::Judge) {
        let bindings_text = args
            .bindings
            .as_ref()
            .map(|p| p.to_string_lossy().to_string());
        let outputs = execute_runs(
            &report.renders,
            &template_path,
            bindings_text.as_deref(),
            args.max_tokens,
            args.max_concurrent,
            args.fail_on_unauthorized,
        )
        .await;
        match outputs {
            Ok(map) => report.runs = map,
            Err(code) => return code,
        }
    }

    if matches!(mode, EvalPromptMode::Judge) {
        match execute_judge(
            &report,
            args.judge_template.as_deref(),
            &args.judge_model,
            args.max_tokens,
        )
        .await
        {
            Ok(judge) => report.judge = Some(judge),
            Err(code) => return code,
        }
    }

    let payload = match args.output {
        EvalPromptOutput::Terminal => render_terminal(&report),
        EvalPromptOutput::Json => render_json(&report),
        EvalPromptOutput::Html => render_html(&report),
    };

    match args.out_file {
        Some(path) => {
            if let Err(error) = fs::write(&path, payload) {
                eprintln!("error: failed to write {}: {error}", path.display());
                return 1;
            }
            eprintln!("wrote {}", path.display());
        }
        None => {
            let mut stdout = std::io::stdout().lock();
            let _ = stdout.write_all(payload.as_bytes());
        }
    }

    let context_eval_active = report.context_eval.is_some();
    if !context_eval_active && report.renders.iter().any(|r| r.error.is_some()) {
        return 1;
    }
    if report.runs.values().any(|r| r.error.is_some()) {
        return 1;
    }
    if report
        .context_eval
        .as_ref()
        .is_some_and(|context_eval| !context_eval.pass)
    {
        return 1;
    }
    0
}

fn mode_label(mode: EvalPromptMode) -> &'static str {
    match mode {
        EvalPromptMode::Render => "render",
        EvalPromptMode::Run => "run",
        EvalPromptMode::Judge => "judge",
    }
}

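/// Resolves the model fleet either from `--fleet-name` (looked up under
/// `[eval.fleets.*]` in harn.toml) or from the raw `--fleet` selectors,
/// trimming, de-duplicating, and expanding each selector via `llm_config`.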
fn resolve_fleet(args: &EvalPromptArgs, template_path: &Path) -> Result<Vec<FleetEntry>, String> {
    let raw_selectors: Vec<String> = if let Some(name) = args.fleet_name.as_ref() {
        let cfg = config::load_for_path(template_path)
            .map_err(|error| format!("failed to load harn.toml: {error}"))?;
        let Some(fleet) = cfg.eval.fleets.get(name) else {
            let available: Vec<&str> = cfg.eval.fleets.keys().map(|s| s.as_str()).collect();
            return Err(if available.is_empty() {
                format!("unknown fleet `{name}` — no `[eval.fleets.*]` entries found in harn.toml")
            } else {
                format!(
                    "unknown fleet `{name}` — known fleets: {}",
                    available.join(", "),
                )
            });
        };
        fleet.models.clone()
    } else {
        args.fleet.clone()
    };

    let mut seen = BTreeSet::new();
    let mut out = Vec::new();
    for selector in raw_selectors {
        let trimmed = selector.trim();
        if trimmed.is_empty() {
            continue;
        }
        if !seen.insert(trimmed.to_string()) {
            continue;
        }
        let resolved = llm_config::resolve_model_info(trimmed);
        out.push(FleetEntry {
            selector: trimmed.to_string(),
            provider: resolved.provider,
            model: resolved.id,
        });
    }
    Ok(out)
}

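/// One resolved fleet member: the user-supplied selector plus the
/// provider/model pair it expanded to.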
#[derive(Debug, Clone)]
pub(crate) struct FleetEntry {
    pub(crate) selector: String,
    pub(crate) provider: String,
    pub(crate) model: String,
}

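/// Loads template bindings from a JSON file; the top level must be an object.
/// Returns `Ok(None)` when no bindings file was supplied.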
fn load_bindings(path: Option<&Path>) -> Result<Option<VmValue>, String> {
    let Some(path) = path else {
        return Ok(None);
    };
    let raw = fs::read_to_string(path)
        .map_err(|error| format!("failed to read bindings {}: {error}", path.display()))?;
    let json: JsonValue = serde_json::from_str(&raw)
        .map_err(|error| format!("failed to parse bindings {}: {error}", path.display()))?;
    if !json.is_object() {
        return Err(format!(
            "bindings file {} must be a JSON object at the top level",
            path.display(),
        ));
    }
    Ok(Some(harn_vm::json_to_vm_value(&json)))
}

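/// Renders the template once per fleet entry. Each render runs inside an
/// `LlmRenderContextGuard` so the template's `llm.*` variables reflect that
/// entry's provider, model, family, and capabilities.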
fn render_fleet(
    fleet: &[FleetEntry],
    template_source: &str,
    template_path: &Path,
    bindings: Option<&VmValue>,
) -> Vec<ModelRender> {
    let base = template_path.parent();
    let bindings_dict: Option<BTreeMap<String, VmValue>> = bindings.and_then(|v| match v {
        VmValue::Dict(dict) => Some(dict.as_ref().clone()),
        _ => None,
    });

    fleet
        .iter()
        .map(|entry| {
            let ctx = LlmRenderContext::resolve(&entry.provider, &entry.model);
            let family = ctx.family.clone();
            let capabilities = vm_value_to_json(&ctx.capabilities);
            let auth_available = llm_config::provider_key_available(&entry.provider);

            let result = {
                let _guard = LlmRenderContextGuard::enter(ctx);
                render_template_to_string_with_branch_trace(
                    template_source,
                    bindings_dict.as_ref(),
                    base,
                    Some(template_path),
                )
            };

            let (rendered, branches, error) = match result {
                Ok((text, trace)) => (
                    Some(text),
                    trace.iter().map(TemplateBranch::from).collect(),
                    None,
                ),
                Err(message) => (None, Vec::new(), Some(message)),
            };

            ModelRender {
                selector: entry.selector.clone(),
                provider: entry.provider.clone(),
                model: entry.model.clone(),
                family,
                capabilities,
                rendered,
                error,
                branches,
                auth_available,
            }
        })
        .collect()
}

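/// Best-effort conversion of a `VmValue` into JSON; non-data values (e.g.
/// functions) degrade to a `"<type-name>"` placeholder string.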
fn vm_value_to_json(value: &VmValue) -> JsonValue {
    match value {
        VmValue::Nil => JsonValue::Null,
        VmValue::Bool(b) => JsonValue::Bool(*b),
        VmValue::Int(i) => JsonValue::Number((*i).into()),
        VmValue::Float(f) => serde_json::Number::from_f64(*f)
            .map(JsonValue::Number)
            .unwrap_or(JsonValue::Null),
        VmValue::String(s) => JsonValue::String(s.to_string()),
        VmValue::List(items) => JsonValue::Array(items.iter().map(vm_value_to_json).collect()),
        VmValue::Dict(d) => {
            let mut map = serde_json::Map::new();
            for (k, v) in d.iter() {
                map.insert(k.clone(), vm_value_to_json(v));
            }
            JsonValue::Object(map)
        }
        other => JsonValue::String(format!("<{}>", other.type_name())),
    }
}

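/// Sends each successfully rendered prompt to its model by generating and
/// executing a harn script, collecting one `ModelRunResult` per selector.
/// Unauthenticated providers are skipped (or fail the whole run when
/// `--fail-on-unauthorized` is set) unless the mock provider is active.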
async fn execute_runs(
    renders: &[ModelRender],
    template_path: &Path,
    bindings_path: Option<&str>,
    max_tokens: i64,
    max_concurrent: usize,
    fail_on_unauthorized: bool,
) -> Result<BTreeMap<String, ModelRunResult>, i32> {
    let mut runnable: Vec<&ModelRender> = Vec::new();
    let mut runs: BTreeMap<String, ModelRunResult> = BTreeMap::new();
    let mock_active = std::env::var("HARN_LLM_PROVIDER")
        .map(|v| v == "mock")
        .unwrap_or(false);
    for render in renders {
        if render.error.is_some() {
            runs.insert(
                render.selector.clone(),
                ModelRunResult {
                    error: Some("template render failed — see render section".to_string()),
                    ..Default::default()
                },
            );
            continue;
        }
        if !mock_active && !render.auth_available {
            if fail_on_unauthorized {
                eprintln!(
                    "error: provider `{}` (for `{}`) has no credentials configured",
                    render.provider, render.selector,
                );
                return Err(1);
            }
            eprintln!(
                "warn: provider `{}` (for `{}`) unauthenticated — skipping run",
                render.provider, render.selector,
            );
            runs.insert(
                render.selector.clone(),
                ModelRunResult {
                    skipped: true,
                    ..Default::default()
                },
            );
            continue;
        }
        runnable.push(render);
    }
    if runnable.is_empty() {
        return Ok(runs);
    }

    let script = build_run_script(
        &runnable,
        template_path,
        bindings_path,
        max_tokens,
        max_concurrent.max(1),
    );
    let outputs = match invoke_harn_script(&script).await {
        Ok(out) => out,
        Err(err) => {
            eprintln!("error: run-mode harn script failed: {err}");
            return Err(1);
        }
    };
    for line in outputs.lines() {
        if line.trim().is_empty() {
            continue;
        }
        let entry: HarnRunLine = match serde_json::from_str(line) {
            Ok(e) => e,
            Err(_) => continue,
        };
        let result = ModelRunResult {
            response: entry.response,
            error: entry.error,
            skipped: false,
        };
        runs.insert(entry.selector, result);
    }
    Ok(runs)
}

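/// One JSONL line printed by the generated run script, e.g.
/// `{"selector":"...","response":"..."}` on success or
/// `{"selector":"...","error":"..."}` on failure.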
#[derive(Debug, serde::Deserialize)]
struct HarnRunLine {
    selector: String,
    #[serde(default)]
    response: Option<String>,
    #[serde(default)]
    error: Option<String>,
}

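/// Builds the harn script executed in run mode: for each fleet entry it
/// pushes the matching LLM render context, renders the template, calls the
/// model, and prints one JSON result line per entry. The generated loop runs
/// entries sequentially; `_max_concurrent` is accepted but currently unused.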
fn build_run_script(
    fleet: &[&ModelRender],
    template_path: &Path,
    bindings_path: Option<&str>,
    max_tokens: i64,
    _max_concurrent: usize,
) -> String {
    let template_path_lit = json_string_literal(&template_path.to_string_lossy());
    let bindings_load = if let Some(path) = bindings_path {
        let path_lit = json_string_literal(path);
        format!("    let bindings = json_parse(read_file({path_lit}))\n")
    } else {
        "    let bindings = {}\n".to_string()
    };
    let fleet_items: Vec<String> = fleet
        .iter()
        .map(|r| {
            format!(
                "        {{selector: {}, provider: {}, model: {}}}",
                json_string_literal(&r.selector),
                json_string_literal(&r.provider),
                json_string_literal(&r.model),
            )
        })
        .collect();
    let fleet_list = if fleet_items.is_empty() {
        "[]".to_string()
    } else {
        format!("[\n{}\n    ]", fleet_items.join(",\n"))
    };

    format!(
        "pipeline main() {{\n\
{bindings_load}\
    let fleet = {fleet_list}\n\
    for entry in fleet {{\n\
        let pushed = __push_llm_render_context(entry.provider, entry.model)\n\
        let rendered = render({template_path_lit}, bindings)\n\
        try {{\n\
            let resp = llm_call(rendered, nil, {{\n\
                provider: entry.provider,\n\
                model: entry.model,\n\
                max_tokens: {max_tokens}\n\
            }})\n\
            println(json_stringify({{selector: entry.selector, response: resp}}))\n\
        }} catch (err) {{\n\
            println(json_stringify({{selector: entry.selector, error: to_string(err)}}))\n\
        }}\n\
        if pushed {{\n\
            __pop_llm_render_context()\n\
        }}\n\
    }}\n\
}}\n",
    )
}

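/// Writes the generated script to a temp file and executes it through the
/// regular `harn run` path, returning its stdout (the JSONL results).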
async fn invoke_harn_script(script: &str) -> Result<String, String> {
    use std::collections::HashSet;
    let tmp = tempfile::Builder::new()
        .prefix("harn-eval-prompt-")
        .suffix(".harn")
        .tempfile()
        .map_err(|e| format!("tempfile: {e}"))?;
    fs::write(tmp.path(), script).map_err(|e| format!("write tempfile: {e}"))?;

    let outcome = crate::commands::run::execute_run(
        &tmp.path().to_string_lossy(),
        false,
        HashSet::new(),
        Vec::new(),
        Vec::new(),
        crate::commands::run::CliLlmMockMode::Off,
        None,
        crate::commands::run::RunProfileOptions::default(),
    )
    .await;

    if outcome.exit_code != 0 {
        return Err(format!(
            "harn run exited {} — stderr:\n{}",
            outcome.exit_code, outcome.stderr,
        ));
    }
    Ok(outcome.stdout)
}

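/// Quotes and escapes a string as a JSON string literal so values can be
/// spliced safely into the generated scripts.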
fn json_string_literal(value: &str) -> String {
    serde_json::Value::String(value.to_string()).to_string()
}

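/// Built-in judge prompt used when `--judge-template` is not supplied; it
/// receives `template_source` and the per-model `entries` list as bindings.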
const DEFAULT_JUDGE_TEMPLATE: &str = r#"You are a strict-equivalence judge for prompt-engineering output.

The same logical prompt was rendered for several models and each model returned a response. Your task is to determine whether the responses are *semantically equivalent* — the wire envelope may differ (XML vs markdown vs native tool calls), but the user-facing intent and information content should be the same.

Source prompt template (for context):

{{ template_source }}

Per-model responses:
{{ for entry in entries }}
---
model: {{ entry.selector }} (provider={{ entry.provider }}, family={{ entry.family }})

rendered prompt:
{{ entry.rendered }}

response:
{{ entry.response }}
{{ end }}

Reply with a short JSON object on a single line of the form:
{"equivalent": true|false, "differences": ["..."], "notes": "..."}
"#;

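/// Renders the judge prompt over all model responses and asks the judge
/// model for a single-line verdict via a generated harn script.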
async fn execute_judge(
    report: &PromptReport,
    judge_template: Option<&Path>,
    judge_model: &str,
    max_tokens: i64,
) -> Result<JudgeReport, i32> {
    let judge_template_body = match judge_template {
        Some(path) => fs::read_to_string(path).map_err(|error| {
            eprintln!(
                "error: failed to read judge template {}: {error}",
                path.display()
            );
            1i32
        })?,
        None => DEFAULT_JUDGE_TEMPLATE.to_string(),
    };
    let prompt_source = fs::read_to_string(&report.template_path).unwrap_or_default();

    let entries: Vec<JudgeEntry> = report
        .renders
        .iter()
        .map(|r| JudgeEntry {
            selector: r.selector.clone(),
            provider: r.provider.clone(),
            family: r.family.clone(),
            rendered: r.rendered.clone().unwrap_or_default(),
            response: report
                .runs
                .get(&r.selector)
                .and_then(|run| run.response.clone())
                .unwrap_or_else(|| "<no response>".to_string()),
        })
        .collect();

    let entries_json = serde_json::to_string(&entries).unwrap_or_else(|_| "[]".to_string());
    let template_lit = json_string_literal(&judge_template_body);
    let entries_lit = json_string_literal(&entries_json);
    let source_lit = json_string_literal(&prompt_source);

    let resolved_judge = llm_config::resolve_model_info(judge_model);
    let provider_lit = json_string_literal(&resolved_judge.provider);
    let model_lit = json_string_literal(&resolved_judge.id);

    let script = format!(
        "pipeline main() {{\n\
    let entries = json_parse({entries_lit})\n\
    let prompt = render_string({template_lit}, {{\n\
        template_source: {source_lit},\n\
        entries: entries\n\
    }})\n\
    let verdict = llm_call(prompt, nil, {{\n\
        provider: {provider_lit},\n\
        model: {model_lit},\n\
        max_tokens: {max_tokens}\n\
    }})\n\
    println(verdict)\n\
}}\n",
    );

    let verdict = match invoke_harn_script(&script).await {
        Ok(out) => out.trim().to_string(),
        Err(err) => {
            eprintln!("error: judge-mode harn script failed: {err}");
            return Err(1);
        }
    };

    Ok(JudgeReport {
        judge_model: judge_model.to_string(),
        verdict,
    })
}

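/// One row handed to the judge template's `entries` binding.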
#[derive(Debug, serde::Serialize)]
struct JudgeEntry {
    selector: String,
    provider: String,
    family: String,
    rendered: String,
    response: String,
}

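/// Formats the report as plain text for the terminal, including a line-set
/// diff of each render against the first successful render (labeled #0).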
fn render_terminal(report: &PromptReport) -> String {
    let mut out = String::new();
    out.push_str(&format!(
        "# harn eval prompt — {} (mode: {})\n\n",
        report.template_path.display(),
        report.mode,
    ));

    let baseline_lines: Option<Vec<&str>> = report
        .renders
        .iter()
        .find_map(|r| r.rendered.as_deref())
        .map(|s| s.lines().collect());

    for (idx, render) in report.renders.iter().enumerate() {
        out.push_str(&format!(
            "## [{idx}] {} ({}/{}) family={}\n",
            render.selector, render.provider, render.model, render.family,
        ));
        if !render.auth_available {
            out.push_str(" auth: not configured\n");
        }
        if let Some(error) = render.error.as_ref() {
            out.push_str(&format!(" render error: {error}\n\n"));
            continue;
        }
        let Some(rendered) = render.rendered.as_deref() else {
            continue;
        };
        out.push_str("---\n");
        out.push_str(rendered);
        if !rendered.ends_with('\n') {
            out.push('\n');
        }
        out.push_str("---\n");
        if idx > 0 {
            if let Some(baseline) = baseline_lines.as_deref() {
                let summary = line_diff_summary(baseline, &rendered.lines().collect::<Vec<_>>());
                if !summary.is_empty() {
                    out.push_str(&format!(" diff vs #0: {summary}\n"));
                }
            }
        }
        out.push('\n');
    }

    if let Some(context_eval) = report.context_eval.as_ref() {
        out.push_str(&format!(
            "\n# Context fixture gates: {} passed / {} total\n",
            context_eval.passed, context_eval.total,
        ));
        for fixture in &context_eval.fixtures {
            out.push_str(&format!(
                "\n## {} ({} passed / {} total)\n",
                fixture.path.display(),
                fixture.passed,
                fixture.total,
            ));
            for case in &fixture.cases {
                out.push_str(&format!(
                    "- {}: {} score={:.3} selected=[{}] tokens={}/{}\n",
                    case.id,
                    if case.pass { "pass" } else { "fail" },
                    case.score.overall,
                    case.selected_artifact_ids.join(", "),
                    case.budget.total_tokens,
                    case.budget.budget_tokens,
                ));
                for failure in &case.failures {
                    out.push_str(&format!(" failure: {failure}\n"));
                }
            }
        }
    }

    if !report.runs.is_empty() {
        out.push_str("\n# Model responses\n");
        for render in &report.renders {
            let Some(run) = report.runs.get(&render.selector) else {
                continue;
            };
            out.push_str(&format!("\n## {} ({})\n", render.selector, render.model));
            if run.skipped {
                out.push_str(" skipped: unauthenticated provider\n");
                continue;
            }
            if let Some(error) = run.error.as_ref() {
                out.push_str(&format!(" error: {error}\n"));
                continue;
            }
            if let Some(response) = run.response.as_deref() {
                out.push_str("---\n");
                out.push_str(response);
                if !response.ends_with('\n') {
                    out.push('\n');
                }
                out.push_str("---\n");
            }
        }
    }

    if let Some(judge) = report.judge.as_ref() {
        out.push_str(&format!(
            "\n# Judge verdict ({}):\n{}\n",
            judge.judge_model, judge.verdict,
        ));
    }

    out
}

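/// Order-insensitive line diff used for terminal output: counts distinct
/// lines unique to either side, or notes a length mismatch when the line
/// sets agree. Returns an empty string when sets and line counts both match.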
fn line_diff_summary(baseline: &[&str], candidate: &[&str]) -> String {
    let baseline_set: BTreeSet<&str> = baseline.iter().copied().collect();
    let candidate_set: BTreeSet<&str> = candidate.iter().copied().collect();
    let only_in_baseline = baseline_set.difference(&candidate_set).count();
    let only_in_candidate = candidate_set.difference(&baseline_set).count();
    if only_in_baseline == 0 && only_in_candidate == 0 {
        let total_baseline = baseline.len();
        let total_candidate = candidate.len();
        if total_baseline == total_candidate {
            String::new()
        } else {
            format!(
                "{} vs {} lines (same content set, different ordering or repeats)",
                total_baseline, total_candidate,
            )
        }
    } else {
        format!(
            "{} line(s) only in baseline, {} line(s) only here",
            only_in_baseline, only_in_candidate,
        )
    }
}

fn render_json(report: &PromptReport) -> String {
    match serde_json::to_string_pretty(report) {
        Ok(s) => format!("{s}\n"),
        Err(error) => format!("{{\"error\": \"serialize: {error}\"}}\n"),
    }
}

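/// Formats the report as a standalone HTML page with one card per render.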
fn render_html(report: &PromptReport) -> String {
    let mut out = String::new();
    out.push_str(
        "<!doctype html><html><head><meta charset=\"utf-8\"><title>harn eval prompt report</title>",
    );
    out.push_str(
        "<style>body{font-family:system-ui,sans-serif;margin:2rem;color:#222}h1{margin-bottom:0}",
    );
    out.push_str(".meta{color:#666;font-size:0.9rem;margin-bottom:1.5rem}");
    out.push_str(
        ".grid{display:grid;grid-template-columns:repeat(auto-fit,minmax(28rem,1fr));gap:1rem}",
    );
    out.push_str(".card{border:1px solid #ddd;border-radius:6px;padding:1rem;background:#fafafa}");
    out.push_str(".card h2{margin-top:0;font-size:1rem}");
    out.push_str("pre{background:#fff;border:1px solid #eee;padding:0.75rem;overflow:auto;white-space:pre-wrap;font-size:0.85rem}");
    out.push_str(".err{color:#b00}.skip{color:#888;font-style:italic}");
    out.push_str("</style></head><body>");
    out.push_str(&format!(
        "<h1>harn eval prompt</h1><div class=\"meta\">{} · mode: {}</div>",
        html_escape(&report.template_path.to_string_lossy()),
        report.mode,
    ));
    out.push_str("<div class=\"grid\">");
    for render in &report.renders {
        out.push_str(&format!(
            "<div class=\"card\"><h2>{} <span class=\"meta\">({} / {} · {})</span></h2>",
            html_escape(&render.selector),
            html_escape(&render.provider),
            html_escape(&render.model),
            html_escape(&render.family),
        ));
        if !render.auth_available {
            out.push_str("<p class=\"skip\">auth: not configured</p>");
        }
        match (&render.rendered, &render.error) {
            (_, Some(error)) => {
                out.push_str(&format!(
                    "<p class=\"err\">render error: {}</p>",
                    html_escape(error)
                ));
            }
            (Some(rendered), _) => {
                out.push_str(&format!("<pre>{}</pre>", html_escape(rendered)));
            }
            _ => {}
        }
        if let Some(run) = report.runs.get(&render.selector) {
            if run.skipped {
                out.push_str("<p class=\"skip\">run: skipped (no credentials)</p>");
            } else if let Some(err) = run.error.as_ref() {
                out.push_str(&format!(
                    "<p class=\"err\">run error: {}</p>",
                    html_escape(err)
                ));
            } else if let Some(response) = run.response.as_ref() {
                out.push_str("<h3>response</h3>");
                out.push_str(&format!("<pre>{}</pre>", html_escape(response)));
            }
        }
        out.push_str("</div>");
    }
    out.push_str("</div>");
    if let Some(context_eval) = report.context_eval.as_ref() {
        out.push_str(&format!(
            "<h2>Context fixture gates</h2><p>{} passed / {} total</p>",
            context_eval.passed, context_eval.total,
        ));
        for fixture in &context_eval.fixtures {
            out.push_str(&format!(
                "<h3>{}</h3><ul>",
                html_escape(&fixture.path.to_string_lossy()),
            ));
            for case in &fixture.cases {
                out.push_str(&format!(
                    "<li><strong>{}</strong>: {} · score {:.3} · selected [{}] · tokens {}/{}</li>",
                    html_escape(&case.id),
                    if case.pass { "pass" } else { "fail" },
                    case.score.overall,
                    html_escape(&case.selected_artifact_ids.join(", ")),
                    case.budget.total_tokens,
                    case.budget.budget_tokens,
                ));
            }
            out.push_str("</ul>");
        }
    }
    if let Some(judge) = report.judge.as_ref() {
        out.push_str(&format!(
            "<h2>Judge ({})</h2><pre>{}</pre>",
            html_escape(&judge.judge_model),
            html_escape(&judge.verdict),
        ));
    }
    out.push_str("</body></html>\n");
    out
}

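/// Minimal HTML escaping for text interpolated into the HTML report.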
fn html_escape(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    for c in s.chars() {
        match c {
            '&' => out.push_str("&amp;"),
            '<' => out.push_str("&lt;"),
            '>' => out.push_str("&gt;"),
            '"' => out.push_str("&quot;"),
            '\'' => out.push_str("&#39;"),
            _ => out.push(c),
        }
    }
    out
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn fleet_resolution_dedupes_and_expands_aliases() {
        let args = EvalPromptArgs {
            file: PathBuf::from("/tmp/missing.harn.prompt"),
            fleet: vec![
                "claude-3-5-sonnet".to_string(),
                "claude-3-5-sonnet".to_string(),
                "ollama:qwen3.5".to_string(),
            ],
            fleet_name: None,
            bindings: None,
            context_fixture: Vec::new(),
            mode: EvalPromptMode::Render,
            output: EvalPromptOutput::Terminal,
            out_file: None,
            max_concurrent: 1,
            judge_template: None,
            judge_model: "claude-opus-4-7".to_string(),
            max_tokens: 256,
            fail_on_unauthorized: false,
        };
        let entries = resolve_fleet(&args, Path::new("/tmp")).expect("resolve");
        assert_eq!(entries.len(), 2);
        assert_eq!(entries[0].selector, "claude-3-5-sonnet");
        assert_eq!(entries[1].selector, "ollama:qwen3.5");
        assert_eq!(entries[1].provider, "ollama");
        assert_eq!(entries[1].model, "qwen3.5");
    }

    #[test]
    fn render_fleet_emits_per_capability_envelope() {
        let template = "{{ if llm.capabilities.native_tools }}native{{ else }}text{{ end }}\n";
        let fleet = vec![FleetEntry {
            selector: "ollama:qwen3.5".to_string(),
            provider: "ollama".to_string(),
            model: "qwen3.5".to_string(),
        }];
        let renders = render_fleet(&fleet, template, Path::new("/tmp/x.harn.prompt"), None);
        assert_eq!(renders.len(), 1);
        assert!(renders[0].error.is_none(), "{:?}", renders[0].error);
        assert!(renders[0].rendered.is_some());
    }

    #[test]
    fn line_diff_summary_reports_unique_lines() {
        let baseline = vec!["a", "b", "c"];
        let candidate = vec!["a", "b", "d"];
        let summary = line_diff_summary(&baseline, &candidate);
        assert!(summary.contains("1 line(s) only in baseline"));
        assert!(summary.contains("1 line(s) only here"));
    }

    #[test]
    fn line_diff_summary_quiet_on_identical() {
        let baseline = vec!["a", "b", "c"];
        let candidate = vec!["a", "b", "c"];
        assert_eq!(line_diff_summary(&baseline, &candidate), "");
    }
}