1use crate::registry::RegisteredAgent;
17use crate::runner::AgentRunResult;
18use crate::safety_pipeline::{
19 execute_candidate_edit, CandidateExecutionConfig, CandidateExecutionContext,
20};
21use crate::{
22 diagnose_run, split_dataset, EvaluationDataset, ExperimentLedger, FailureKind, HookDecision,
23 HookPolicy, OptimizationBudget, PromptVariantRecord, ScorerMetadata, TraceDiagnosis,
24};
25use mdx_rust_analysis::editing::ProposedEdit;
26use mdx_rust_analysis::editing::ValidationCommandRecord;
27use mdx_rust_analysis::AgentBundle;
28use schemars::JsonSchema;
29use serde::{Deserialize, Serialize};
30use std::path::{Path, PathBuf};
31use std::time::Duration;
32
/// Builds a git-style unified diff that replaces `old` with `new` in `source`.
///
/// * `file_path` - path written into the `diff --git` / `---` / `+++` header lines.
/// * `source`    - current contents of the file being patched.
/// * `old`       - text to replace; may span several lines.
/// * `new`       - replacement text; may span several lines.
///
/// The hunk covers the first place `old` occurs, surrounded by up to three
/// lines of context on each side. Every occurrence of `old` inside the matched
/// lines is replaced. Removed and added lines are each emitted with their own
/// `-` / `+` prefix, and the hunk header counts reflect the real number of
/// lines on both sides, so multi-line `old` or `new` values still produce a
/// well-formed hunk.
///
/// When `old` cannot be located line-wise (not present at all, or only present
/// across line endings that `str::lines` normalizes away), a placeholder
/// `@@ -1,1 +1,1 @@` hunk is emitted instead. The "not present" variant ends
/// with a trailing newline; every other variant does not.
fn generate_preamble_patch(file_path: &Path, source: &str, old: &str, new: &str) -> String {
    const CONTEXT_LINES: usize = 3;

    let diff_path = file_path.to_string_lossy();
    let file_header =
        format!("diff --git a/{diff_path} b/{diff_path}\n--- a/{diff_path}\n+++ b/{diff_path}");

    if !source.contains(old) {
        return format!("{file_header}\n@@ -1,1 +1,1 @@\n-{old}\n+{new}\n");
    }

    let lines: Vec<&str> = source.lines().collect();

    // Number of source lines `old` touches. `split` always yields at least one
    // piece, so this is a valid (non-zero) window size.
    let old_span = old.split('\n').count();
    let located = lines.windows(old_span).position(|window| match window {
        [only] => only.contains(old),
        many => many.join("\n").contains(old),
    });

    let Some(first) = located else {
        return format!("{file_header}\n@@ -1,1 +1,1 @@\n-{old}\n+{new}");
    };

    let end = first + old_span;
    let hunk_start = first.saturating_sub(CONTEXT_LINES);
    let context_before = &lines[hunk_start..first];
    let context_after = &lines[end..(end + CONTEXT_LINES).min(lines.len())];
    let removed = &lines[first..end];

    let replacement = removed.join("\n").replace(old, new);
    // `split` (not `lines`) so that an empty replacement still yields one "+" line.
    let added: Vec<&str> = replacement.split('\n').collect();

    let shared_context = context_before.len() + context_after.len();
    let mut patch_lines = vec![
        file_header,
        format!(
            "@@ -{},{} +{},{} @@",
            hunk_start + 1,
            shared_context + removed.len(),
            hunk_start + 1,
            shared_context + added.len()
        ),
    ];
    patch_lines.extend(context_before.iter().map(|line| format!(" {line}")));
    patch_lines.extend(removed.iter().map(|line| format!("-{line}")));
    patch_lines.extend(added.iter().map(|line| format!("+{line}")));
    patch_lines.extend(context_after.iter().map(|line| format!(" {line}")));

    patch_lines.join("\n")
}
98
/// Settings for a single `run_optimization` invocation.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct OptimizeConfig {
    /// Number of iterations the optimization loop runs (`0..max_iterations`).
    pub max_iterations: u32,
    /// Requested candidates per iteration; the effective limit comes from
    /// `budget.candidate_limit(candidates_per_iteration)`.
    pub candidates_per_iteration: u32,
    /// NOTE(review): not read by `run_optimization` in this file — presumably
    /// toggles an LLM-based judge elsewhere; confirm.
    pub use_llm_judge: bool,
    /// Budget used to split the dataset, seed the experiment ledger, and cap
    /// the number of candidates tried per iteration.
    #[serde(default)]
    pub budget: OptimizationBudget,
    /// Hook policy forwarded to candidate execution.
    #[serde(default)]
    pub hook_policy: HookPolicy,
    /// Forwarded to candidate execution as `review_before_apply`.
    #[serde(default)]
    pub review_before_apply: bool,
    /// Forwarded to candidate execution; also suppresses the final
    /// re-evaluation line that `run_optimization` prints.
    #[serde(default)]
    pub quiet: bool,
    /// Timeout forwarded to candidate execution. Never (de)serialized and not
    /// part of the JSON schema; deserialization fills it from
    /// `default_candidate_timeout`.
    #[serde(skip, default = "default_candidate_timeout")]
    #[schemars(skip)]
    pub candidate_timeout: Duration,
}
119
/// Record of one optimization iteration, as produced by `run_optimization`.
///
/// Most optional fields are only populated when a candidate was accepted in
/// the iteration; all `#[serde(default)]` fields may be absent when reading
/// records back.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct OptimizationRun {
    /// Zero-based iteration index.
    pub iteration: u32,
    /// Mechanical score of each training input evaluated this iteration.
    pub scores: Vec<f32>,
    /// Sum of the `validated` counts reported by candidate execution.
    pub validated_changes: u32,
    /// Sum of the `landed` counts reported by candidate execution.
    pub landed_changes: u32,
    /// `accepted` count of the accepted candidate, or 0 when none was accepted.
    pub accepted_changes: u32,
    /// Free-text log of what happened during the iteration.
    pub notes: String,
    /// Every candidate proposed for this iteration (from the LLM diagnosis or
    /// the trace-based fallback), including ones that were never executed.
    pub candidates: Vec<Candidate>,
    /// Diff of the accepted edit, if any.
    #[serde(default)]
    pub diff: Option<String>,
    /// Hash of the policy file found for the agent, if one was found.
    #[serde(default)]
    pub policy_hash: Option<String>,
    /// Version string of the evaluation dataset.
    #[serde(default)]
    pub dataset_version: Option<String>,
    /// Content hash of the evaluation dataset.
    #[serde(default)]
    pub dataset_hash: Option<String>,
    /// Baseline score measured before the first iteration; set only when a
    /// change was accepted.
    #[serde(default)]
    pub baseline_score: Option<f32>,
    /// Score reported for the accepted edit; set only when a change was accepted.
    #[serde(default)]
    pub patched_score: Option<f32>,
    /// Score delta reported for the accepted edit; set only when a change was accepted.
    #[serde(default)]
    pub score_delta: Option<f32>,

    /// Short git `HEAD` SHA captured before the loop started; set only when a
    /// change was accepted.
    #[serde(default)]
    pub git_sha_before: Option<String>,
    /// Short git `HEAD` SHA captured after the accepted change.
    #[serde(default)]
    pub git_sha_after: Option<String>,
    /// Hash of `diff`.
    #[serde(default)]
    pub diff_hash: Option<String>,
    /// Whether `git status --porcelain` printed anything after the accepted change.
    #[serde(default)]
    pub working_tree_dirty_after: Option<bool>,
    /// Label of the scorer used; set only when a change was accepted.
    #[serde(default)]
    pub scorer: Option<String>,
    /// Fixed, human-readable list of validation steps; set only when a change
    /// was accepted.
    #[serde(default)]
    pub validation_commands: Option<Vec<String>>,
    /// Validation command records reported for the accepted candidate.
    #[serde(default)]
    pub validation_command_records: Vec<ValidationCommandRecord>,
    /// Final validation command records reported for the accepted candidate.
    #[serde(default)]
    pub final_validation_command_records: Vec<ValidationCommandRecord>,
    /// One diagnosis per training input evaluated this iteration.
    #[serde(default)]
    pub trace_diagnosis: Vec<TraceDiagnosis>,
    /// Hook decisions collected from every candidate executed this iteration.
    #[serde(default)]
    pub hook_decisions: Vec<HookDecision>,
    /// Snapshot (clone) of the experiment ledger at the end of the iteration.
    #[serde(default)]
    pub ledger: Option<ExperimentLedger>,
    /// Holdout score reported for the accepted candidate, if any.
    #[serde(default)]
    pub holdout_score: Option<f32>,
    /// Budget the run was configured with.
    #[serde(default)]
    pub budget: Option<OptimizationBudget>,
    /// Path of the policy file found for the agent, if one was found.
    #[serde(default)]
    pub policy_path: Option<String>,
    /// Provenance of the diagnosis model for this iteration.
    #[serde(default)]
    pub model: Option<ModelProvenance>,
    /// Rollback outcome reported for the accepted candidate.
    #[serde(default)]
    pub rollback_succeeded: Option<bool>,
    /// Rollback error reported for the accepted candidate.
    #[serde(default)]
    pub rollback_error: Option<String>,
    /// True when any candidate executed this iteration reported a timeout.
    #[serde(default)]
    pub candidate_timed_out: bool,
    /// Audit packet for the accepted change, when one could be built.
    #[serde(default)]
    pub audit_packet: Option<AuditPacket>,
}
191
/// Identifies the model involved in a step of the optimization.
///
/// In this file the value is obtained from the LLM client's `provenance(..)`
/// call, which receives whether the diagnosis request succeeded.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct ModelProvenance {
    /// NOTE(review): presumably the role the model played (e.g. diagnosis) — confirm against the LLM client.
    pub role: String,
    /// Provider name; printed as `provider:model` in the optimization report.
    pub provider: String,
    /// Model name; printed as `provider:model` in the optimization report.
    pub model: String,
    /// Printed as `used=...` in the optimization report.
    pub used: bool,
}
199
/// Self-contained audit record for one accepted edit.
///
/// Built by `build_audit_packet` and written by `run_optimization` to its own
/// `audit-packet-<timestamp>-iteration-<n>.json` file.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct AuditPacket {
    /// Packet schema version; `build_audit_packet` writes `"0.2"`.
    pub schema_version: String,
    /// Name of the agent that was optimized.
    pub agent_name: String,
    /// Zero-based iteration in which the edit was accepted.
    pub iteration: u32,
    /// Edit-scope contract identifier; `build_audit_packet` writes `"single-file-v0.2"`.
    pub edit_scope_contract: String,
    /// Description, target file, and diff of the accepted edit.
    pub accepted_edit: AcceptedEditSummary,
    /// Git, policy, dataset, scorer, and model provenance.
    pub provenance: AuditProvenance,
    /// Scores before and after the edit.
    pub scores: ScoreProvenance,
    /// Hook decisions collected during the iteration.
    pub hook_decisions: Vec<HookDecision>,
    /// Validation command records reported for the accepted candidate.
    pub validation_command_records: Vec<ValidationCommandRecord>,
    /// Final validation command records reported for the accepted candidate.
    pub final_validation_command_records: Vec<ValidationCommandRecord>,
    /// Rollback outcome reported for the accepted candidate.
    pub rollback_succeeded: Option<bool>,
    /// Rollback error reported for the accepted candidate.
    pub rollback_error: Option<String>,
    /// True when any candidate in the iteration reported a timeout.
    pub candidate_timed_out: bool,
}
221
/// Summary of the edit recorded in an [`AuditPacket`].
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct AcceptedEditSummary {
    /// Edit description; `"accepted optimizer edit"` when none was supplied.
    pub description: String,
    /// Displayed path of the edited file; `"unknown"` when none was supplied.
    pub changed_file: String,
    /// Hash of `diff`.
    pub diff_hash: String,
    /// Diff text of the accepted edit.
    pub diff: String,
}
230
/// Where an accepted edit came from: repository state, policy, dataset,
/// scorer, and model.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct AuditProvenance {
    /// Short git `HEAD` SHA captured before the optimization loop started.
    pub git_sha_before: Option<String>,
    /// Short git `HEAD` SHA captured after the accepted change.
    pub git_sha_after: Option<String>,
    /// Whether `git status --porcelain` printed anything after the accepted change.
    pub working_tree_dirty_after: Option<bool>,
    /// Path of the policy file found for the agent, if one was found.
    pub policy_path: Option<String>,
    /// Hash of that policy file's contents.
    pub policy_hash: Option<String>,
    /// Version string of the evaluation dataset.
    pub dataset_version: String,
    /// Content hash of the evaluation dataset.
    pub dataset_hash: String,
    /// `id` of the scorer metadata.
    pub scorer_id: String,
    /// `version` of the scorer metadata.
    pub scorer_version: String,
    /// Provenance of the diagnosis model.
    pub model: ModelProvenance,
}
245
/// Scores recorded in an [`AuditPacket`].
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct ScoreProvenance {
    /// Baseline score measured before the optimization loop started.
    pub baseline_score: f32,
    /// Score reported for the accepted edit.
    pub patched_score: f32,
    /// Score delta reported for the accepted edit.
    pub score_delta: f32,
    /// Holdout score reported for the accepted edit, if any.
    pub holdout_score: Option<f32>,
}
254
/// A proposed improvement, produced either by the LLM diagnosis or by the
/// trace-based fallback.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct Candidate {
    /// Short label of the area to change (e.g. `"system_prompt"`); used to
    /// infer an [`EditStrategy`] when `strategy` is absent.
    pub focus: String,
    /// Human-readable description of the proposed change.
    pub description: String,
    /// Human-readable statement of the expected benefit.
    pub expected_improvement: String,
    /// Explicit edit strategy; defaults to `None` when missing on deserialization.
    #[serde(default)]
    pub strategy: Option<EditStrategy>,
}
264
/// Kind of edit a [`Candidate`] asks for.
///
/// `strategy_for_focus` picks a variant from keywords in the candidate's focus
/// text, and `evolved_preamble_for_strategy` picks the sentence appended to
/// the preamble from it.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, PartialEq, Eq)]
pub enum EditStrategy {
    /// Default strategy; adds step-by-step reasoning instructions to the preamble.
    SystemPrompt,
    /// Adds tool-usage guidance to the preamble; only applies when the
    /// analyzed bundle lists tools.
    ToolDescription,
    /// Replaces echo fallbacks in source when found, otherwise adds a
    /// "never echo" instruction to the preamble.
    FallbackLogic,
    /// Adds an explicit answer / reasoning / confidence contract to the preamble.
    OutputSchema,
    /// No preamble edit is generated for this strategy.
    ModelConfig,
}
273
/// Serde default for `OptimizeConfig::candidate_timeout`: five minutes.
fn default_candidate_timeout() -> Duration {
    const DEFAULT_TIMEOUT_SECS: u64 = 300;
    Duration::from_secs(DEFAULT_TIMEOUT_SECS)
}
277
278pub async fn run_optimization(
285 agent: &RegisteredAgent,
286 config: &OptimizeConfig,
287) -> anyhow::Result<Vec<OptimizationRun>> {
288 let mut runs = vec![];
289
290 let dataset = EvaluationDataset::synthetic_v1();
291 let split = split_dataset(&dataset, config.budget);
292 let mut ledger = ExperimentLedger::new(config.budget, &dataset, &split);
293 let dataset_hash = dataset.content_hash();
294 let scorer = ScorerMetadata::mechanical_v1();
295 let test_inputs: Vec<serde_json::Value> = split
296 .train
297 .iter()
298 .map(|sample| sample.input.clone())
299 .collect();
300 let holdout_inputs: Vec<serde_json::Value> = split
301 .holdout
302 .iter()
303 .map(|sample| sample.input.clone())
304 .collect();
305
306 let baseline_score: f32 = {
308 let mut total = 0.0f32;
309 for input in &test_inputs {
310 if let Ok(res) = crate::runner::run_agent(agent, input.clone()).await {
311 total += mechanical_score(&res);
312 }
313 }
314 if test_inputs.is_empty() {
315 0.0
316 } else {
317 total / test_inputs.len() as f32
318 }
319 };
320
321 let git_sha_before: Option<String> = std::process::Command::new("git")
323 .current_dir(&agent.path)
324 .args(["rev-parse", "--short", "HEAD"])
325 .output()
326 .ok()
327 .and_then(|o| {
328 if o.status.success() {
329 Some(String::from_utf8_lossy(&o.stdout).trim().to_string())
330 } else {
331 None
332 }
333 });
334 let policy_info = load_policy_info(&agent.name);
335
336 for iteration in 0..config.max_iterations {
337 let mut scores_this_iter = vec![];
338 let mut accepted_patched: Option<f32> = None;
339 let mut accepted_delta: Option<f32> = None;
340 let mut validated = 0;
341 let mut landed = 0;
342 let mut trace_diagnoses = Vec::new();
343 let mut hook_decisions = Vec::new();
344 let mut accepted_holdout_score = None;
345 let mut accepted_validation_commands = Vec::new();
346 let mut accepted_final_validation_commands = Vec::new();
347 let mut accepted_rollback_succeeded = None;
348 let mut accepted_rollback_error = None;
349 let mut accepted_edit_description: Option<String> = None;
350 let mut accepted_edit_file: Option<String> = None;
351 let mut any_candidate_timed_out = false;
352
353 for input in &test_inputs {
354 let run_result = crate::runner::run_agent(agent, input.clone()).await?;
355 trace_diagnoses.push(diagnose_run(&run_result));
356 let score = mechanical_score(&run_result);
357 scores_this_iter.push(score);
358 }
359
360 let avg_score: f32 = if scores_this_iter.is_empty() {
361 0.0
362 } else {
363 scores_this_iter.iter().sum::<f32>() / scores_this_iter.len() as f32
364 };
365
366 let rich_bundle = mdx_rust_analysis::analyze_agent(&agent.path, None).ok();
368 let file_count = rich_bundle
369 .as_ref()
370 .map(|b| b.scope.optimizable_paths.len())
371 .unwrap_or(0);
372
373 let bundle_summary = if let Some(ref b) = rich_bundle {
375 let mut s = format!(
376 "{} source files, Rig agent = {}",
377 file_count, b.is_rig_agent
378 );
379 if !b.preambles.is_empty() {
380 s.push_str(&format!(
381 ", current preambles: {:?}",
382 b.preambles.iter().map(|p| &p.text).collect::<Vec<_>>()
383 ));
384 }
385 if !b.tools.is_empty() {
386 s.push_str(&format!(
387 ", tools: {:?}",
388 b.tools.iter().map(|t| &t.name).collect::<Vec<_>>()
389 ));
390 }
391 s
392 } else {
393 format!("{} source files (limited analysis)", file_count)
394 };
395
396 let llm = crate::llm::LlmClient::default();
397 let diag_req = crate::llm::DiagnosisRequest {
398 policy: "Improve the agent so it gives high-quality, reasoned answers instead of echoing. Prefer explicit step-by-step reasoning in the system prompt.".to_string(),
399 bundle_summary,
400 traces_summary: summarize_trace_diagnoses(&trace_diagnoses),
401 scores: scores_this_iter.clone(),
402 };
403
404 let diagnosis_result = llm.diagnose(diag_req).await;
405 let diagnosis_model_used = diagnosis_result.is_ok();
406 let diagnosis = diagnosis_result.ok();
407
408 let mut candidates = vec![];
409 let mut accepted = 0;
410 let mut notes = format!(
411 "Avg score this iter: {:.2} ({} files in bundle)",
412 avg_score, file_count
413 );
414 let mut accepted_diff: Option<String> = None;
415
416 if let Some(d) = diagnosis {
417 notes.push_str(&format!(" → LLM: {}", d.summary));
418 for c in d.candidates {
419 let strategy = strategy_for_focus(&c.focus);
420 candidates.push(Candidate {
421 focus: c.focus,
422 description: c.description,
423 expected_improvement: c.expected_improvement,
424 strategy: Some(strategy),
425 });
426 }
427 } else {
428 candidates = fallback_candidates_from_trace(&trace_diagnoses);
429 }
430
431 if !candidates.is_empty() {
432 let candidate_limit = config
433 .budget
434 .candidate_limit(config.candidates_per_iteration);
435 for (candidate_index, candidate) in candidates.iter().take(candidate_limit).enumerate()
436 {
437 if accepted > 0 {
438 break;
439 }
440
441 let Some(edit) =
442 build_edit_for_candidate(&agent.path, rich_bundle.as_ref(), candidate)?
443 else {
444 notes.push_str(&format!(
445 " (candidate {} skipped: no safe edit plan for {:?})",
446 candidate.focus, candidate.strategy
447 ));
448 continue;
449 };
450
451 notes.push_str(&format!(
452 " → Candidate {}: {} ({:?})",
453 candidate_index + 1,
454 candidate.focus,
455 candidate.strategy
456 ));
457
458 ledger.record_variant(PromptVariantRecord::from_patch(
459 format!("{:?}", candidate.strategy),
460 edit.file.display().to_string(),
461 edit.description.clone(),
462 &edit.patch,
463 ));
464
465 let outcome = execute_candidate_edit(CandidateExecutionContext {
466 agent,
467 config: CandidateExecutionConfig {
468 hook_policy: &config.hook_policy,
469 review_before_apply: config.review_before_apply,
470 quiet: config.quiet,
471 candidate_timeout: config.candidate_timeout,
472 },
473 iteration,
474 candidate_index,
475 edit: &edit,
476 test_inputs: &test_inputs,
477 holdout_inputs: &holdout_inputs,
478 baseline_score,
479 scorer: mechanical_score,
480 })
481 .await;
482
483 validated += outcome.validated;
484 landed += outcome.landed;
485 any_candidate_timed_out |= outcome.timed_out;
486 hook_decisions.extend(outcome.hook_decisions);
487
488 if outcome.accepted > 0 {
489 accepted = outcome.accepted;
490 accepted_diff = outcome.accepted_diff;
491 accepted_patched = outcome.patched_score;
492 accepted_delta = outcome.delta;
493 accepted_holdout_score = outcome.holdout_score;
494 accepted_validation_commands = outcome.validation_commands;
495 accepted_final_validation_commands = outcome.final_validation_commands;
496 accepted_rollback_succeeded = outcome.rollback_succeeded;
497 accepted_rollback_error = outcome.rollback_error;
498 accepted_edit_description = Some(edit.description.clone());
499 accepted_edit_file = Some(edit.file.display().to_string());
500 }
501
502 notes.push_str(&outcome.note);
503 }
504 } else {
505 accepted = 0; notes.push_str(" → No new candidates — current behavior is good (no change applied)");
507 }
508
509 let (run_baseline, run_patched, run_delta) = if accepted > 0 {
510 (Some(baseline_score), accepted_patched, accepted_delta)
511 } else {
512 (None, None, None)
513 };
514
515 let (prov_before, prov_after, prov_diff_hash, prov_dirty, prov_scorer, prov_cmds) =
517 if accepted > 0 {
518 let after = std::process::Command::new("git")
519 .current_dir(&agent.path)
520 .args(["rev-parse", "--short", "HEAD"])
521 .output()
522 .ok()
523 .and_then(|o| {
524 if o.status.success() {
525 Some(String::from_utf8_lossy(&o.stdout).trim().to_string())
526 } else {
527 None
528 }
529 });
530 let dirty_after = std::process::Command::new("git")
531 .current_dir(&agent.path)
532 .args(["status", "--porcelain"])
533 .output()
534 .ok()
535 .filter(|output| output.status.success())
536 .map(|output| !output.stdout.is_empty());
537
538 (
539 git_sha_before.clone(),
540 after,
541 accepted_diff
542 .as_ref()
543 .map(|diff| stable_hash_hex(diff.as_bytes())),
544 dirty_after,
545 Some(scorer.label()),
546 Some(vec![
547 "cargo check (isolated)".to_string(),
548 "cargo clippy -D warnings (isolated)".to_string(),
549 "final validate_build after land (real tree)".to_string(),
550 ]),
551 )
552 } else {
553 (None, None, None, None, None, None)
554 };
555
556 let model_provenance = llm.provenance(diagnosis_model_used);
557 let audit_packet = if accepted > 0 {
558 build_audit_packet(AuditPacketInput {
559 agent_name: &agent.name,
560 iteration,
561 edit_description: accepted_edit_description.as_deref(),
562 edit_file: accepted_edit_file.as_deref(),
563 diff: accepted_diff.as_deref(),
564 diff_hash: prov_diff_hash.as_deref(),
565 git_sha_before: prov_before.clone(),
566 git_sha_after: prov_after.clone(),
567 working_tree_dirty_after: prov_dirty,
568 policy_path: policy_info
569 .as_ref()
570 .map(|policy| policy.path.display().to_string()),
571 policy_hash: policy_info.as_ref().map(|policy| policy.hash.clone()),
572 dataset_version: &dataset.version,
573 dataset_hash: &dataset_hash,
574 scorer: &scorer,
575 model: model_provenance.clone(),
576 baseline_score,
577 patched_score: accepted_patched,
578 score_delta: accepted_delta,
579 holdout_score: accepted_holdout_score,
580 hook_decisions: hook_decisions.clone(),
581 validation_command_records: accepted_validation_commands.clone(),
582 final_validation_command_records: accepted_final_validation_commands.clone(),
583 rollback_succeeded: accepted_rollback_succeeded,
584 rollback_error: accepted_rollback_error.clone(),
585 candidate_timed_out: any_candidate_timed_out,
586 })
587 } else {
588 None
589 };
590
591 runs.push(OptimizationRun {
592 iteration,
593 scores: scores_this_iter,
594 validated_changes: validated,
595 landed_changes: landed,
596 accepted_changes: accepted,
597 notes,
598 candidates,
599 diff: accepted_diff,
600 policy_hash: policy_info.as_ref().map(|policy| policy.hash.clone()),
601 dataset_version: Some(dataset.version.clone()),
602 dataset_hash: Some(dataset_hash.clone()),
603 baseline_score: run_baseline,
604 patched_score: run_patched,
605 score_delta: run_delta,
606 git_sha_before: prov_before,
607 git_sha_after: prov_after,
608 diff_hash: prov_diff_hash,
609 working_tree_dirty_after: prov_dirty,
610 scorer: prov_scorer,
611 validation_commands: prov_cmds,
612 validation_command_records: accepted_validation_commands,
613 final_validation_command_records: accepted_final_validation_commands,
614 trace_diagnosis: trace_diagnoses,
615 hook_decisions,
616 ledger: Some(ledger.clone()),
617 holdout_score: accepted_holdout_score,
618 budget: Some(config.budget),
619 policy_path: policy_info
620 .as_ref()
621 .map(|policy| policy.path.display().to_string()),
622 model: Some(model_provenance),
623 rollback_succeeded: accepted_rollback_succeeded,
624 rollback_error: accepted_rollback_error,
625 candidate_timed_out: any_candidate_timed_out,
626 audit_packet,
627 });
628
629 if accepted > 0 && iteration > 0 {
630 }
632 }
633
634 let experiment_dir = std::env::current_dir()?
636 .join(".mdx-rust")
637 .join("agents")
638 .join(&agent.name)
639 .join("experiments");
640
641 std::fs::create_dir_all(&experiment_dir).ok();
642
643 let timestamp = std::time::SystemTime::now()
644 .duration_since(std::time::UNIX_EPOCH)
645 .map(|d| d.as_secs())
646 .unwrap_or(0);
647
648 let experiment_file = experiment_dir.join(format!("run-{}.json", timestamp));
649 if let Ok(content) = serde_json::to_string_pretty(&runs) {
650 let _ = std::fs::write(experiment_file, content);
651 }
652
653 for run in &runs {
654 if let Some(packet) = &run.audit_packet {
655 let audit_file = experiment_dir.join(format!(
656 "audit-packet-{}-iteration-{}.json",
657 timestamp, run.iteration
658 ));
659 if let Ok(content) = serde_json::to_string_pretty(packet) {
660 let _ = std::fs::write(audit_file, content);
661 }
662 }
663 }
664
665 if runs.iter().any(|r| r.accepted_changes > 0) {
667 let git_sha = std::process::Command::new("git")
668 .args(["rev-parse", "--short", "HEAD"])
669 .output()
670 .ok()
671 .and_then(|o| String::from_utf8(o.stdout).ok())
672 .map(|s| s.trim().to_string())
673 .unwrap_or_else(|| "unknown".to_string());
674
675 let mut report = format!(
676 "# Optimization Report for '{}'\n\nTimestamp: {}\nGit SHA: {}\n\n## Summary\n\n",
677 agent.name, timestamp, git_sha
678 );
679
680 for run in &runs {
681 if run.accepted_changes > 0 {
682 report.push_str(&format!(
683 "- Iteration {}: Accepted {} change(s)\n Notes: {}\n",
684 run.iteration, run.accepted_changes, run.notes
685 ));
686
687 if let Some(d) = &run.diff {
688 report.push_str(&format!("\n```diff\n{}\n```\n", d));
689 } else {
690 report.push_str(" (Change persisted to src/main.rs)\n");
691 }
692
693 if let Some(h) = &run.policy_hash {
694 report.push_str(&format!(" Policy hash: {}\n", h));
695 }
696 if let Some(v) = &run.dataset_version {
697 report.push_str(&format!(" Dataset version: {}\n", v));
698 }
699 if let Some(path) = &run.policy_path {
700 report.push_str(&format!(" Policy path: {}\n", path));
701 }
702 if let Some(model) = &run.model {
703 report.push_str(&format!(
704 " Diagnosis model: {}:{} (used={})\n",
705 model.provider, model.model, model.used
706 ));
707 }
708 if !run.validation_command_records.is_empty() {
709 report.push_str(" Isolated validation commands:\n");
710 for command in &run.validation_command_records {
711 report.push_str(&format!(
712 " - {} | success={} | timeout={} | status={:?} | duration_ms={}\n",
713 command.command,
714 command.success,
715 command.timed_out,
716 command.status_code,
717 command.duration_ms
718 ));
719 }
720 }
721 if !run.final_validation_command_records.is_empty() {
722 report.push_str(" Final validation commands:\n");
723 for command in &run.final_validation_command_records {
724 report.push_str(&format!(
725 " - {} | success={} | timeout={} | status={:?} | duration_ms={}\n",
726 command.command,
727 command.success,
728 command.timed_out,
729 command.status_code,
730 command.duration_ms
731 ));
732 }
733 }
734 }
735 }
736
737 report.push_str("\n## Candidates Considered\n\n");
738 for run in &runs {
739 for (i, c) in run.candidates.iter().enumerate() {
740 report.push_str(&format!(
741 "- [{}] {}: {}\n Expected: {}\n\n",
742 i + 1,
743 c.focus,
744 c.description,
745 c.expected_improvement
746 ));
747 }
748 }
749
750 let _ = std::fs::write(
751 experiment_dir.join(format!("report-{}.md", timestamp)),
752 report,
753 );
754 }
755
756 if runs.iter().any(|r| r.accepted_changes > 0) {
758 let mut final_scores = vec![];
759 for input in &test_inputs {
760 if let Ok(res) = crate::runner::run_agent(agent, input.clone()).await {
761 final_scores.push(mechanical_score(&res));
762 }
763 }
764 if !final_scores.is_empty() {
765 let final_avg = final_scores.iter().sum::<f32>() / final_scores.len() as f32;
766 if !config.quiet {
767 println!(
768 " Final re-evaluation after accepted changes: {:.2}",
769 final_avg
770 );
771 }
772 }
773 }
774
775 Ok(runs)
776}
777
/// Location and content hash of the policy file found by `load_policy_info`.
#[derive(Debug, Clone)]
struct PolicyInfo {
    /// Path of the policy file that was read.
    path: PathBuf,
    /// Hash of the file's raw bytes, as returned by `stable_hash_hex`.
    hash: String,
}
783
/// Everything `build_audit_packet` needs to assemble an [`AuditPacket`].
///
/// Borrowed fields are copied into owned strings by `build_audit_packet`;
/// owned fields are moved into the packet.
struct AuditPacketInput<'a> {
    /// Name of the agent that was optimized.
    agent_name: &'a str,
    /// Zero-based iteration in which the edit was accepted.
    iteration: u32,
    /// Description of the accepted edit; a generic default is used when `None`.
    edit_description: Option<&'a str>,
    /// Displayed path of the edited file; `"unknown"` is used when `None`.
    edit_file: Option<&'a str>,
    /// Diff of the accepted edit; no packet is built when `None`.
    diff: Option<&'a str>,
    /// Precomputed hash of `diff`; computed from `diff` when `None`.
    diff_hash: Option<&'a str>,
    /// Short git `HEAD` SHA before the optimization loop.
    git_sha_before: Option<String>,
    /// Short git `HEAD` SHA after the accepted change.
    git_sha_after: Option<String>,
    /// Whether the working tree was dirty after the accepted change.
    working_tree_dirty_after: Option<bool>,
    /// Path of the policy file, if one was found.
    policy_path: Option<String>,
    /// Hash of the policy file, if one was found.
    policy_hash: Option<String>,
    /// Version string of the evaluation dataset.
    dataset_version: &'a str,
    /// Content hash of the evaluation dataset.
    dataset_hash: &'a str,
    /// Scorer metadata; its `id` and `version` are copied into the packet.
    scorer: &'a ScorerMetadata,
    /// Provenance of the diagnosis model.
    model: ModelProvenance,
    /// Baseline score measured before the optimization loop.
    baseline_score: f32,
    /// Score of the accepted edit; no packet is built when `None`.
    patched_score: Option<f32>,
    /// Score delta of the accepted edit; no packet is built when `None`.
    score_delta: Option<f32>,
    /// Holdout score of the accepted edit, if any.
    holdout_score: Option<f32>,
    /// Hook decisions collected during the iteration.
    hook_decisions: Vec<HookDecision>,
    /// Validation command records of the accepted candidate.
    validation_command_records: Vec<ValidationCommandRecord>,
    /// Final validation command records of the accepted candidate.
    final_validation_command_records: Vec<ValidationCommandRecord>,
    /// Rollback outcome of the accepted candidate.
    rollback_succeeded: Option<bool>,
    /// Rollback error of the accepted candidate.
    rollback_error: Option<String>,
    /// True when any candidate in the iteration reported a timeout.
    candidate_timed_out: bool,
}
811
812fn build_audit_packet(input: AuditPacketInput<'_>) -> Option<AuditPacket> {
813 let diff = input.diff?.to_string();
814 let diff_hash = input
815 .diff_hash
816 .map(str::to_string)
817 .unwrap_or_else(|| stable_hash_hex(diff.as_bytes()));
818 let patched_score = input.patched_score?;
819 let score_delta = input.score_delta?;
820
821 Some(AuditPacket {
822 schema_version: "0.2".to_string(),
823 agent_name: input.agent_name.to_string(),
824 iteration: input.iteration,
825 edit_scope_contract: "single-file-v0.2".to_string(),
826 accepted_edit: AcceptedEditSummary {
827 description: input
828 .edit_description
829 .unwrap_or("accepted optimizer edit")
830 .to_string(),
831 changed_file: input.edit_file.unwrap_or("unknown").to_string(),
832 diff_hash,
833 diff,
834 },
835 provenance: AuditProvenance {
836 git_sha_before: input.git_sha_before,
837 git_sha_after: input.git_sha_after,
838 working_tree_dirty_after: input.working_tree_dirty_after,
839 policy_path: input.policy_path,
840 policy_hash: input.policy_hash,
841 dataset_version: input.dataset_version.to_string(),
842 dataset_hash: input.dataset_hash.to_string(),
843 scorer_id: input.scorer.id.clone(),
844 scorer_version: input.scorer.version.clone(),
845 model: input.model,
846 },
847 scores: ScoreProvenance {
848 baseline_score: input.baseline_score,
849 patched_score,
850 score_delta,
851 holdout_score: input.holdout_score,
852 },
853 hook_decisions: input.hook_decisions,
854 validation_command_records: input.validation_command_records,
855 final_validation_command_records: input.final_validation_command_records,
856 rollback_succeeded: input.rollback_succeeded,
857 rollback_error: input.rollback_error,
858 candidate_timed_out: input.candidate_timed_out,
859 })
860}
861
862fn load_policy_info(agent_name: &str) -> Option<PolicyInfo> {
863 let cwd = std::env::current_dir().ok()?;
864 let candidates = [
865 cwd.join(".mdx-rust")
866 .join("agents")
867 .join(agent_name)
868 .join("policies.md"),
869 cwd.join(".mdx-rust").join("policies.md"),
870 ];
871
872 candidates
873 .iter()
874 .find_map(|path| std::fs::read(path).ok().map(|content| (path, content)))
875 .map(|(path, content)| PolicyInfo {
876 path: path.clone(),
877 hash: stable_hash_hex(&content),
878 })
879}
880
/// Hashes `bytes` by forwarding to `crate::eval::stable_hash_hex`.
///
/// Local shorthand so this file can hash diffs and policy contents without
/// spelling out the full path. NOTE(review): the name suggests a stable,
/// hex-encoded digest — see `crate::eval` for the actual algorithm.
fn stable_hash_hex(bytes: &[u8]) -> String {
    crate::eval::stable_hash_hex(bytes)
}
884
885fn strategy_for_focus(focus: &str) -> EditStrategy {
886 let normalized = focus.to_lowercase();
887
888 if normalized.contains("tool") {
889 EditStrategy::ToolDescription
890 } else if normalized.contains("fallback") || normalized.contains("logic") {
891 EditStrategy::FallbackLogic
892 } else if normalized.contains("schema") || normalized.contains("output") {
893 EditStrategy::OutputSchema
894 } else if normalized.contains("model") || normalized.contains("temperature") {
895 EditStrategy::ModelConfig
896 } else {
897 EditStrategy::SystemPrompt
898 }
899}
900
901fn fallback_candidates_from_trace(diagnoses: &[TraceDiagnosis]) -> Vec<Candidate> {
902 let mut candidates = Vec::new();
903
904 if diagnoses.iter().any(|diagnosis| {
905 diagnosis
906 .signals
907 .iter()
908 .any(|signal| signal.kind == FailureKind::EchoFallback)
909 }) {
910 candidates.push(Candidate {
911 focus: "fallback_logic".to_string(),
912 description: "Prevent echo fallback and require a useful best-effort answer."
913 .to_string(),
914 expected_improvement: "Reduce low-value echo responses.".to_string(),
915 strategy: Some(EditStrategy::FallbackLogic),
916 });
917 }
918
919 if diagnoses.iter().any(|diagnosis| {
920 diagnosis
921 .signals
922 .iter()
923 .any(|signal| signal.kind == FailureKind::InvalidJson)
924 }) {
925 candidates.push(Candidate {
926 focus: "output_schema".to_string(),
927 description: "Make the output contract explicit for answer, reasoning, and confidence."
928 .to_string(),
929 expected_improvement: "Improve parseability for agent callers.".to_string(),
930 strategy: Some(EditStrategy::OutputSchema),
931 });
932 }
933
934 if diagnoses.iter().any(|diagnosis| {
935 diagnosis.signals.iter().any(|signal| {
936 matches!(
937 signal.kind,
938 FailureKind::MissingReasoning | FailureKind::LowConfidence
939 )
940 })
941 }) {
942 candidates.push(Candidate {
943 focus: "system_prompt".to_string(),
944 description: "Strengthen the system prompt with explicit reasoning instructions."
945 .to_string(),
946 expected_improvement: "Increase reasoning quality and confidence.".to_string(),
947 strategy: Some(EditStrategy::SystemPrompt),
948 });
949 }
950
951 if candidates.is_empty() {
952 candidates.push(Candidate {
953 focus: "system_prompt".to_string(),
954 description: "Strengthen the system prompt with explicit reasoning instructions."
955 .to_string(),
956 expected_improvement: "Improve answer quality.".to_string(),
957 strategy: Some(EditStrategy::SystemPrompt),
958 });
959 }
960
961 candidates
962}
963
964fn summarize_trace_diagnoses(diagnoses: &[TraceDiagnosis]) -> String {
965 let mut summaries = Vec::new();
966
967 for diagnosis in diagnoses {
968 if diagnosis.has_failures() {
969 summaries.push(diagnosis.compact_summary());
970 }
971 }
972
973 if summaries.is_empty() {
974 "No obvious trace failures detected.".to_string()
975 } else {
976 format!("Trace failures: {}", summaries.join(" | "))
977 }
978}
979
980fn build_edit_for_candidate(
981 agent_root: &Path,
982 bundle: Option<&AgentBundle>,
983 candidate: &Candidate,
984) -> anyhow::Result<Option<ProposedEdit>> {
985 let strategy = candidate
986 .strategy
987 .clone()
988 .unwrap_or_else(|| strategy_for_focus(&candidate.focus));
989
990 let Some((target_file, old_preamble)) = select_preamble_target(agent_root, bundle) else {
991 if strategy == EditStrategy::FallbackLogic {
992 return build_echo_fallback_edit(agent_root, bundle, &candidate.description);
993 }
994 return Ok(None);
995 };
996
997 if strategy == EditStrategy::FallbackLogic {
998 if let Some(edit) = build_echo_fallback_edit(agent_root, bundle, &candidate.description)? {
999 return Ok(Some(edit));
1000 }
1001 }
1002
1003 let Some(new_preamble) = evolved_preamble_for_strategy(&old_preamble, &strategy, bundle) else {
1004 return Ok(None);
1005 };
1006
1007 if normalize_prompt(&new_preamble) == normalize_prompt(&old_preamble) {
1008 return Ok(None);
1009 }
1010
1011 let content = std::fs::read_to_string(&target_file)?;
1012 let relative_target = target_file
1013 .strip_prefix(agent_root)
1014 .unwrap_or(&target_file)
1015 .to_path_buf();
1016 let patch = generate_preamble_patch(&relative_target, &content, &old_preamble, &new_preamble);
1017
1018 Ok(Some(ProposedEdit {
1019 file: target_file,
1020 description: format!("{:?}: {}", strategy, candidate.description),
1021 patch,
1022 }))
1023}
1024
1025fn build_echo_fallback_edit(
1026 agent_root: &Path,
1027 bundle: Option<&AgentBundle>,
1028 description: &str,
1029) -> anyhow::Result<Option<ProposedEdit>> {
1030 let mut candidates: Vec<PathBuf> = bundle
1031 .map(|bundle| {
1032 bundle
1033 .scope
1034 .optimizable_paths
1035 .iter()
1036 .filter(|path| path.extension().is_some_and(|extension| extension == "rs"))
1037 .cloned()
1038 .collect()
1039 })
1040 .unwrap_or_default();
1041
1042 if candidates.is_empty() {
1043 candidates.push(agent_root.join("src/main.rs"));
1044 }
1045
1046 for target_file in candidates {
1047 let Ok(content) = std::fs::read_to_string(&target_file) else {
1048 continue;
1049 };
1050
1051 let replacements = [
1052 (
1053 "Echo: {}",
1054 "Best-effort answer after reasoning: {}",
1055 "replace echo fallback format string",
1056 ),
1057 (
1058 "Echo: ",
1059 "Best-effort answer after reasoning: ",
1060 "replace echo fallback prefix",
1061 ),
1062 ];
1063
1064 for (old, new, label) in replacements {
1065 if !content.contains(old) {
1066 continue;
1067 }
1068
1069 let relative_target = target_file
1070 .strip_prefix(agent_root)
1071 .unwrap_or(&target_file)
1072 .to_path_buf();
1073 let patch = generate_preamble_patch(&relative_target, &content, old, new);
1074
1075 return Ok(Some(ProposedEdit {
1076 file: target_file,
1077 description: format!("FallbackLogic: {description} ({label})"),
1078 patch,
1079 }));
1080 }
1081 }
1082
1083 Ok(None)
1084}
1085
1086fn select_preamble_target(
1087 agent_root: &Path,
1088 bundle: Option<&AgentBundle>,
1089) -> Option<(PathBuf, String)> {
1090 if let Some(prompt) = bundle.and_then(|bundle| bundle.preambles.first()) {
1091 return Some((PathBuf::from(&prompt.file), prompt.text.clone()));
1092 }
1093
1094 let target = bundle
1095 .and_then(|bundle| {
1096 bundle.scope.optimizable_paths.iter().find(|path| {
1097 let name = path.file_name().unwrap_or_default().to_string_lossy();
1098 name.ends_with(".rs") && (name == "main.rs" || name.contains("agent"))
1099 })
1100 })
1101 .cloned()
1102 .unwrap_or_else(|| agent_root.join("src/main.rs"));
1103
1104 let content = std::fs::read_to_string(&target).ok()?;
1105 extract_first_preamble_literal(&content).map(|prompt| (target, prompt))
1106}
1107
/// Extracts the body of the first `.preamble("...")` string literal in `content`.
///
/// The text is returned exactly as written in the source, escape sequences
/// included, so it can be used verbatim as a search string against the file.
/// The literal ends at the first double quote that is not escaped by a
/// backslash, so preambles containing `\"` are returned in full rather than
/// being cut at the escaped quote.
///
/// Returns `None` when the marker is absent or the literal is never closed.
/// Raw strings (`r"..."`) and whitespace between `(` and `"` are not recognized.
fn extract_first_preamble_literal(content: &str) -> Option<String> {
    const MARKER: &str = ".preamble(\"";

    let start = content.find(MARKER)? + MARKER.len();
    let rest = &content[start..];

    let mut escaped = false;
    for (idx, ch) in rest.char_indices() {
        match ch {
            // The character after a backslash is part of the escape, whatever it is.
            _ if escaped => escaped = false,
            '\\' => escaped = true,
            '"' => return Some(rest[..idx].to_string()),
            _ => {}
        }
    }

    None
}
1115
1116fn evolved_preamble_for_strategy(
1117 old: &str,
1118 strategy: &EditStrategy,
1119 bundle: Option<&AgentBundle>,
1120) -> Option<String> {
1121 let addition = match strategy {
1122 EditStrategy::SystemPrompt => {
1123 "Think step-by-step before answering. Always explain your reasoning in one sentence, then give the final answer."
1124 }
1125 EditStrategy::FallbackLogic => {
1126 "Never echo the user input as the final answer. If uncertain, state assumptions, reason briefly, and provide the best useful answer."
1127 }
1128 EditStrategy::OutputSchema => {
1129 "Always produce an answer, reasoning, and confidence from 0 to 1."
1130 }
1131 EditStrategy::ToolDescription => {
1132 let has_tools = bundle.is_some_and(|bundle| !bundle.tools.is_empty());
1133 if !has_tools {
1134 return None;
1135 }
1136 "Before answering, decide whether available tools improve factuality or completeness, and only use them when they add real value."
1137 }
1138 EditStrategy::ModelConfig => return None,
1139 };
1140
1141 if normalize_prompt(old).contains(&normalize_prompt(addition)) {
1142 return Some(old.to_string());
1143 }
1144
1145 let mut base = old.trim().trim_end_matches('.').to_string();
1146 if base.is_empty() {
1147 base = "You are a concise, helpful assistant".to_string();
1148 }
1149 Some(format!("{base}. {addition}"))
1150}
1151
/// Canonicalizes prompt text for comparison: every run of whitespace becomes a single
/// space, leading and trailing whitespace is dropped, and the result is lower-cased.
fn normalize_prompt(value: &str) -> String {
    let mut collapsed = String::with_capacity(value.len());
    for word in value.split_whitespace() {
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    // Lower-case the joined text as a whole, exactly as the comparison sites expect.
    collapsed.to_lowercase()
}
1159
1160pub fn mechanical_score(result: &AgentRunResult) -> f32 {
1163 let answer = result
1164 .output
1165 .get("answer")
1166 .and_then(|v| v.as_str())
1167 .unwrap_or("");
1168 let reasoning = result
1169 .output
1170 .get("reasoning")
1171 .and_then(|v| v.as_str())
1172 .unwrap_or("");
1173
1174 if answer.starts_with("Echo:") {
1175 return 0.4;
1176 }
1177
1178 let mut score = 0.75f32;
1179
1180 if reasoning.to_lowercase().contains("think")
1182 || reasoning.to_lowercase().contains("reason")
1183 || reasoning.to_lowercase().contains("step")
1184 {
1185 score += 0.12;
1186 }
1187
1188 if answer.len() > 20 {
1190 score += 0.08;
1191 }
1192
1193 score.min(0.95)
1194}
1195
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::tempdir;

    /// Agent source whose only preamble literal is `You are helpful.`.
    const PREAMBLE_AGENT_SOURCE: &str =
        r#"fn main() { let _agent = client.agent("m").preamble("You are helpful.").build(); }"#;

    /// Agent source containing a simple `Echo: {}` fallback format string.
    const ECHO_AGENT_SOURCE: &str =
        r#"fn main() { println!("{}", format!("Echo: {}", "hello")); }"#;

    /// Writes `source` to `<agent_root>/src/main.rs` and returns the path of that file.
    fn write_main_rs(agent_root: &std::path::Path, source: &str) -> std::path::PathBuf {
        let src_dir = agent_root.join("src");
        std::fs::create_dir_all(&src_dir).unwrap();
        let main_rs = src_dir.join("main.rs");
        std::fs::write(&main_rs, source).unwrap();
        main_rs
    }

    /// Builds a candidate with an explicit strategy.
    fn candidate_with(
        focus: &str,
        description: &str,
        expected_improvement: &str,
        strategy: EditStrategy,
    ) -> Candidate {
        Candidate {
            focus: focus.to_string(),
            description: description.to_string(),
            expected_improvement: expected_improvement.to_string(),
            strategy: Some(strategy),
        }
    }

    /// Builds a severity-2 failure signal that is not attached to any span.
    fn unspanned_signal(kind: FailureKind, evidence: &str) -> crate::FailureSignal {
        crate::FailureSignal {
            kind,
            severity: 2,
            evidence: evidence.to_string(),
            span_id: None,
        }
    }

    #[test]
    fn test_mechanical_score_echo_vs_reasoned() {
        let echoed = AgentRunResult {
            output: serde_json::json!({"answer": "Echo: hello", "reasoning": "no key"}),
            duration_ms: 10,
            success: true,
            error: None,
            traces: vec![],
        };
        let reasoned = AgentRunResult {
            output: serde_json::json!({"answer": "The answer is 42 because...", "reasoning": "Think step by step: 6*7"}),
            duration_ms: 120,
            success: true,
            error: None,
            traces: vec![],
        };

        assert!(mechanical_score(&echoed) < 0.5);
        assert!(mechanical_score(&reasoned) > 0.8);
    }

    #[test]
    fn test_optimize_config_defaults() {
        let config = OptimizeConfig {
            max_iterations: 1,
            candidates_per_iteration: 1,
            use_llm_judge: false,
            budget: OptimizationBudget::Medium,
            hook_policy: HookPolicy::default(),
            review_before_apply: false,
            quiet: false,
            candidate_timeout: default_candidate_timeout(),
        };
        assert_eq!(config.max_iterations, 1);
    }

    #[test]
    fn strategy_for_focus_maps_common_candidate_names() {
        // Same checks as before, expressed as a focus -> expected strategy table.
        let expectations = [
            ("improve tool descriptions", EditStrategy::ToolDescription),
            ("fix fallback logic", EditStrategy::FallbackLogic),
            ("tighten output schema", EditStrategy::OutputSchema),
            ("lower model temperature", EditStrategy::ModelConfig),
            ("reasoning", EditStrategy::SystemPrompt),
        ];

        for (focus, expected) in expectations {
            assert_eq!(strategy_for_focus(focus), expected);
        }
    }

    #[test]
    fn fallback_candidates_follow_trace_failures() {
        let diagnosis = TraceDiagnosis {
            signals: vec![
                unspanned_signal(FailureKind::EchoFallback, "Echo: hello"),
                unspanned_signal(FailureKind::InvalidJson, "raw stdout"),
            ],
            ranked_span_ids: vec![],
        };

        let candidates = fallback_candidates_from_trace(&[diagnosis]);

        assert_eq!(candidates[0].strategy, Some(EditStrategy::FallbackLogic));
        assert!(candidates
            .iter()
            .any(|candidate| candidate.strategy == Some(EditStrategy::OutputSchema)));
    }

    #[test]
    fn build_edit_for_candidate_creates_schema_preamble_patch() {
        let dir = tempdir().unwrap();
        let main = write_main_rs(dir.path(), PREAMBLE_AGENT_SOURCE);

        let candidate = candidate_with(
            "output_schema",
            "make output contract explicit",
            "more parseable output",
            EditStrategy::OutputSchema,
        );

        let edit = build_edit_for_candidate(dir.path(), None, &candidate)
            .unwrap()
            .expect("schema strategy should produce a prompt edit");

        assert_eq!(edit.file, main);
        assert!(edit.patch.contains("answer, reasoning, and confidence"));
    }

    #[test]
    fn tool_strategy_requires_discovered_tools() {
        let dir = tempdir().unwrap();
        let main = write_main_rs(dir.path(), PREAMBLE_AGENT_SOURCE);

        let candidate = candidate_with(
            "tool_description",
            "clarify tool use",
            "better tool calls",
            EditStrategy::ToolDescription,
        );

        // Without a bundle there are no known tools, so no edit may be proposed.
        let without_tools = build_edit_for_candidate(dir.path(), None, &candidate).unwrap();
        assert!(without_tools.is_none());

        let bundle = AgentBundle {
            scope: mdx_rust_analysis::BundleScope {
                optimizable_paths: vec![main],
                read_only_paths: vec![],
            },
            preambles: vec![],
            tools: vec![mdx_rust_analysis::ExtractedTool {
                file: "src/main.rs".to_string(),
                name: "search".to_string(),
                description: None,
            }],
            is_rig_agent: true,
            key_files: vec![],
        };

        let with_tools = build_edit_for_candidate(dir.path(), Some(&bundle), &candidate)
            .unwrap()
            .expect("tool strategy should produce a prompt edit when tools exist");
        assert!(with_tools
            .patch
            .contains("available tools improve factuality"));
    }

    #[test]
    fn fallback_logic_strategy_can_patch_echo_fallback() {
        let dir = tempdir().unwrap();
        let main = write_main_rs(dir.path(), ECHO_AGENT_SOURCE);

        let candidate = candidate_with(
            "fallback_logic",
            "avoid echo fallback",
            "more useful fallback",
            EditStrategy::FallbackLogic,
        );

        let edit = build_edit_for_candidate(dir.path(), None, &candidate)
            .unwrap()
            .expect("fallback logic should patch simple echo fallback");

        assert_eq!(edit.file, main);
        assert!(edit.patch.contains("Best-effort answer after reasoning"));
    }

    #[test]
    fn agent_facing_records_have_json_schemas() {
        // Generates the schema for a type and checks the title stored in its metadata.
        macro_rules! assert_schema_title {
            ($ty:ty, $expected:expr) => {{
                let root = schemars::schema_for!($ty);
                assert_eq!(
                    root.schema
                        .metadata
                        .as_ref()
                        .and_then(|m| m.title.as_deref()),
                    Some($expected)
                );
            }};
        }

        assert_schema_title!(AuditPacket, "AuditPacket");
        assert_schema_title!(Candidate, "Candidate");
        assert_schema_title!(OptimizeConfig, "OptimizeConfig");
    }
}