mdx_rust_core/
optimizer.rs

1//! The core optimization loop (Phase 3).
2//!
3//! High-level flow (per the approved plan):
4//! 1. Run the agent on the dataset while collecting rich traces.
5//! 2. Score outputs (mechanical rules + optional LLM-as-Judge).
6//! 3. Diagnose failures using a strong model + policy + traces + code bundle.
7//! 4. Generate N targeted candidate fixes (different focus areas).
8//! 5. Validate candidates safely (cargo check + clippy + smoke tests in worktree).
9//! 6. Evaluate survivors on the full dataset.
10//! 7. Accept only net-positive changes with regression guards + holdout set.
11//!
12//! This module is currently a structural skeleton. Real implementations of
13//! the individual steps will be filled in as the analysis crate and LLM
14//! client mature.
15
16use crate::registry::RegisteredAgent;
17use crate::runner::AgentRunResult;
18use crate::safety_pipeline::{
19    execute_candidate_edit, CandidateExecutionConfig, CandidateExecutionContext,
20};
21use crate::{
22    diagnose_run, split_dataset, EvaluationDataset, ExperimentLedger, FailureKind, HookDecision,
23    HookPolicy, OptimizationBudget, PromptVariantRecord, ScorerMetadata, TraceDiagnosis,
24};
25use mdx_rust_analysis::editing::ProposedEdit;
26use mdx_rust_analysis::editing::ValidationCommandRecord;
27use mdx_rust_analysis::AgentBundle;
28use serde::{Deserialize, Serialize};
29use std::path::{Path, PathBuf};
30use std::time::Duration;
31
/// Build a unified diff that swaps `old` for `new` inside `source`.
///
/// The diff targets `file_path` (rendered lossily as UTF-8) and carries up to three
/// lines of unchanged context on each side of the edited line, so `git apply` has
/// something to anchor on.
///
/// Behavior by case:
/// * `old` does not occur in `source`: a minimal single-line hunk (`@@ -1,1 +1,1 @@`)
///   is returned for the caller's own fallback handling.
/// * `old` occurs on a single line: the first such line is rewritten (every occurrence
///   of `old` on that line is replaced) and emitted with its surrounding context.
/// * `old` occurs in `source` but on no single line (it spans a line break): the same
///   minimal single-line hunk is emitted.
///
/// The returned patch always ends with a newline. `git apply` rejects a patch whose
/// final hunk line is unterminated, and the early fallback already ended with one.
fn generate_preamble_patch(file_path: &Path, source: &str, old: &str, new: &str) -> String {
    // Unchanged lines emitted before and after the edited line.
    const CONTEXT_LINES: usize = 3;

    let diff_path = file_path.to_string_lossy();

    if !source.contains(old) {
        // Fallback: still produce something the later fallback in apply_patch can use
        return format!(
            "diff --git a/{diff_path} b/{diff_path}\n--- a/{diff_path}\n+++ b/{diff_path}\n@@ -1,1 +1,1 @@\n-{old}\n+{new}\n"
        );
    }

    let lines: Vec<&str> = source.lines().collect();
    let mut patch_lines = vec![
        format!("diff --git a/{diff_path} b/{diff_path}"),
        format!("--- a/{diff_path}"),
        format!("+++ b/{diff_path}"),
    ];

    match lines.iter().position(|line| line.contains(old)) {
        Some(idx) => {
            // Clamp the context window to the file boundaries.
            let hunk_start = idx.saturating_sub(CONTEXT_LINES);
            let hunk_end = (idx + 1 + CONTEXT_LINES).min(lines.len());
            let context_before = &lines[hunk_start..idx];
            let context_after = &lines[idx + 1..hunk_end];

            // One line is replaced by one line, so both sides of the hunk have the
            // same start and the same length.
            let hunk_len = hunk_end - hunk_start;
            patch_lines.push(format!(
                "@@ -{},{} +{},{} @@",
                hunk_start + 1,
                hunk_len,
                hunk_start + 1,
                hunk_len
            ));

            patch_lines.extend(context_before.iter().map(|line| format!(" {line}")));
            patch_lines.push(format!("-{}", lines[idx]));
            patch_lines.push(format!("+{}", lines[idx].replace(old, new)));
            patch_lines.extend(context_after.iter().map(|line| format!(" {line}")));
        }
        None => {
            // `old` is present in `source` but spans a line break: very minimal fallback.
            patch_lines.push("@@ -1,1 +1,1 @@".to_string());
            patch_lines.push(format!("-{old}"));
            patch_lines.push(format!("+{new}"));
        }
    }

    // Terminate the final hunk line so the patch is well-formed for `git apply`.
    let mut patch = patch_lines.join("\n");
    patch.push('\n');
    patch
}
97
98/// Configuration for a single optimization run.
99#[derive(Debug, Clone, Serialize, Deserialize)]
100pub struct OptimizeConfig {
101    pub max_iterations: u32,
102    pub candidates_per_iteration: u32,
103    pub use_llm_judge: bool,
104    #[serde(default)]
105    pub budget: OptimizationBudget,
106    #[serde(default)]
107    pub hook_policy: HookPolicy,
108    /// When true, the optimizer will print proposed changes and wait for confirmation before applying (Phase 4 review gate).
109    #[serde(default)]
110    pub review_before_apply: bool,
111    /// When true, suppress all human progress output (used for --json mode).
112    #[serde(default)]
113    pub quiet: bool,
114    #[serde(skip, default = "default_candidate_timeout")]
115    pub candidate_timeout: Duration,
116}
117
118/// A single optimization experiment / iteration result.
119#[derive(Debug, Clone, Serialize, Deserialize)]
120pub struct OptimizationRun {
121    pub iteration: u32,
122    pub scores: Vec<f32>,
123    /// Number of changes that were fully validated in isolation
124    pub validated_changes: u32,
125    /// Number of changes that were successfully landed on the real agent tree
126    pub landed_changes: u32,
127    /// Number of changes that were accepted (landed + final validation + net-positive)
128    pub accepted_changes: u32,
129    pub notes: String,
130    pub candidates: Vec<Candidate>,
131    /// Optional unified diff of the last accepted change
132    #[serde(default)]
133    pub diff: Option<String>,
134    #[serde(default)]
135    pub policy_hash: Option<String>,
136    #[serde(default)]
137    pub dataset_version: Option<String>,
138    #[serde(default)]
139    pub dataset_hash: Option<String>,
140    // Net-positive evaluation (P1 stabilization)
141    #[serde(default)]
142    pub baseline_score: Option<f32>,
143    #[serde(default)]
144    pub patched_score: Option<f32>,
145    #[serde(default)]
146    pub score_delta: Option<f32>,
147
148    // Real provenance (P1 requirement) — populated when a change is accepted
149    #[serde(default)]
150    pub git_sha_before: Option<String>,
151    #[serde(default)]
152    pub git_sha_after: Option<String>,
153    #[serde(default)]
154    pub diff_hash: Option<String>,
155    #[serde(default)]
156    pub working_tree_dirty_after: Option<bool>,
157    #[serde(default)]
158    pub scorer: Option<String>,
159    #[serde(default)]
160    pub validation_commands: Option<Vec<String>>,
161    #[serde(default)]
162    pub validation_command_records: Vec<ValidationCommandRecord>,
163    #[serde(default)]
164    pub final_validation_command_records: Vec<ValidationCommandRecord>,
165    #[serde(default)]
166    pub trace_diagnosis: Vec<TraceDiagnosis>,
167    #[serde(default)]
168    pub hook_decisions: Vec<HookDecision>,
169    #[serde(default)]
170    pub ledger: Option<ExperimentLedger>,
171    #[serde(default)]
172    pub holdout_score: Option<f32>,
173    #[serde(default)]
174    pub budget: Option<OptimizationBudget>,
175    #[serde(default)]
176    pub policy_path: Option<String>,
177    #[serde(default)]
178    pub model: Option<ModelProvenance>,
179    #[serde(default)]
180    pub rollback_succeeded: Option<bool>,
181    #[serde(default)]
182    pub rollback_error: Option<String>,
183    #[serde(default)]
184    pub candidate_timed_out: bool,
185}
186
187#[derive(Debug, Clone, Serialize, Deserialize)]
188pub struct ModelProvenance {
189    pub role: String,
190    pub provider: String,
191    pub model: String,
192    pub used: bool,
193}
194
195/// A proposed improvement generated during an optimization iteration.
196#[derive(Debug, Clone, Serialize, Deserialize)]
197pub struct Candidate {
198    pub focus: String, // e.g. "system_prompt", "tool_descriptions", "reasoning_step"
199    pub description: String,
200    pub expected_improvement: String,
201    #[serde(default)]
202    pub strategy: Option<EditStrategy>,
203}
204
205#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
206pub enum EditStrategy {
207    SystemPrompt,
208    ToolDescription,
209    FallbackLogic,
210    OutputSchema,
211    ModelConfig,
212}
213
/// Serde default for `OptimizeConfig::candidate_timeout`: five minutes.
fn default_candidate_timeout() -> Duration {
    const DEFAULT_TIMEOUT_SECS: u64 = 5 * 60;
    Duration::from_secs(DEFAULT_TIMEOUT_SECS)
}
217
218/// Placeholder for the full optimization engine.
219/// In a real implementation this would orchestrate:
220/// - the runner
221/// - the analysis crate (for bundling + editing)
222/// - an LLM client (for diagnosis + candidate generation)
223/// - the safe editing/validation pipeline
224pub async fn run_optimization(
225    agent: &RegisteredAgent,
226    config: &OptimizeConfig,
227) -> anyhow::Result<Vec<OptimizationRun>> {
228    let mut runs = vec![];
229
230    let dataset = EvaluationDataset::synthetic_v1();
231    let split = split_dataset(&dataset, config.budget);
232    let mut ledger = ExperimentLedger::new(config.budget, &dataset, &split);
233    let dataset_hash = dataset.content_hash();
234    let scorer = ScorerMetadata::mechanical_v1();
235    let test_inputs: Vec<serde_json::Value> = split
236        .train
237        .iter()
238        .map(|sample| sample.input.clone())
239        .collect();
240    let holdout_inputs: Vec<serde_json::Value> = split
241        .holdout
242        .iter()
243        .map(|sample| sample.input.clone())
244        .collect();
245
246    // Baseline evaluation (computed once for net-positive comparison)
247    let baseline_score: f32 = {
248        let mut total = 0.0f32;
249        for input in &test_inputs {
250            if let Ok(res) = crate::runner::run_agent(agent, input.clone()).await {
251                total += mechanical_score(&res);
252            }
253        }
254        if test_inputs.is_empty() {
255            0.0
256        } else {
257            total / test_inputs.len() as f32
258        }
259    };
260
261    // Provenance: git sha before any optimization changes (P1 requirement)
262    let git_sha_before: Option<String> = std::process::Command::new("git")
263        .current_dir(&agent.path)
264        .args(["rev-parse", "--short", "HEAD"])
265        .output()
266        .ok()
267        .and_then(|o| {
268            if o.status.success() {
269                Some(String::from_utf8_lossy(&o.stdout).trim().to_string())
270            } else {
271                None
272            }
273        });
274    let policy_info = load_policy_info(&agent.name);
275
276    for iteration in 0..config.max_iterations {
277        let mut scores_this_iter = vec![];
278        let mut accepted_patched: Option<f32> = None;
279        let mut accepted_delta: Option<f32> = None;
280        let mut validated = 0;
281        let mut landed = 0;
282        let mut trace_diagnoses = Vec::new();
283        let mut hook_decisions = Vec::new();
284        let mut accepted_holdout_score = None;
285        let mut accepted_validation_commands = Vec::new();
286        let mut accepted_final_validation_commands = Vec::new();
287        let mut accepted_rollback_succeeded = None;
288        let mut accepted_rollback_error = None;
289        let mut any_candidate_timed_out = false;
290
291        for input in &test_inputs {
292            let run_result = crate::runner::run_agent(agent, input.clone()).await?;
293            trace_diagnoses.push(diagnose_run(&run_result));
294            let score = mechanical_score(&run_result);
295            scores_this_iter.push(score);
296        }
297
298        let avg_score: f32 = if scores_this_iter.is_empty() {
299            0.0
300        } else {
301            scores_this_iter.iter().sum::<f32>() / scores_this_iter.len() as f32
302        };
303
304        // Rich analysis: extract real preambles, tools, entrypoints
305        let rich_bundle = mdx_rust_analysis::analyze_agent(&agent.path, None).ok();
306        let file_count = rich_bundle
307            .as_ref()
308            .map(|b| b.scope.optimizable_paths.len())
309            .unwrap_or(0);
310
311        // Build a high-signal summary for the LLM
312        let bundle_summary = if let Some(ref b) = rich_bundle {
313            let mut s = format!(
314                "{} source files, Rig agent = {}",
315                file_count, b.is_rig_agent
316            );
317            if !b.preambles.is_empty() {
318                s.push_str(&format!(
319                    ", current preambles: {:?}",
320                    b.preambles.iter().map(|p| &p.text).collect::<Vec<_>>()
321                ));
322            }
323            if !b.tools.is_empty() {
324                s.push_str(&format!(
325                    ", tools: {:?}",
326                    b.tools.iter().map(|t| &t.name).collect::<Vec<_>>()
327                ));
328            }
329            s
330        } else {
331            format!("{} source files (limited analysis)", file_count)
332        };
333
334        let llm = crate::llm::LlmClient::default();
335        let diag_req = crate::llm::DiagnosisRequest {
336            policy: "Improve the agent so it gives high-quality, reasoned answers instead of echoing. Prefer explicit step-by-step reasoning in the system prompt.".to_string(),
337            bundle_summary,
338            traces_summary: summarize_trace_diagnoses(&trace_diagnoses),
339            scores: scores_this_iter.clone(),
340        };
341
342        let diagnosis_result = llm.diagnose(diag_req).await;
343        let diagnosis_model_used = diagnosis_result.is_ok();
344        let diagnosis = diagnosis_result.ok();
345
346        let mut candidates = vec![];
347        let mut accepted = 0;
348        let mut notes = format!(
349            "Avg score this iter: {:.2} ({} files in bundle)",
350            avg_score, file_count
351        );
352        let mut accepted_diff: Option<String> = None;
353
354        if let Some(d) = diagnosis {
355            notes.push_str(&format!(" → LLM: {}", d.summary));
356            for c in d.candidates {
357                let strategy = strategy_for_focus(&c.focus);
358                candidates.push(Candidate {
359                    focus: c.focus,
360                    description: c.description,
361                    expected_improvement: c.expected_improvement,
362                    strategy: Some(strategy),
363                });
364            }
365        } else {
366            candidates = fallback_candidates_from_trace(&trace_diagnoses);
367        }
368
369        if !candidates.is_empty() {
370            let candidate_limit = config
371                .budget
372                .candidate_limit(config.candidates_per_iteration);
373            for (candidate_index, candidate) in candidates.iter().take(candidate_limit).enumerate()
374            {
375                if accepted > 0 {
376                    break;
377                }
378
379                let Some(edit) =
380                    build_edit_for_candidate(&agent.path, rich_bundle.as_ref(), candidate)?
381                else {
382                    notes.push_str(&format!(
383                        " (candidate {} skipped: no safe edit plan for {:?})",
384                        candidate.focus, candidate.strategy
385                    ));
386                    continue;
387                };
388
389                notes.push_str(&format!(
390                    " → Candidate {}: {} ({:?})",
391                    candidate_index + 1,
392                    candidate.focus,
393                    candidate.strategy
394                ));
395
396                ledger.record_variant(PromptVariantRecord::from_patch(
397                    format!("{:?}", candidate.strategy),
398                    edit.file.display().to_string(),
399                    edit.description.clone(),
400                    &edit.patch,
401                ));
402
403                let outcome = execute_candidate_edit(CandidateExecutionContext {
404                    agent,
405                    config: CandidateExecutionConfig {
406                        hook_policy: &config.hook_policy,
407                        review_before_apply: config.review_before_apply,
408                        quiet: config.quiet,
409                        candidate_timeout: config.candidate_timeout,
410                    },
411                    iteration,
412                    candidate_index,
413                    edit: &edit,
414                    test_inputs: &test_inputs,
415                    holdout_inputs: &holdout_inputs,
416                    baseline_score,
417                    scorer: mechanical_score,
418                })
419                .await;
420
421                validated += outcome.validated;
422                landed += outcome.landed;
423                any_candidate_timed_out |= outcome.timed_out;
424                hook_decisions.extend(outcome.hook_decisions);
425
426                if outcome.accepted > 0 {
427                    accepted = outcome.accepted;
428                    accepted_diff = outcome.accepted_diff;
429                    accepted_patched = outcome.patched_score;
430                    accepted_delta = outcome.delta;
431                    accepted_holdout_score = outcome.holdout_score;
432                    accepted_validation_commands = outcome.validation_commands;
433                    accepted_final_validation_commands = outcome.final_validation_commands;
434                    accepted_rollback_succeeded = outcome.rollback_succeeded;
435                    accepted_rollback_error = outcome.rollback_error;
436                }
437
438                notes.push_str(&outcome.note);
439            }
440        } else {
441            accepted = 0; // No change was proposed or needed
442            notes.push_str(" → No new candidates — current behavior is good (no change applied)");
443        }
444
445        let (run_baseline, run_patched, run_delta) = if accepted > 0 {
446            (Some(baseline_score), accepted_patched, accepted_delta)
447        } else {
448            (None, None, None)
449        };
450
451        // Populate real provenance when we accepted a change (P1)
452        let (prov_before, prov_after, prov_diff_hash, prov_dirty, prov_scorer, prov_cmds) =
453            if accepted > 0 {
454                let after = std::process::Command::new("git")
455                    .current_dir(&agent.path)
456                    .args(["rev-parse", "--short", "HEAD"])
457                    .output()
458                    .ok()
459                    .and_then(|o| {
460                        if o.status.success() {
461                            Some(String::from_utf8_lossy(&o.stdout).trim().to_string())
462                        } else {
463                            None
464                        }
465                    });
466                let dirty_after = std::process::Command::new("git")
467                    .current_dir(&agent.path)
468                    .args(["status", "--porcelain"])
469                    .output()
470                    .ok()
471                    .filter(|output| output.status.success())
472                    .map(|output| !output.stdout.is_empty());
473
474                (
475                    git_sha_before.clone(),
476                    after,
477                    accepted_diff
478                        .as_ref()
479                        .map(|diff| stable_hash_hex(diff.as_bytes())),
480                    dirty_after,
481                    Some(scorer.label()),
482                    Some(vec![
483                        "cargo check (isolated)".to_string(),
484                        "cargo clippy -D warnings (isolated)".to_string(),
485                        "final validate_build after land (real tree)".to_string(),
486                    ]),
487                )
488            } else {
489                (None, None, None, None, None, None)
490            };
491
492        runs.push(OptimizationRun {
493            iteration,
494            scores: scores_this_iter,
495            validated_changes: validated,
496            landed_changes: landed,
497            accepted_changes: accepted,
498            notes,
499            candidates,
500            diff: accepted_diff,
501            policy_hash: policy_info.as_ref().map(|policy| policy.hash.clone()),
502            dataset_version: Some(dataset.version.clone()),
503            dataset_hash: Some(dataset_hash.clone()),
504            baseline_score: run_baseline,
505            patched_score: run_patched,
506            score_delta: run_delta,
507            git_sha_before: prov_before,
508            git_sha_after: prov_after,
509            diff_hash: prov_diff_hash,
510            working_tree_dirty_after: prov_dirty,
511            scorer: prov_scorer,
512            validation_commands: prov_cmds,
513            validation_command_records: accepted_validation_commands,
514            final_validation_command_records: accepted_final_validation_commands,
515            trace_diagnosis: trace_diagnoses,
516            hook_decisions,
517            ledger: Some(ledger.clone()),
518            holdout_score: accepted_holdout_score,
519            budget: Some(config.budget),
520            policy_path: policy_info
521                .as_ref()
522                .map(|policy| policy.path.display().to_string()),
523            model: Some(llm.provenance(diagnosis_model_used)),
524            rollback_succeeded: accepted_rollback_succeeded,
525            rollback_error: accepted_rollback_error,
526            candidate_timed_out: any_candidate_timed_out,
527        });
528
529        if accepted > 0 && iteration > 0 {
530            // In real version we'd apply a safe edit here
531        }
532    }
533
534    // Persist this optimization experiment under the agent's directory
535    let experiment_dir = std::env::current_dir()?
536        .join(".mdx-rust")
537        .join("agents")
538        .join(&agent.name)
539        .join("experiments");
540
541    std::fs::create_dir_all(&experiment_dir).ok();
542
543    let timestamp = std::time::SystemTime::now()
544        .duration_since(std::time::UNIX_EPOCH)
545        .map(|d| d.as_secs())
546        .unwrap_or(0);
547
548    let experiment_file = experiment_dir.join(format!("run-{}.json", timestamp));
549    if let Ok(content) = serde_json::to_string_pretty(&runs) {
550        let _ = std::fs::write(experiment_file, content);
551    }
552
553    // Also write a rich human-readable report with provenance
554    if runs.iter().any(|r| r.accepted_changes > 0) {
555        let git_sha = std::process::Command::new("git")
556            .args(["rev-parse", "--short", "HEAD"])
557            .output()
558            .ok()
559            .and_then(|o| String::from_utf8(o.stdout).ok())
560            .map(|s| s.trim().to_string())
561            .unwrap_or_else(|| "unknown".to_string());
562
563        let mut report = format!(
564            "# Optimization Report for '{}'\n\nTimestamp: {}\nGit SHA: {}\n\n## Summary\n\n",
565            agent.name, timestamp, git_sha
566        );
567
568        for run in &runs {
569            if run.accepted_changes > 0 {
570                report.push_str(&format!(
571                    "- Iteration {}: Accepted {} change(s)\n  Notes: {}\n",
572                    run.iteration, run.accepted_changes, run.notes
573                ));
574
575                if let Some(d) = &run.diff {
576                    report.push_str(&format!("\n```diff\n{}\n```\n", d));
577                } else {
578                    report.push_str("  (Change persisted to src/main.rs)\n");
579                }
580
581                if let Some(h) = &run.policy_hash {
582                    report.push_str(&format!("  Policy hash: {}\n", h));
583                }
584                if let Some(v) = &run.dataset_version {
585                    report.push_str(&format!("  Dataset version: {}\n", v));
586                }
587                if let Some(path) = &run.policy_path {
588                    report.push_str(&format!("  Policy path: {}\n", path));
589                }
590                if let Some(model) = &run.model {
591                    report.push_str(&format!(
592                        "  Diagnosis model: {}:{} (used={})\n",
593                        model.provider, model.model, model.used
594                    ));
595                }
596                if !run.validation_command_records.is_empty() {
597                    report.push_str("  Isolated validation commands:\n");
598                    for command in &run.validation_command_records {
599                        report.push_str(&format!(
600                            "    - {} | success={} | timeout={} | status={:?} | duration_ms={}\n",
601                            command.command,
602                            command.success,
603                            command.timed_out,
604                            command.status_code,
605                            command.duration_ms
606                        ));
607                    }
608                }
609                if !run.final_validation_command_records.is_empty() {
610                    report.push_str("  Final validation commands:\n");
611                    for command in &run.final_validation_command_records {
612                        report.push_str(&format!(
613                            "    - {} | success={} | timeout={} | status={:?} | duration_ms={}\n",
614                            command.command,
615                            command.success,
616                            command.timed_out,
617                            command.status_code,
618                            command.duration_ms
619                        ));
620                    }
621                }
622            }
623        }
624
625        report.push_str("\n## Candidates Considered\n\n");
626        for run in &runs {
627            for (i, c) in run.candidates.iter().enumerate() {
628                report.push_str(&format!(
629                    "- [{}] {}: {}\n  Expected: {}\n\n",
630                    i + 1,
631                    c.focus,
632                    c.description,
633                    c.expected_improvement
634                ));
635            }
636        }
637
638        let _ = std::fs::write(
639            experiment_dir.join(format!("report-{}.md", timestamp)),
640            report,
641        );
642    }
643
644    // Final re-evaluation after any accepted changes (shows the win)
645    if runs.iter().any(|r| r.accepted_changes > 0) {
646        let mut final_scores = vec![];
647        for input in &test_inputs {
648            if let Ok(res) = crate::runner::run_agent(agent, input.clone()).await {
649                final_scores.push(mechanical_score(&res));
650            }
651        }
652        if !final_scores.is_empty() {
653            let final_avg = final_scores.iter().sum::<f32>() / final_scores.len() as f32;
654            if !config.quiet {
655                println!(
656                    "   Final re-evaluation after accepted changes: {:.2}",
657                    final_avg
658                );
659            }
660        }
661    }
662
663    Ok(runs)
664}
665
/// Location and content hash of the policy file picked up for an agent.
#[derive(Clone, Debug)]
struct PolicyInfo {
    /// Path of the policy file that was read.
    path: PathBuf,
    /// Hash of the file's raw bytes.
    hash: String,
}
671
672fn load_policy_info(agent_name: &str) -> Option<PolicyInfo> {
673    let cwd = std::env::current_dir().ok()?;
674    let candidates = [
675        cwd.join(".mdx-rust")
676            .join("agents")
677            .join(agent_name)
678            .join("policies.md"),
679        cwd.join(".mdx-rust").join("policies.md"),
680    ];
681
682    candidates
683        .iter()
684        .find_map(|path| std::fs::read(path).ok().map(|content| (path, content)))
685        .map(|(path, content)| PolicyInfo {
686            path: path.clone(),
687            hash: stable_hash_hex(&content),
688        })
689}
690
691fn stable_hash_hex(bytes: &[u8]) -> String {
692    crate::eval::stable_hash_hex(bytes)
693}
694
695fn strategy_for_focus(focus: &str) -> EditStrategy {
696    let normalized = focus.to_lowercase();
697
698    if normalized.contains("tool") {
699        EditStrategy::ToolDescription
700    } else if normalized.contains("fallback") || normalized.contains("logic") {
701        EditStrategy::FallbackLogic
702    } else if normalized.contains("schema") || normalized.contains("output") {
703        EditStrategy::OutputSchema
704    } else if normalized.contains("model") || normalized.contains("temperature") {
705        EditStrategy::ModelConfig
706    } else {
707        EditStrategy::SystemPrompt
708    }
709}
710
711fn fallback_candidates_from_trace(diagnoses: &[TraceDiagnosis]) -> Vec<Candidate> {
712    let mut candidates = Vec::new();
713
714    if diagnoses.iter().any(|diagnosis| {
715        diagnosis
716            .signals
717            .iter()
718            .any(|signal| signal.kind == FailureKind::EchoFallback)
719    }) {
720        candidates.push(Candidate {
721            focus: "fallback_logic".to_string(),
722            description: "Prevent echo fallback and require a useful best-effort answer."
723                .to_string(),
724            expected_improvement: "Reduce low-value echo responses.".to_string(),
725            strategy: Some(EditStrategy::FallbackLogic),
726        });
727    }
728
729    if diagnoses.iter().any(|diagnosis| {
730        diagnosis
731            .signals
732            .iter()
733            .any(|signal| signal.kind == FailureKind::InvalidJson)
734    }) {
735        candidates.push(Candidate {
736            focus: "output_schema".to_string(),
737            description: "Make the output contract explicit for answer, reasoning, and confidence."
738                .to_string(),
739            expected_improvement: "Improve parseability for agent callers.".to_string(),
740            strategy: Some(EditStrategy::OutputSchema),
741        });
742    }
743
744    if diagnoses.iter().any(|diagnosis| {
745        diagnosis.signals.iter().any(|signal| {
746            matches!(
747                signal.kind,
748                FailureKind::MissingReasoning | FailureKind::LowConfidence
749            )
750        })
751    }) {
752        candidates.push(Candidate {
753            focus: "system_prompt".to_string(),
754            description: "Strengthen the system prompt with explicit reasoning instructions."
755                .to_string(),
756            expected_improvement: "Increase reasoning quality and confidence.".to_string(),
757            strategy: Some(EditStrategy::SystemPrompt),
758        });
759    }
760
761    if candidates.is_empty() {
762        candidates.push(Candidate {
763            focus: "system_prompt".to_string(),
764            description: "Strengthen the system prompt with explicit reasoning instructions."
765                .to_string(),
766            expected_improvement: "Improve answer quality.".to_string(),
767            strategy: Some(EditStrategy::SystemPrompt),
768        });
769    }
770
771    candidates
772}
773
774fn summarize_trace_diagnoses(diagnoses: &[TraceDiagnosis]) -> String {
775    let mut summaries = Vec::new();
776
777    for diagnosis in diagnoses {
778        if diagnosis.has_failures() {
779            summaries.push(diagnosis.compact_summary());
780        }
781    }
782
783    if summaries.is_empty() {
784        "No obvious trace failures detected.".to_string()
785    } else {
786        format!("Trace failures: {}", summaries.join(" | "))
787    }
788}
789
790fn build_edit_for_candidate(
791    agent_root: &Path,
792    bundle: Option<&AgentBundle>,
793    candidate: &Candidate,
794) -> anyhow::Result<Option<ProposedEdit>> {
795    let strategy = candidate
796        .strategy
797        .clone()
798        .unwrap_or_else(|| strategy_for_focus(&candidate.focus));
799
800    let Some((target_file, old_preamble)) = select_preamble_target(agent_root, bundle) else {
801        if strategy == EditStrategy::FallbackLogic {
802            return build_echo_fallback_edit(agent_root, bundle, &candidate.description);
803        }
804        return Ok(None);
805    };
806
807    if strategy == EditStrategy::FallbackLogic {
808        if let Some(edit) = build_echo_fallback_edit(agent_root, bundle, &candidate.description)? {
809            return Ok(Some(edit));
810        }
811    }
812
813    let Some(new_preamble) = evolved_preamble_for_strategy(&old_preamble, &strategy, bundle) else {
814        return Ok(None);
815    };
816
817    if normalize_prompt(&new_preamble) == normalize_prompt(&old_preamble) {
818        return Ok(None);
819    }
820
821    let content = std::fs::read_to_string(&target_file)?;
822    let relative_target = target_file
823        .strip_prefix(agent_root)
824        .unwrap_or(&target_file)
825        .to_path_buf();
826    let patch = generate_preamble_patch(&relative_target, &content, &old_preamble, &new_preamble);
827
828    Ok(Some(ProposedEdit {
829        file: target_file,
830        description: format!("{:?}: {}", strategy, candidate.description),
831        patch,
832    }))
833}
834
835fn build_echo_fallback_edit(
836    agent_root: &Path,
837    bundle: Option<&AgentBundle>,
838    description: &str,
839) -> anyhow::Result<Option<ProposedEdit>> {
840    let mut candidates: Vec<PathBuf> = bundle
841        .map(|bundle| {
842            bundle
843                .scope
844                .optimizable_paths
845                .iter()
846                .filter(|path| path.extension().is_some_and(|extension| extension == "rs"))
847                .cloned()
848                .collect()
849        })
850        .unwrap_or_default();
851
852    if candidates.is_empty() {
853        candidates.push(agent_root.join("src/main.rs"));
854    }
855
856    for target_file in candidates {
857        let Ok(content) = std::fs::read_to_string(&target_file) else {
858            continue;
859        };
860
861        let replacements = [
862            (
863                "Echo: {}",
864                "Best-effort answer after reasoning: {}",
865                "replace echo fallback format string",
866            ),
867            (
868                "Echo: ",
869                "Best-effort answer after reasoning: ",
870                "replace echo fallback prefix",
871            ),
872        ];
873
874        for (old, new, label) in replacements {
875            if !content.contains(old) {
876                continue;
877            }
878
879            let relative_target = target_file
880                .strip_prefix(agent_root)
881                .unwrap_or(&target_file)
882                .to_path_buf();
883            let patch = generate_preamble_patch(&relative_target, &content, old, new);
884
885            return Ok(Some(ProposedEdit {
886                file: target_file,
887                description: format!("FallbackLogic: {description} ({label})"),
888                patch,
889            }));
890        }
891    }
892
893    Ok(None)
894}
895
896fn select_preamble_target(
897    agent_root: &Path,
898    bundle: Option<&AgentBundle>,
899) -> Option<(PathBuf, String)> {
900    if let Some(prompt) = bundle.and_then(|bundle| bundle.preambles.first()) {
901        return Some((PathBuf::from(&prompt.file), prompt.text.clone()));
902    }
903
904    let target = bundle
905        .and_then(|bundle| {
906            bundle.scope.optimizable_paths.iter().find(|path| {
907                let name = path.file_name().unwrap_or_default().to_string_lossy();
908                name.ends_with(".rs") && (name == "main.rs" || name.contains("agent"))
909            })
910        })
911        .cloned()
912        .unwrap_or_else(|| agent_root.join("src/main.rs"));
913
914    let content = std::fs::read_to_string(&target).ok()?;
915    extract_first_preamble_literal(&content).map(|prompt| (target, prompt))
916}
917
/// Extracts the text of the first `.preamble("...")` string literal in `content`.
///
/// The returned text is the raw source between the quotes: escape sequences
/// such as `\"` or `\\` are kept exactly as written, so the result can still be
/// located verbatim in `content` when building a patch.
///
/// Returns `None` when there is no `.preamble("` marker or the literal is never
/// closed. Raw string literals (`r"..."`, `r#"..."#`) are not recognised.
fn extract_first_preamble_literal(content: &str) -> Option<String> {
    let marker = ".preamble(\"";
    let start = content.find(marker)? + marker.len();
    let rest = &content[start..];

    // Walk to the first quote that is not preceded by an escaping backslash;
    // stopping at the first `"` regardless would truncate literals that
    // contain `\"`.
    let mut escaped = false;
    for (index, ch) in rest.char_indices() {
        if escaped {
            escaped = false;
        } else if ch == '\\' {
            escaped = true;
        } else if ch == '"' {
            return Some(rest[..index].to_string());
        }
    }

    None
}
925
926fn evolved_preamble_for_strategy(
927    old: &str,
928    strategy: &EditStrategy,
929    bundle: Option<&AgentBundle>,
930) -> Option<String> {
931    let addition = match strategy {
932        EditStrategy::SystemPrompt => {
933            "Think step-by-step before answering. Always explain your reasoning in one sentence, then give the final answer."
934        }
935        EditStrategy::FallbackLogic => {
936            "Never echo the user input as the final answer. If uncertain, state assumptions, reason briefly, and provide the best useful answer."
937        }
938        EditStrategy::OutputSchema => {
939            "Always produce an answer, reasoning, and confidence from 0 to 1."
940        }
941        EditStrategy::ToolDescription => {
942            let has_tools = bundle.is_some_and(|bundle| !bundle.tools.is_empty());
943            if !has_tools {
944                return None;
945            }
946            "Before answering, decide whether available tools improve factuality or completeness, and only use them when they add real value."
947        }
948        EditStrategy::ModelConfig => return None,
949    };
950
951    if normalize_prompt(old).contains(&normalize_prompt(addition)) {
952        return Some(old.to_string());
953    }
954
955    let mut base = old.trim().trim_end_matches('.').to_string();
956    if base.is_empty() {
957        base = "You are a concise, helpful assistant".to_string();
958    }
959    Some(format!("{base}. {addition}"))
960}
961
/// Canonical form of a prompt for comparisons: whitespace runs collapsed to a
/// single space, leading/trailing whitespace dropped, everything lowercased.
fn normalize_prompt(value: &str) -> String {
    let mut collapsed = String::with_capacity(value.len());

    for word in value.split_whitespace() {
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }

    collapsed.to_lowercase()
}
969
970/// Very rough mechanical scorer for the example agent.
971/// Gives higher score if the output is not the echo fallback.
972pub fn mechanical_score(result: &AgentRunResult) -> f32 {
973    let answer = result
974        .output
975        .get("answer")
976        .and_then(|v| v.as_str())
977        .unwrap_or("");
978    let reasoning = result
979        .output
980        .get("reasoning")
981        .and_then(|v| v.as_str())
982        .unwrap_or("");
983
984    if answer.starts_with("Echo:") {
985        return 0.4;
986    }
987
988    let mut score = 0.75f32;
989
990    // Bonus for explicit reasoning language (the improvement the optimizer tries to install)
991    if reasoning.to_lowercase().contains("think")
992        || reasoning.to_lowercase().contains("reason")
993        || reasoning.to_lowercase().contains("step")
994    {
995        score += 0.12;
996    }
997
998    // Bonus for non-trivial answer length
999    if answer.len() > 20 {
1000        score += 0.08;
1001    }
1002
1003    score.min(0.95)
1004}
1005
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::tempdir;

    /// Minimal agent source whose builder call carries a preamble literal.
    const PREAMBLE_AGENT_SOURCE: &str =
        r#"fn main() { let _agent = client.agent("m").preamble("You are helpful.").build(); }"#;

    /// Minimal agent source that only contains an echo fallback format string.
    const ECHO_AGENT_SOURCE: &str =
        r#"fn main() { println!("{}", format!("Echo: {}", "hello")); }"#;

    /// Writes `source` to `<root>/src/main.rs` (creating `src/`) and returns the path.
    fn write_main_rs(root: &Path, source: &str) -> PathBuf {
        let src_dir = root.join("src");
        std::fs::create_dir_all(&src_dir).unwrap();
        let main_rs = src_dir.join("main.rs");
        std::fs::write(&main_rs, source).unwrap();
        main_rs
    }

    /// Builds a candidate with an explicit strategy.
    fn candidate_with(
        focus: &str,
        description: &str,
        expected_improvement: &str,
        strategy: EditStrategy,
    ) -> Candidate {
        Candidate {
            focus: focus.to_string(),
            description: description.to_string(),
            expected_improvement: expected_improvement.to_string(),
            strategy: Some(strategy),
        }
    }

    #[test]
    fn test_mechanical_score_echo_vs_reasoned() {
        let echo = AgentRunResult {
            output: serde_json::json!({"answer": "Echo: hello", "reasoning": "no key"}),
            duration_ms: 10,
            success: true,
            error: None,
            traces: vec![],
        };
        let good = AgentRunResult {
            output: serde_json::json!({"answer": "The answer is 42 because...", "reasoning": "Think step by step: 6*7"}),
            duration_ms: 120,
            success: true,
            error: None,
            traces: vec![],
        };

        let echo_score = mechanical_score(&echo);
        let good_score = mechanical_score(&good);

        assert!(echo_score < 0.5);
        assert!(good_score > 0.8);
    }

    #[test]
    fn test_optimize_config_defaults() {
        let cfg = OptimizeConfig {
            max_iterations: 1,
            candidates_per_iteration: 1,
            use_llm_judge: false,
            budget: OptimizationBudget::Medium,
            hook_policy: HookPolicy::default(),
            review_before_apply: false,
            quiet: false,
            candidate_timeout: default_candidate_timeout(),
        };

        assert_eq!(cfg.max_iterations, 1);
    }

    #[test]
    fn strategy_for_focus_maps_common_candidate_names() {
        let expectations = [
            ("improve tool descriptions", EditStrategy::ToolDescription),
            ("fix fallback logic", EditStrategy::FallbackLogic),
            ("tighten output schema", EditStrategy::OutputSchema),
            ("lower model temperature", EditStrategy::ModelConfig),
            ("reasoning", EditStrategy::SystemPrompt),
        ];

        for (focus, expected) in expectations {
            assert_eq!(strategy_for_focus(focus), expected, "focus: {focus}");
        }
    }

    #[test]
    fn fallback_candidates_follow_trace_failures() {
        let signal = |kind: FailureKind, evidence: &str| crate::FailureSignal {
            kind,
            severity: 2,
            evidence: evidence.to_string(),
            span_id: None,
        };
        let diagnosis = TraceDiagnosis {
            signals: vec![
                signal(FailureKind::EchoFallback, "Echo: hello"),
                signal(FailureKind::InvalidJson, "raw stdout"),
            ],
            ranked_span_ids: vec![],
        };

        let candidates = fallback_candidates_from_trace(&[diagnosis]);

        // The echo fallback is addressed first; the schema fix must also be present.
        assert_eq!(candidates[0].strategy, Some(EditStrategy::FallbackLogic));
        let has_schema_candidate = candidates
            .iter()
            .any(|candidate| candidate.strategy == Some(EditStrategy::OutputSchema));
        assert!(has_schema_candidate);
    }

    #[test]
    fn build_edit_for_candidate_creates_schema_preamble_patch() {
        let dir = tempdir().unwrap();
        let main = write_main_rs(dir.path(), PREAMBLE_AGENT_SOURCE);
        let candidate = candidate_with(
            "output_schema",
            "make output contract explicit",
            "more parseable output",
            EditStrategy::OutputSchema,
        );

        let edit = build_edit_for_candidate(dir.path(), None, &candidate)
            .unwrap()
            .expect("schema strategy should produce a prompt edit");

        assert_eq!(edit.file, main);
        assert!(edit.patch.contains("answer, reasoning, and confidence"));
    }

    #[test]
    fn tool_strategy_requires_discovered_tools() {
        let dir = tempdir().unwrap();
        let main = write_main_rs(dir.path(), PREAMBLE_AGENT_SOURCE);
        let candidate = candidate_with(
            "tool_description",
            "clarify tool use",
            "better tool calls",
            EditStrategy::ToolDescription,
        );

        // Without a bundle there are no known tools, so no edit is proposed.
        let without_tools = build_edit_for_candidate(dir.path(), None, &candidate).unwrap();
        assert!(without_tools.is_none());

        let bundle = AgentBundle {
            scope: mdx_rust_analysis::BundleScope {
                optimizable_paths: vec![main],
                read_only_paths: vec![],
            },
            preambles: vec![],
            tools: vec![mdx_rust_analysis::ExtractedTool {
                file: "src/main.rs".to_string(),
                name: "search".to_string(),
                description: None,
            }],
            is_rig_agent: true,
            key_files: vec![],
        };

        let with_tools = build_edit_for_candidate(dir.path(), Some(&bundle), &candidate)
            .unwrap()
            .expect("tool strategy should produce a prompt edit when tools exist");
        assert!(with_tools
            .patch
            .contains("available tools improve factuality"));
    }

    #[test]
    fn fallback_logic_strategy_can_patch_echo_fallback() {
        let dir = tempdir().unwrap();
        let main = write_main_rs(dir.path(), ECHO_AGENT_SOURCE);
        let candidate = candidate_with(
            "fallback_logic",
            "avoid echo fallback",
            "more useful fallback",
            EditStrategy::FallbackLogic,
        );

        let edit = build_edit_for_candidate(dir.path(), None, &candidate)
            .unwrap()
            .expect("fallback logic should patch simple echo fallback");

        assert_eq!(edit.file, main);
        assert!(edit.patch.contains("Best-effort answer after reasoning"));
    }
}
mdx_rust_core/optimizer.rs

mdx_rust_core/
optimizer.rs