// ripmap 0.1.0 - Docs.rs

//! Benchmark CLI for hyperparameter optimization.
//!
//! # Usage
//!
//! ```bash
//! # Single repo for testing
//! ripmap-train --repo /path/to/repo --budget 50
//!
//! # Quick corpus (5 repos) for dev iteration
//! ripmap-train --corpus quick --strategy bayesian --budget 100
//!
//! # Full curated corpus for training
//! ripmap-train --corpus curated --strategy bayesian --budget 500
//!
//! # Reasoning-based training (uses LLM to analyze failures)
//! ripmap-train --corpus curated --reason --prompt training-outer/prompts/inner/v001.md --episodes 50
//!
//! # Sensitivity analysis on best config
//! ripmap-train --repo /path/to/repo --sensitivity --config best_config.json
//! ```
//!
//! # Output
//!
//! The benchmark produces:
//! - `results.json` (the `--output` path): Full results with all parameter evaluations,
//!   including the sensitivity analysis when `--sensitivity` is passed
//! - `results.best.json` (the `--output` path with its extension replaced by `best.json`):
//!   Best parameter configuration found, for production use
//! - Console summary with top configurations and insights

use std::fs::File;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::time::Instant;

use chrono::{DateTime, Local, TimeZone};
use clap::Parser;
use rand::SeedableRng;
use rayon::prelude::*;
use serde::{Deserialize, Serialize};

use ripmap::training::{
    // Reasoning-based training
    Agent,
    CURATED_REPOS,
    CaseMetrics,
    EvalMetrics,
    ParameterGrid,
    ParameterPoint,
    RankingFailure,
    RepoSpec,
    Scratchpad,
    SearchStrategy,
    SensitivityAnalysis,
    WeightedCase,
    apply_changes,
    bayesian_next_sample,
    compute_coupling_weights,
    distill_scratchpad,
    extract_cases,
    full_analysis,
    print_scratchpad_summary,
    print_summary,
    quick_repos,
    reason_about_failures,
    sample_points,
    update_scratchpad,
    weight_cases,
};

// Command-line arguments for the training/benchmark binary.
//
// The `///` comments on the fields below are not just documentation: clap's
// derive macro turns them into the `--help` text, so edits to them change
// user-visible output.
//
// NOTE(review): the command name below is "ripmap-bench" while the module-level
// usage examples invoke `ripmap-train` — confirm which name is intended.
#[derive(Parser, Debug)]
#[command(name = "ripmap-bench")]
#[command(about = "Hyperparameter optimization for ripmap ranking")]
struct Args {
    /// Path to a single repository to benchmark
    #[arg(long)]
    repo: Option<PathBuf>,

    /// Corpus to use: quick (5 repos) or curated (full set)
    /// Examples: --corpus quick, --corpus curated
    // Custom corpus list files are not supported yet: `main` rejects any value
    // other than "quick" or "curated".
    #[arg(long)]
    corpus: Option<String>,

    /// Search strategy: grid, lhs, random, bayesian
    #[arg(long, default_value = "lhs")]
    strategy: String,

    /// Number of parameter configurations to evaluate
    #[arg(long, default_value = "100")]
    budget: usize,

    /// Random seed for reproducibility
    #[arg(long, default_value = "42")]
    seed: u64,

    /// Output file for results
    #[arg(long, default_value = "training/runs/default/results.json")]
    output: PathBuf,

    /// Run sensitivity analysis after optimization
    #[arg(long)]
    sensitivity: bool,

    /// Load existing config for sensitivity analysis
    #[arg(long)]
    config: Option<PathBuf>,

    /// Only extract training cases, don't optimize
    #[arg(long)]
    extract_only: bool,

    /// Maximum commits to analyze per repo
    #[arg(long, default_value = "500")]
    max_commits: usize,

    /// Minimum files per commit for training case
    #[arg(long, default_value = "2")]
    min_files: usize,

    /// Maximum files per commit for training case
    #[arg(long, default_value = "12")]
    max_files: usize,

    /// Directory to clone curated repos into
    #[arg(long, default_value = "./training/corpus")]
    clone_dir: PathBuf,

    /// Verbose output
    #[arg(short, long)]
    verbose: bool,

    /// Path to semantic distractors JSON (generated by Claude)
    /// Format: [{"repo": "name", "cases": [{"seed_file": "...", "semantic_distractors": [...]}]}]
    #[arg(long)]
    distractors: Option<PathBuf>,

    // === Reasoning-Based Training ===
    /// Path to prompt template file (required for --reason mode)
    /// Contains placeholders: {current_ndcg:.4}, {episode_num}, {episode_history}, {params_desc}, {failure_desc}
    #[arg(long)]
    prompt: Option<PathBuf>,

    /// Use reasoning-based training via Claude (universal function approximator)
    /// Instead of blind parameter search, Claude analyzes WHY rankings fail
    #[arg(long)]
    reason: bool,

    /// Number of reasoning episodes to run
    #[arg(long, default_value = "10")]
    episodes: usize,

    /// NDCG threshold below which a case is considered a failure
    #[arg(long, default_value = "0.5")]
    failure_threshold: f64,

    /// Path to scratchpad file (accumulated insights)
    #[arg(long, default_value = "training/runs/default/scratchpad.json")]
    scratchpad: PathBuf,

    /// Distill scratchpad into operator wisdom (presets, heuristics, warnings)
    #[arg(long)]
    distill: bool,

    /// Generate training progress chart (requires --features plotters)
    #[arg(long)]
    plot: Option<PathBuf>,

    /// LLM agent to use for reasoning: 'claude' (default), 'gemini', or 'codex'
    #[arg(long, default_value = "claude")]
    agent: String,

    /// Model to use for the agent (e.g., 'opus', 'o3', 'gemini-2.0-flash')
    /// Defaults: claude=sonnet, gemini=gemini-2.0-flash, codex=o3
    #[arg(long, short = 'm')]
    model: Option<String>,

    /// Save scratchpad/params after every N episodes (for crash recovery)
    /// Default: 1 (save every episode)
    #[arg(long, default_value = "1")]
    save_interval: usize,

    /// Name for this training run (creates training/runs/<name>/)
    /// If provided, overrides --output, --scratchpad, --plot paths
    #[arg(long)]
    run_name: Option<String>,

    /// Show a rich visualization of a past training run
    /// Usage: --show <run-name> (looks in training/runs/<run-name>/)
    /// Or: --show <path-to-scratchpad.json>
    #[arg(long)]
    show: Option<String>,

    /// Pivot view: show structural insights as primary axis (insight-first, not episode-first)
    #[arg(long)]
    show_insights: Option<String>,

    /// Pivot view: show parameter interactions as primary axis
    #[arg(long)]
    show_interactions: Option<String>,

    /// List all available training runs
    #[arg(long)]
    list: bool,
}

/// Semantic distractors generated by Claude.
/// These are plausible-but-wrong file paths that create a realistic ranking challenge.
///
/// One element of the `cases` array of a [`SemanticDistractorRepo`] in the
/// distractors JSON file.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct SemanticDistractorCase {
    /// Seed file path; `load_distractors` uses it as the lookup key.
    seed_file: String,
    /// Optional in the JSON (defaults to an empty list). Not read by
    /// `load_distractors`; presumably the ground-truth related files — confirm.
    #[serde(default)]
    expected_related: Vec<String>,
    /// Optional in the JSON (defaults to an empty string). Not read by
    /// `load_distractors`.
    #[serde(default)]
    commit_message: String,
    /// Required in the JSON. Distractor paths for `seed_file`; stored as the
    /// lookup value by `load_distractors`.
    semantic_distractors: Vec<String>,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
struct SemanticDistractorRepo {
    repo: String,
    n_cases: usize,
    cases: Vec<SemanticDistractorCase>,
}

/// Lookup table: seed_file -> semantic distractors
///
/// Keys are the bare `seed_file` strings, not qualified by repository, so the
/// same seed path appearing in two repos resolves to a single entry.
type DistractorLookup = std::collections::HashMap<String, Vec<String>>;

/// Loads semantic distractors from the JSON file at `path` into a lookup table.
///
/// The file must contain a JSON array of [`SemanticDistractorRepo`] objects. All
/// cases from all repos are flattened into one map keyed by `seed_file`; when the
/// same seed file occurs more than once, the last occurrence wins.
///
/// Prints the number of loaded entries to stdout.
///
/// # Errors
///
/// Returns an error if the file cannot be opened or its contents do not
/// deserialize into the expected shape.
fn load_distractors(path: &Path) -> anyhow::Result<DistractorLookup> {
    // serde_json's reader-based deserializer issues many small reads, so the
    // file is buffered instead of being handed over raw.
    let reader = std::io::BufReader::new(File::open(path)?);
    let repos: Vec<SemanticDistractorRepo> = serde_json::from_reader(reader)?;

    // Collecting into a HashMap keeps the last value for a duplicated key, which
    // matches inserting the cases one by one in file order.
    let lookup: DistractorLookup = repos
        .into_iter()
        .flat_map(|repo| repo.cases)
        .map(|case| (case.seed_file, case.semantic_distractors))
        .collect();

    println!("Loaded {} semantic distractor entries", lookup.len());
    Ok(lookup)
}

/// Full benchmark results.
///
/// Serialized as pretty-printed JSON to the `--output` path at the end of a
/// classical optimization run.
#[derive(Debug, Serialize, Deserialize)]
struct BenchmarkResults {
    /// All evaluated configurations with scores
    evaluations: Vec<(ParameterPoint, EvalMetrics)>,

    /// Best configuration found
    best_config: ParameterPoint,
    /// NDCG@10 of `best_config` (the metric the best configuration is selected by).
    best_score: f64,

    /// Sensitivity analysis (if run)
    sensitivity: Option<SensitivityAnalysis>,

    /// Metadata
    /// Number of training cases the configurations were evaluated against.
    n_cases: usize,
    /// Number of repositories the cases were extracted from.
    n_repos: usize,
    /// Wall-clock seconds from the start of case extraction until the best
    /// configuration was selected (the sensitivity analysis is not included).
    total_time_secs: f64,
    /// The `--strategy` value exactly as given on the command line, even when
    /// it was unrecognized and the run fell back to Latin hypercube sampling.
    strategy: String,
}

/// Summary info for a training run, used for sorting and display.
///
/// Built by `list_training_runs` from a run directory's `scratchpad.json`; only
/// runs with at least one episode get a `RunInfo`.
struct RunInfo {
    /// Name of the run directory.
    name: String,
    /// Number of episodes recorded in the scratchpad.
    episodes: usize,
    /// `ndcg_before` of the first episode.
    first_ndcg: f64,
    /// `ndcg_before` of the last episode.
    last_ndcg: f64,
    /// `last_ndcg - first_ndcg`.
    delta: f64,
    /// First episode timestamp (for sorting by start date)
    /// Falls back to the scratchpad file's modification time when the episode
    /// timestamp is not positive.
    start_ts: i64,
    /// Last episode timestamp (for end date display)
    /// Uses the same modification-time fallback as `start_ts`.
    end_ts: i64,
}

/// Render a Unix timestamp (seconds) as a short local-time label such as
/// `Jan 05 14:30`.
///
/// A timestamp of `0` is treated as "unknown" and rendered as an em dash, as is
/// any timestamp that does not map to exactly one local time.
fn format_timestamp(ts: i64) -> String {
    const UNKNOWN: &str = "—";

    if ts == 0 {
        return UNKNOWN.to_string();
    }

    // `single()` yields `None` for ambiguous or nonexistent local times.
    let local_time: Option<DateTime<Local>> = Local.timestamp_opt(ts, 0).single();
    match local_time {
        Some(dt) => dt.format("%b %d %H:%M").to_string(),
        None => UNKNOWN.to_string(),
    }
}

/// List all available training runs with summary stats, sorted by date.
fn list_training_runs() -> anyhow::Result<()> {
    let runs_dir = PathBuf::from("training/runs");
    if !runs_dir.exists() {
        println!("No training runs found. Run with --run-name to create one.");
        return Ok(());
    }

    // Collect run info with timestamps
    let mut runs: Vec<RunInfo> = Vec::new();
    let mut empty_runs: Vec<(String, i64)> = Vec::new(); // (name, mtime) for runs with no data

    for entry in std::fs::read_dir(&runs_dir)?.filter_map(|e| e.ok()) {
        if !entry.path().is_dir() {
            continue;
        }
        let name = entry.file_name().to_string_lossy().to_string();
        let scratchpad_path = entry.path().join("scratchpad.json");

        // Fallback: use file modification time if no episode timestamps
        let file_mtime = scratchpad_path
            .metadata()
            .and_then(|m| m.modified())
            .ok()
            .and_then(|t| t.duration_since(std::time::UNIX_EPOCH).ok())
            .map(|d| d.as_secs() as i64)
            .unwrap_or(0);

        if scratchpad_path.exists() {
            if let Ok(file) = File::open(&scratchpad_path) {
                if let Ok(sp) = serde_json::from_reader::<_, Scratchpad>(file) {
                    let eps = sp.episodes.len();
                    if eps > 0 {
                        let first_ndcg = sp.episodes.first().map(|e| e.ndcg_before).unwrap_or(0.0);
                        let last_ndcg = sp.episodes.last().map(|e| e.ndcg_before).unwrap_or(0.0);
                        let delta = last_ndcg - first_ndcg;

                        // Get timestamps from episodes (fall back to file mtime if 0)
                        let start_ts = sp
                            .episodes
                            .first()
                            .map(|e| {
                                if e.timestamp > 0 {
                                    e.timestamp
                                } else {
                                    file_mtime
                                }
                            })
                            .unwrap_or(file_mtime);
                        let end_ts = sp
                            .episodes
                            .last()
                            .map(|e| {
                                if e.timestamp > 0 {
                                    e.timestamp
                                } else {
                                    file_mtime
                                }
                            })
                            .unwrap_or(file_mtime);

                        runs.push(RunInfo {
                            name,
                            episodes: eps,
                            first_ndcg,
                            last_ndcg,
                            delta,
                            start_ts,
                            end_ts,
                        });
                        continue;
                    }
                }
            }
        }
        empty_runs.push((name, file_mtime));
    }

    // Sort by start timestamp (oldest first, so most recent at bottom)
    runs.sort_by_key(|r| r.start_ts);
    empty_runs.sort_by_key(|(_, ts)| *ts);

    // Build table without box drawing for cleaner output
    use owo_colors::OwoColorize;

    println!();
    println!("{}", "TRAINING RUNS".bold());
    println!("{}", "─".repeat(90));
    println!(
        "{:26} {:>4}  {:^17}  {:>8}   {:>24}",
        "NAME", "EPS", "NDCG TRAJECTORY", "DELTA", "STARTED -> LAST"
    );
    println!("{}", "─".repeat(90));

    for run in &runs {
        let (trend, delta_colored) = if run.delta > 0.01 {
            ("+", format!("{:>+8.4}", run.delta).green().to_string())
        } else if run.delta < -0.01 {
            ("-", format!("{:>+8.4}", run.delta).red().to_string())
        } else {
            ("=", format!("{:>+8.4}", run.delta).dimmed().to_string())
        };
        let start_str = format_timestamp(run.start_ts);
        let end_str = format_timestamp(run.end_ts);
        let time_range = format!("{} -> {}", start_str, end_str);

        println!(
            "{:26} {:>4}  {:.3} {} {:.3}  {}   {}",
            run.name,
            run.episodes,
            run.first_ndcg,
            trend,
            run.last_ndcg,
            delta_colored,
            time_range.dimmed()
        );
    }

    // Show empty runs at the bottom
    for (name, mtime) in &empty_runs {
        let ts_str = format_timestamp(*mtime);
        println!(
            "{:26}    -  (no data)                          {:>24}",
            name, ts_str
        );
    }

    println!("{}", "─".repeat(90));
    println!("\nUse --show <run-name> to see detailed visualization.\n");
    Ok(())
}

/// Rich text visualization of a past training run.
/// Shows the full optimization journey: trajectory, strategies, insights.
fn show_training_run(path: &str) -> anyhow::Result<()> {
    use comfy_table::{Cell, ContentArrangement, Table, presets::UTF8_FULL_CONDENSED};
    use owo_colors::OwoColorize;

    // Resolve path: either a run name or direct path to scratchpad
    let scratchpad_path = if path.ends_with(".json") {
        PathBuf::from(path)
    } else {
        PathBuf::from(format!("training/runs/{}/scratchpad.json", path))
    };

    if !scratchpad_path.exists() {
        anyhow::bail!("Scratchpad not found: {}", scratchpad_path.display());
    }

    let file = File::open(&scratchpad_path)?;
    let scratchpad: Scratchpad = serde_json::from_reader(file)?;

    if scratchpad.episodes.is_empty() {
        println!("No episodes found in scratchpad.");
        return Ok(());
    }

    // Header
    let run_name = scratchpad_path
        .parent()
        .and_then(|p| p.file_name())
        .map(|s| s.to_string_lossy().to_string())
        .unwrap_or_else(|| "unknown".to_string());

    println!();
    println!(
        "{}",
        format!(" TRAINING RUN: {} ", run_name).bold().on_blue()
    );
    println!("  Episodes: {}", scratchpad.episodes.len());
    println!();

    // NDCG trajectory sparkline
    let ndcgs: Vec<f64> = scratchpad.episodes.iter().map(|e| e.ndcg_before).collect();
    let min_ndcg = ndcgs.iter().cloned().fold(f64::INFINITY, f64::min);
    let max_ndcg = ndcgs.iter().cloned().fold(f64::NEG_INFINITY, f64::max);
    let range = (max_ndcg - min_ndcg).max(0.001);

    let spark_chars = ['▁', '▂', '▃', '▄', '▅', '▆', '▇', '█'];
    let sparkline: String = ndcgs
        .iter()
        .map(|&n| {
            let normalized = ((n - min_ndcg) / range * 7.0).round() as usize;
            spark_chars[normalized.min(7)]
        })
        .collect();

    let delta = ndcgs.last().unwrap_or(&0.0) - ndcgs.first().unwrap_or(&0.0);
    let delta_str = if delta > 0.0 {
        format!("{:+.4}", delta).green().to_string()
    } else if delta < 0.0 {
        format!("{:+.4}", delta).red().to_string()
    } else {
        format!("{:+.4}", delta).dimmed().to_string()
    };

    println!("{}", "NDCG TRAJECTORY".bold());
    println!(
        "  {:.3} {} {:.3}  Δ = {}",
        min_ndcg,
        sparkline.cyan(),
        max_ndcg,
        delta_str
    );
    println!();

    // Episode-by-episode breakdown as a table
    println!("{}", "EPISODES".bold());

    let mut table = Table::new();
    table.load_preset(UTF8_FULL_CONDENSED);
    table.set_content_arrangement(ContentArrangement::Dynamic);
    table.set_header(vec![
        "#",
        "Trend",
        "NDCG",
        "Fail",
        "Conf",
        "Strategy / Changes / Insight",
    ]);

    for (i, ep) in scratchpad.episodes.iter().enumerate() {
        let ep_num = i + 1;
        let trend = if i == 0 {
            "·".to_string()
        } else {
            let prev = scratchpad.episodes[i - 1].ndcg_before;
            if ep.ndcg_before > prev + 0.005 {
                "↗".green().to_string()
            } else if ep.ndcg_before < prev - 0.005 {
                "↘".red().to_string()
            } else {
                "→".dimmed().to_string()
            }
        };

        // Build the description column with strategy, changes, insight
        let mut desc_parts: Vec<String> = Vec::new();

        // Strategy capsule
        if !ep.strategy_capsule.is_empty() {
            let capsule = if ep.strategy_capsule.len() > 60 {
                format!("⟨{}...⟩", &ep.strategy_capsule[..57])
            } else {
                format!("⟨{}⟩", &ep.strategy_capsule)
            };
            desc_parts.push(capsule);
        }

        // Parameter changes (compact format)
        if !ep.proposed_changes.is_empty() {
            let changes: Vec<String> = ep
                .proposed_changes
                .iter()
                .map(|(k, (dir, mag, _))| {
                    let arrow = if dir == "increase" { "↑" } else { "↓" };
                    let mag_short = mag
                        .chars()
                        .next()
                        .map(|c| c.to_uppercase().to_string())
                        .unwrap_or_default();
                    format!("{}{}{}", k, arrow, mag_short)
                })
                .collect();
            desc_parts.push(changes.join(", "));
        }

        // Key insight from this episode
        if !ep.structural_insights.is_empty() {
            let insight = &ep.structural_insights[0];
            let truncated = if insight.len() > 55 {
                format!("💡 {}...", &insight[..52])
            } else {
                format!("💡 {}", insight)
            };
            desc_parts.push(truncated);
        }

        table.add_row(vec![
            Cell::new(format!("E{:02}", ep_num)),
            Cell::new(&trend),
            Cell::new(format!("{:.4}", ep.ndcg_before)),
            Cell::new(format!("{}", ep.failures.len())),
            Cell::new(format!("{:.2}", ep.confidence)),
            Cell::new(desc_parts.join("\n")),
        ]);
    }

    println!("{table}");
    println!();

    // Final parameters (from last episode) as a compact table
    if let Some(last_ep) = scratchpad.episodes.last() {
        println!("{}", "FINAL PARAMETERS".bold());

        let mut ptable = Table::new();
        ptable.load_preset(UTF8_FULL_CONDENSED);
        ptable.set_content_arrangement(ContentArrangement::Dynamic);
        ptable.set_header(vec!["Category", "Parameters"]);

        let p = &last_ep.params;
        ptable.add_row(vec![
            "PageRank",
            &format!(
                "α={:.3}  chat_mult={:.1}",
                p.pagerank_alpha, p.pagerank_chat_multiplier
            ),
        ]);
        ptable.add_row(vec![
            "Depth",
            &format!(
                "root={:.2}  mod={:.2}  deep={:.2}  vendor={:.4}",
                p.depth_weight_root,
                p.depth_weight_moderate,
                p.depth_weight_deep,
                p.depth_weight_vendor
            ),
        ]);
        ptable.add_row(vec![
            "Boosts",
            &format!(
                "ident={:.1}  file={:.1}  chat={:.1}  temp={:.2}  focus={:.2}",
                p.boost_mentioned_ident,
                p.boost_mentioned_file,
                p.boost_chat_file,
                p.boost_temporal_coupling,
                p.boost_focus_expansion
            ),
        ]);
        ptable.add_row(vec![
            "Git",
            &format!(
                "decay={:.0}d  recency_max={:.1}  churn_th={:.0}  churn_max={:.1}",
                p.git_recency_decay_days,
                p.git_recency_max_boost,
                p.git_churn_threshold,
                p.git_churn_max_boost
            ),
        ]);
        ptable.add_row(vec![
            "Focus",
            &format!(
                "decay={:.2}  max_hops={:.0}",
                p.focus_decay, p.focus_max_hops
            ),
        ]);

        println!("{ptable}");
        println!();
    }

    Ok(())
}

/// Turn a `--show*` argument into the scratchpad file it refers to.
///
/// Anything ending in `.json` is taken as a direct file path and returned
/// unchanged; any other value is treated as a run name and mapped to
/// `training/runs/<name>/scratchpad.json`.
fn resolve_scratchpad_path(path: &str) -> PathBuf {
    let is_direct_file = path.ends_with(".json");
    let resolved = if is_direct_file {
        path.to_string()
    } else {
        format!("training/runs/{}/scratchpad.json", path)
    };
    PathBuf::from(resolved)
}

/// Pivot view: insights as primary axis.
/// Shows each unique insight and which episodes produced it.
///
/// `path` is a run name or a direct path to a scratchpad `.json` file. Insights
/// are grouped by exact text and listed most frequent first; ties are broken by
/// the earliest episode that produced the insight and then by the insight text,
/// so repeated invocations print the same order. At most 30 insights are
/// printed, followed by a count of the remainder.
///
/// # Errors
///
/// Fails if the scratchpad file does not exist, cannot be opened, or cannot be
/// parsed.
fn show_insights_pivot(path: &str) -> anyhow::Result<()> {
    use std::collections::HashMap;

    let scratchpad_path = resolve_scratchpad_path(path);
    if !scratchpad_path.exists() {
        anyhow::bail!("Scratchpad not found: {}", scratchpad_path.display());
    }

    // Buffered because serde_json's reader-based parser reads in tiny steps.
    let reader = std::io::BufReader::new(File::open(&scratchpad_path)?);
    let scratchpad: Scratchpad = serde_json::from_reader(reader)?;

    // Build insight -> [episode numbers] map (1-based, in ascending order)
    let mut insight_episodes: HashMap<String, Vec<usize>> = HashMap::new();
    for (i, ep) in scratchpad.episodes.iter().enumerate() {
        for insight in &ep.structural_insights {
            insight_episodes
                .entry(insight.clone())
                .or_default()
                .push(i + 1);
        }
    }

    // Sort by frequency (most common first). HashMap iteration order is random,
    // so the tie-breakers are what make the listing reproducible.
    let mut sorted: Vec<_> = insight_episodes.iter().collect();
    sorted.sort_by(|a, b| {
        b.1.len()
            .cmp(&a.1.len())
            .then_with(|| a.1.first().cmp(&b.1.first()))
            .then_with(|| a.0.cmp(b.0))
    });

    println!("\nSTRUCTURAL INSIGHTS (pivot view)");
    println!("─────────────────────────────────────────────────────────────────────────────────");
    println!(
        "{} unique insights from {} episodes\n",
        sorted.len(),
        scratchpad.episodes.len()
    );

    for (insight, episodes) in sorted.iter().take(30) {
        // Up to five episode numbers are listed in full; longer lists show the
        // first three and a count of the rest.
        let ep_list = if episodes.len() <= 5 {
            episodes
                .iter()
                .map(|e| e.to_string())
                .collect::<Vec<_>>()
                .join(", ")
        } else {
            format!(
                "{}, ... (+{} more)",
                episodes[..3]
                    .iter()
                    .map(|e| e.to_string())
                    .collect::<Vec<_>>()
                    .join(", "),
                episodes.len() - 3
            )
        };
        println!("[E{}] {}", ep_list, insight);
        println!();
    }

    if sorted.len() > 30 {
        println!("... and {} more insights", sorted.len() - 30);
    }

    Ok(())
}

/// Pivot view: parameter interactions as primary axis.
/// Shows each unique interaction and which episodes discovered it.
///
/// `path` is a run name or a direct path to a scratchpad `.json` file.
/// Interactions are grouped by exact text and listed most frequent first; ties
/// are broken by the earliest episode that reported the interaction and then by
/// the interaction text, so repeated invocations print the same order. At most
/// 30 interactions are printed, followed by a count of the remainder.
///
/// # Errors
///
/// Fails if the scratchpad file does not exist, cannot be opened, or cannot be
/// parsed.
fn show_interactions_pivot(path: &str) -> anyhow::Result<()> {
    use std::collections::HashMap;

    let scratchpad_path = resolve_scratchpad_path(path);
    if !scratchpad_path.exists() {
        anyhow::bail!("Scratchpad not found: {}", scratchpad_path.display());
    }

    // Buffered because serde_json's reader-based parser reads in tiny steps.
    let reader = std::io::BufReader::new(File::open(&scratchpad_path)?);
    let scratchpad: Scratchpad = serde_json::from_reader(reader)?;

    // Build interaction -> [episode numbers] map (1-based, in ascending order)
    let mut interaction_episodes: HashMap<String, Vec<usize>> = HashMap::new();
    for (i, ep) in scratchpad.episodes.iter().enumerate() {
        for interaction in &ep.param_interactions {
            interaction_episodes
                .entry(interaction.clone())
                .or_default()
                .push(i + 1);
        }
    }

    // Sort by frequency (most common first). HashMap iteration order is random,
    // so the tie-breakers are what make the listing reproducible.
    let mut sorted: Vec<_> = interaction_episodes.iter().collect();
    sorted.sort_by(|a, b| {
        b.1.len()
            .cmp(&a.1.len())
            .then_with(|| a.1.first().cmp(&b.1.first()))
            .then_with(|| a.0.cmp(b.0))
    });

    println!("\nPARAMETER INTERACTIONS (pivot view)");
    println!("─────────────────────────────────────────────────────────────────────────────────");
    println!(
        "{} unique interactions from {} episodes\n",
        sorted.len(),
        scratchpad.episodes.len()
    );

    for (interaction, episodes) in sorted.iter().take(30) {
        // Up to five episode numbers are listed in full; longer lists show the
        // first three and a count of the rest.
        let ep_list = if episodes.len() <= 5 {
            episodes
                .iter()
                .map(|e| e.to_string())
                .collect::<Vec<_>>()
                .join(", ")
        } else {
            format!(
                "{}, ... (+{} more)",
                episodes[..3]
                    .iter()
                    .map(|e| e.to_string())
                    .collect::<Vec<_>>()
                    .join(", "),
                episodes.len() - 3
            )
        };
        println!("[E{}] {}", ep_list, interaction);
        println!();
    }

    if sorted.len() > 30 {
        println!("... and {} more interactions", sorted.len() - 30);
    }

    Ok(())
}

fn main() -> anyhow::Result<()> {
    let mut args = Args::parse();

    // Handle --list: show available training runs and exit
    if args.list {
        return list_training_runs();
    }

    // Handle --show: visualize a past training run and exit
    if let Some(ref path) = args.show {
        return show_training_run(path);
    }

    // Handle --show-insights: pivot view with insights as primary axis
    if let Some(ref path) = args.show_insights {
        return show_insights_pivot(path);
    }

    // Handle --show-interactions: pivot view with interactions as primary axis
    if let Some(ref path) = args.show_interactions {
        return show_interactions_pivot(path);
    }

    // Handle --run-name: create run directory and override paths
    if let Some(ref name) = args.run_name {
        let run_dir = PathBuf::from(format!("training/runs/{}", name));
        std::fs::create_dir_all(&run_dir)?;
        std::fs::create_dir_all(run_dir.join("checkpoints"))?;

        args.output = run_dir.join("results.json");
        args.scratchpad = run_dir.join("scratchpad.json");
        if args.plot.is_none() {
            args.plot = Some(run_dir.join("progress.png"));
        }

        println!("Training run: {}", name);
        println!("  Output dir: {}", run_dir.display());
    }

    // Determine repos to use
    let repos: Vec<PathBuf> = if let Some(ref repo) = args.repo {
        vec![repo.clone()]
    } else if let Some(ref corpus) = args.corpus {
        let specs = match corpus.as_str() {
            "quick" => quick_repos(),
            "curated" => CURATED_REPOS.iter().collect(),
            // Future: could load from file path
            other => {
                eprintln!(
                    "Error: Unknown corpus '{}'. Use 'quick' or 'curated'",
                    other
                );
                std::process::exit(1);
            }
        };
        ensure_repos_cloned(&specs, &args.clone_dir)?
    } else {
        eprintln!("Error: Specify --repo or --corpus");
        std::process::exit(1);
    };

    println!("Benchmarking {} repositories", repos.len());

    // Extract training cases from all repos
    let start = Instant::now();
    let mut all_cases: Vec<WeightedCase> = Vec::new();

    for repo_path in &repos {
        println!("\nProcessing {}...", repo_path.display());

        let cases = extract_cases(repo_path, args.max_commits, args.min_files, args.max_files);
        println!("  Extracted {} raw cases", cases.len());

        let coupling = compute_coupling_weights(repo_path, args.max_commits);
        println!("  Computed {} coupling pairs", coupling.len());

        let weighted = weight_cases(cases, &coupling);
        println!("  Weighted cases: {}", weighted.len());

        all_cases.extend(weighted);
    }

    println!("\nTotal training cases: {}", all_cases.len());

    // Load semantic distractors if provided
    let distractors: Option<Arc<DistractorLookup>> = if let Some(ref path) = args.distractors {
        match load_distractors(path) {
            Ok(lookup) => Some(Arc::new(lookup)),
            Err(e) => {
                eprintln!(
                    "Warning: Failed to load distractors: {}. Using synthetic.",
                    e
                );
                None
            }
        }
    } else {
        None
    };

    if args.extract_only {
        // Just save cases and exit
        let file = File::create(&args.output)?;
        serde_json::to_writer_pretty(file, &all_cases)?;
        println!("Saved cases to {}", args.output.display());
        return Ok(());
    }

    if all_cases.is_empty() {
        eprintln!("Error: No training cases extracted. Check repository paths.");
        std::process::exit(1);
    }

    // === REASONING-BASED TRAINING ===
    // Uses Claude as universal function approximator to understand WHY rankings fail
    if args.reason {
        return run_reasoning_training(&args, &all_cases, distractors.as_deref());
    }

    // === DISTILLATION ===
    // Crystallize accumulated insights into operator wisdom
    if args.distill {
        return run_distillation(&args);
    }

    // === CLASSICAL OPTIMIZATION ===
    // Parse search strategy
    let strategy = match args.strategy.as_str() {
        "grid" => SearchStrategy::Grid { points_per_dim: 3 },
        "lhs" => SearchStrategy::LatinHypercube,
        "random" => SearchStrategy::Random,
        "bayesian" => SearchStrategy::Bayesian,
        _ => {
            eprintln!("Unknown strategy: {}. Using LHS.", args.strategy);
            SearchStrategy::LatinHypercube
        }
    };

    // Generate initial parameter points
    let grid = ParameterGrid::default();
    let mut points = sample_points(&grid, strategy, args.budget, args.seed);

    use indicatif::{ProgressBar, ProgressStyle};

    // Shared case data for parallel evaluation
    let cases = Arc::new(all_cases);
    let distractors_ref = distractors.clone();

    // Evaluate all points in parallel with progress bar
    let pb = ProgressBar::new(points.len() as u64);
    pb.set_style(
        ProgressStyle::with_template(
            "{prefix:.bold} {bar:40.cyan/dim} {pos}/{len} [{elapsed}<{eta}] {msg}",
        )
        .unwrap(),
    );
    pb.set_prefix("Evaluating");

    let best_ndcg = std::sync::atomic::AtomicU64::new(0);
    let evaluations: Vec<(ParameterPoint, EvalMetrics)> = points
        .par_iter()
        .map(|point| {
            let metrics = evaluate_point(point, &cases, distractors_ref.as_deref());
            // Track best score seen so far (atomic for thread safety)
            let current = (metrics.ndcg_at_10 * 10000.0) as u64;
            best_ndcg.fetch_max(current, std::sync::atomic::Ordering::Relaxed);
            let best = best_ndcg.load(std::sync::atomic::Ordering::Relaxed) as f64 / 10000.0;
            pb.set_message(format!("best={:.4}", best));
            pb.inc(1);
            (point.clone(), metrics)
        })
        .collect();
    pb.finish_with_message(format!(
        "best={:.4}",
        best_ndcg.load(std::sync::atomic::Ordering::Relaxed) as f64 / 10000.0
    ));

    // For Bayesian strategy, do iterative refinement
    let mut evaluations = evaluations; // make mutable for Bayesian
    if matches!(strategy, SearchStrategy::Bayesian) && evaluations.len() < args.budget {
        let remaining = args.budget - evaluations.len();
        let pb = ProgressBar::new(remaining as u64);
        pb.set_style(
            ProgressStyle::with_template(
                "{prefix:.bold} {bar:40.green/dim} {pos}/{len} [{elapsed}<{eta}] {msg}",
            )
            .unwrap(),
        );
        pb.set_prefix("Bayesian");

        let mut rng = rand::rngs::StdRng::seed_from_u64(args.seed);
        let history: Vec<_> = evaluations
            .iter()
            .map(|(p, m)| (p.clone(), m.ndcg_at_10))
            .collect();

        let mut best_so_far = evaluations
            .iter()
            .map(|(_, m)| m.ndcg_at_10)
            .fold(0.0_f64, f64::max);

        for _ in evaluations.len()..args.budget {
            let next = bayesian_next_sample(&grid, &history, &mut rng);
            let metrics = evaluate_point(&next, &cases, distractors.as_deref());
            if metrics.ndcg_at_10 > best_so_far {
                best_so_far = metrics.ndcg_at_10;
            }
            pb.set_message(format!("best={:.4}", best_so_far));
            pb.inc(1);
            evaluations.push((next, metrics));
        }
        pb.finish_with_message(format!("best={:.4}", best_so_far));
    }

    // Find best configuration
    let (best_config, best_metrics) = evaluations
        .iter()
        .max_by(|a, b| a.1.ndcg_at_10.partial_cmp(&b.1.ndcg_at_10).unwrap())
        .cloned()
        .expect("No evaluations");

    let elapsed = start.elapsed().as_secs_f64();

    println!("\n=== Results ===\n");
    println!("Best NDCG@10: {:.4}", best_metrics.ndcg_at_10);
    println!("Best NDCG@5:  {:.4}", best_metrics.ndcg_at_5);
    println!("Best MRR:     {:.4}", best_metrics.mrr);
    println!("Best P@10:    {:.4}", best_metrics.precision_at_10);
    println!("\nTotal time: {:.1}s", elapsed);

    // Print best config
    println!("\n=== Best Configuration ===\n");
    print_config(&best_config);

    // Sensitivity analysis if requested
    let sensitivity = if args.sensitivity {
        println!("\n=== Running Sensitivity Analysis ===\n");
        let distractors_ref = distractors.as_deref();
        let evaluator = |p: &ParameterPoint| evaluate_point(p, &cases, distractors_ref).ndcg_at_10;
        let analysis = full_analysis(&best_config, evaluator);
        print_summary(&analysis);
        Some(analysis)
    } else {
        None
    };

    // Save results
    let results = BenchmarkResults {
        evaluations: evaluations.clone(),
        best_config: best_config.clone(),
        best_score: best_metrics.ndcg_at_10,
        sensitivity,
        n_cases: cases.len(),
        n_repos: repos.len(),
        total_time_secs: elapsed,
        strategy: args.strategy,
    };

    let file = File::create(&args.output)?;
    serde_json::to_writer_pretty(file, &results)?;
    println!("\nResults saved to {}", args.output.display());

    // Also save just the best config for easy loading
    let config_path = args.output.with_extension("best.json");
    let config_file = File::create(&config_path)?;
    serde_json::to_writer_pretty(config_file, &best_config)?;
    println!("Best config saved to {}", config_path.display());

    Ok(())
}

/// Score one parameter point over the whole training set.
///
/// Every case is ranked with [`simulate_ranking`], measured against its
/// `expected_related` ground truth via `CaseMetrics::compute`, and paired with
/// the case's `case_weight`. The pairs are then folded into a single
/// [`EvalMetrics`] by `EvalMetrics::aggregate_weighted`.
fn evaluate_point(
    point: &ParameterPoint,
    cases: &[WeightedCase],
    distractors: Option<&DistractorLookup>,
) -> EvalMetrics {
    let mut weighted = Vec::with_capacity(cases.len());

    for case in cases {
        // The ranking is simulated rather than produced by the real pipeline.
        let ranking = simulate_ranking(point, case, distractors);

        // Owned copy of the ground truth handed to the metric computation.
        let ground_truth: Vec<_> = case.expected_related.clone();
        let metrics = CaseMetrics::compute(&ranking, &ground_truth, 0.1);

        // Keep the case weight next to its metrics for the weighted aggregate.
        weighted.push((metrics, case.case_weight));
    }

    EvalMetrics::aggregate_weighted(&weighted)
}

/// Produce a simulated ranking for one case under the given parameters.
///
/// The candidate pool mixes the case's ground-truth files with
/// **distractors**: files that are not in the ground truth and are scored
/// with a coupling weight of zero. A good parameter point should still place
/// the ground-truth files ahead of them.
///
/// Distractor paths come from one of two places:
/// 1. `distractors`, looked up by the case's seed file, when a lookup is
///    supplied and has an entry for that seed.
/// 2. [`generate_synthetic_distractors`] otherwise (simple generated paths
///    such as "src/distractor_0.rs").
///
/// Returns the candidate paths ordered from highest to lowest score.
fn simulate_ranking(
    point: &ParameterPoint,
    case: &WeightedCase,
    distractors: Option<&DistractorLookup>,
) -> Vec<String> {
    use rand::Rng;
    let mut rng = rand::thread_rng();

    // Resolve the distractor paths up front; this step draws no random numbers.
    let truth_count = case.expected_related.len();
    let noise_paths: Vec<String> = distractors
        .and_then(|lookup| lookup.get(&case.seed_file))
        .cloned()
        .unwrap_or_else(|| generate_synthetic_distractors(truth_count));

    let mut candidates: Vec<(String, f64)> = Vec::with_capacity(truth_count + noise_paths.len());

    // Ground-truth files are scored first, carrying their coupling signal.
    for (file, coupling_weight) in &case.expected_related {
        let score = score_file(point, file, *coupling_weight, &mut rng);
        candidates.push((file.clone(), score));
    }

    // Distractors follow, all with zero coupling.
    for path in noise_paths {
        let score = score_file(point, &path, 0.0, &mut rng);
        candidates.push((path, score));
    }

    // Highest score first; incomparable scores are treated as equal.
    candidates.sort_by(|lhs, rhs| {
        rhs.1
            .partial_cmp(&lhs.1)
            .unwrap_or(std::cmp::Ordering::Equal)
    });

    candidates.into_iter().map(|(path, _)| path).collect()
}

/// Build placeholder distractor paths for cases without semantic distractors.
///
/// Produces three paths per ground-truth file, with a floor of ten paths.
/// Paths cycle through five templates so that they sit at different
/// directory depths (one, two, or three `/` separators).
fn generate_synthetic_distractors(n_ground_truth: usize) -> Vec<String> {
    let total = std::cmp::max(n_ground_truth * 3, 10);
    let mut paths = Vec::with_capacity(total);

    for idx in 0..total {
        let path = match idx % 5 {
            // one separator
            0 => format!("src/distractor_{}.rs", idx),
            // two separators
            1 => format!("src/utils/helper_{}.rs", idx),
            // three separators
            2 => format!("src/core/internal/deep_{}.rs", idx),
            // one separator
            3 => format!("lib/mod_{}.rs", idx),
            // one separator
            _ => format!("tests/test_{}.rs", idx),
        };
        paths.push(path);
    }

    paths
}

/// Compute the simulated score of one candidate file.
///
/// Ingredients, applied in this order:
/// - a base score: `coupling_weight` scaled by the depth multiplier when the
///   weight is positive, otherwise a flat `0.05` scaled by the depth multiplier;
/// - a randomly applied recency boost driven by `git_recency_max_boost`;
/// - a randomly applied churn boost driven by `git_churn_max_boost`;
/// - multiplicative noise in the range 0.85 to 1.15.
///
/// Files with a positive coupling weight are treated as ground truth and are
/// given higher odds of receiving the recency and churn boosts than files
/// with zero coupling (distractors).
fn score_file(
    point: &ParameterPoint,
    file: &str,
    coupling_weight: f64,
    rng: &mut impl rand::Rng,
) -> f64 {
    // A positive coupling weight is what marks a file as carrying signal.
    let has_signal = coupling_weight > 0.0;

    // Depth is measured as the number of '/' separators in the path.
    let depth_mult = match file.matches('/').count() {
        0..=2 => point.depth_weight_root,
        3..=4 => point.depth_weight_moderate,
        _ => point.depth_weight_deep,
    };

    // Signal files scale their coupling by depth; the rest only get a small
    // depth-derived base score, so depth alone can still lift a distractor.
    let mut score = if has_signal {
        coupling_weight * depth_mult
    } else {
        0.05 * depth_mult
    };

    // Recency: 70% of signal files and 20% of the rest count as "recent".
    let recent_probability = if has_signal { 0.7 } else { 0.2 };
    if rng.r#gen::<f64>() < recent_probability {
        // A recent file receives between 70% and 100% of the maximum boost.
        let boost_fraction = 1.0 - rng.r#gen::<f64>() * 0.3;
        score *= 1.0 + (point.git_recency_max_boost - 1.0) * boost_fraction;
    }

    // Churn: 40% of signal files and 10% of the rest count as "high churn",
    // which grants half of the maximum churn boost.
    let churn_probability = if has_signal { 0.4 } else { 0.1 };
    if rng.r#gen::<f64>() < churn_probability {
        score *= 1.0 + (point.git_churn_max_boost - 1.0) * 0.5;
    }

    // Final multiplicative noise of roughly plus or minus 15%.
    score * (0.85 + 0.3 * rng.r#gen::<f64>())
}

/// Ensure curated repos are cloned locally.
fn ensure_repos_cloned(specs: &[&RepoSpec], base_dir: &Path) -> anyhow::Result<Vec<PathBuf>> {
    use indicatif::{ProgressBar, ProgressStyle};

    std::fs::create_dir_all(base_dir)?;

    let mut paths = Vec::new();

    for spec in specs {
        let repo_dir = base_dir.join(&spec.name);

        if !repo_dir.exists() {
            let spinner = ProgressBar::new_spinner();
            spinner.set_style(
                ProgressStyle::with_template("{spinner:.green} {msg}")
                    .unwrap()
                    .tick_strings(&["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"]),
            );
            spinner.set_message(format!("Cloning {}...", spec.name));
            spinner.enable_steady_tick(std::time::Duration::from_millis(80));

            let status = std::process::Command::new("git")
                .args([
                    "clone",
                    "--depth",
                    &spec.estimated_commits().to_string(),
                    &spec.url,
                    repo_dir.to_str().unwrap(),
                ])
                .stdout(std::process::Stdio::null())
                .stderr(std::process::Stdio::null())
                .status()?;

            if !status.success() {
                spinner.finish_with_message(format!("✗ Failed to clone {}", spec.name));
                continue;
            }
            spinner.finish_with_message(format!("✓ Cloned {}", spec.name));
        }

        paths.push(repo_dir);
    }

    Ok(paths)
}

/// Print a parameter configuration in a readable format.
fn print_config(point: &ParameterPoint) {
    println!("PageRank:");
    println!("  alpha:           {:.2}", point.pagerank_alpha);
    println!("  chat_multiplier: {:.1}", point.pagerank_chat_multiplier);
    println!();
    println!("Depth Weights:");
    println!("  root:     {:.2}", point.depth_weight_root);
    println!("  moderate: {:.2}", point.depth_weight_moderate);
    println!("  deep:     {:.2}", point.depth_weight_deep);
    println!("  vendor:   {:.4}", point.depth_weight_vendor);
    println!();
    println!("Boosts:");
    println!("  mentioned_ident: {:.1}", point.boost_mentioned_ident);
    println!("  mentioned_file:  {:.1}", point.boost_mentioned_file);
    println!("  chat_file:       {:.1}", point.boost_chat_file);
    println!("  temporal:        {:.1}", point.boost_temporal_coupling);
    println!("  focus_expand:    {:.1}", point.boost_focus_expansion);
    println!();
    println!("Git:");
    println!("  recency_decay_days: {:.1}", point.git_recency_decay_days);
    println!("  recency_max_boost:  {:.1}", point.git_recency_max_boost);
    println!("  churn_threshold:    {:.0}", point.git_churn_threshold);
    println!("  churn_max_boost:    {:.1}", point.git_churn_max_boost);
    println!();
    println!("Focus Expansion:");
    println!("  decay:    {:.2}", point.focus_decay);
    println!("  max_hops: {:.0}", point.focus_max_hops);
}

// =============================================================================
// REASONING-BASED TRAINING
// =============================================================================

/// Run reasoning-based training loop.
///
/// This is where Claude acts as a universal function approximator:
/// - Evaluate current parameters, collect failures
/// - Ask Claude to reason about WHY failures occurred
/// - Apply proposed changes, update scratchpad
/// - Repeat for N episodes
fn run_reasoning_training(
    args: &Args,
    cases: &[WeightedCase],
    distractors: Option<&DistractorLookup>,
) -> anyhow::Result<()> {
    use ripmap::training::LiveProgress;

    // Load prompt template (required)
    let prompt_path = args.prompt.as_ref()
        .ok_or_else(|| anyhow::anyhow!("--prompt is required for reasoning mode. Example: --prompt training-outer/prompts/inner/v001.md"))?;
    let prompt_template = std::fs::read_to_string(prompt_path).map_err(|e| {
        anyhow::anyhow!(
            "Failed to read prompt template '{}': {}",
            prompt_path.display(),
            e
        )
    })?;

    // Parse agent type
    let agent: Agent = args.agent.parse().map_err(|e: String| anyhow::anyhow!(e))?;
    let model = args.model.as_deref();

    println!("\n=== REASONING-BASED TRAINING ===");
    println!("Prompt: {}", prompt_path.display());
    println!("Using {} as universal function approximator\n", agent);

    // Load or create scratchpad
    let mut scratchpad = if args.scratchpad.exists() {
        let file = File::open(&args.scratchpad)?;
        serde_json::from_reader(file).unwrap_or_default()
    } else {
        Scratchpad::default()
    };

    // Start with default or loaded parameters
    let mut current_params = if let Some(ref config_path) = args.config {
        let file = File::open(config_path)?;
        serde_json::from_reader(file)?
    } else {
        ParameterPoint::default()
    };

    println!("Starting parameters:");
    print_config(&current_params);
    println!("\nRunning {} reasoning episodes...\n", args.episodes);

    // Live progress visualization for terminal sparklines
    let mut progress = LiveProgress::new();

    for episode_num in 0..args.episodes {
        // Evaluate current parameters
        let metrics = evaluate_point(&current_params, cases, distractors);

        // Collect failures (cases where NDCG < threshold)
        let failures =
            collect_failures(&current_params, cases, distractors, args.failure_threshold);

        if failures.is_empty() {
            println!(
                "No failures below threshold {:.2}. Training converged!",
                args.failure_threshold
            );
            break;
        }

        // Ask LLM to reason about failures with spinner
        use indicatif::{ProgressBar, ProgressStyle};
        let spinner = ProgressBar::new_spinner();
        spinner.set_style(
            ProgressStyle::with_template("{prefix:.bold} {spinner:.cyan} {msg}")
                .unwrap()
                .tick_strings(&["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"]),
        );
        spinner.set_prefix(format!("E{:02}", episode_num + 1));
        spinner.set_message(format!(
            "{} reasoning ({} failures)...",
            agent,
            failures.len()
        ));
        spinner.enable_steady_tick(std::time::Duration::from_millis(80));

        // Phase 1 agentic mode: pass run directory for file access
        // Extract directory from scratchpad path (e.g., "training/runs/my-run/scratchpad.json" -> "training/runs/my-run")
        let run_dir = args
            .scratchpad
            .parent()
            .and_then(|p| p.to_str())
            .map(|s| s.to_string());

        match reason_about_failures(
            &prompt_template,
            &failures,
            &current_params,
            &scratchpad,
            metrics.ndcg_at_10,
            agent,
            model,
            run_dir.as_deref(),
        ) {
            Ok(episode) => {
                spinner.finish_and_clear();

                // Record metrics for live progress visualization
                progress.record(
                    metrics.ndcg_at_10,
                    failures.len(),
                    episode.confidence,
                    current_params.pagerank_alpha,
                );

                // Display live sparklines during training
                progress.display(episode_num + 1, args.episodes);

                println!(); // Newline after sparkline for subsequent output
                println!(
                    "Confidence: {:.2}  ⏱ {:.1}s",
                    episode.confidence, episode.duration_secs
                );
                println!("Proposed {} changes:", episode.proposed_changes.len());
                for (param, (dir, mag, rationale)) in &episode.proposed_changes {
                    println!("  {} {} {} - \"{}\"", param, dir, mag, rationale);
                }

                if !episode.structural_insights.is_empty() {
                    println!("\nStructural insights:");
                    for insight in &episode.structural_insights {
                        println!("  • {}", insight);
                    }
                }

                // Apply changes if confidence is high enough
                if episode.confidence >= 0.3 {
                    current_params = apply_changes(&current_params, &episode.proposed_changes);
                    println!("\nApplied changes. New params:");
                    print_config(&current_params);
                } else {
                    println!(
                        "\nConfidence too low ({:.2}), skipping changes",
                        episode.confidence
                    );
                }

                // Update scratchpad
                update_scratchpad(&mut scratchpad, &episode);
            }
            Err(e) => {
                spinner.finish_with_message("failed");
                eprintln!("Warning: Reasoning failed: {}", e);
                // Still record progress even on failure
                progress.record(
                    metrics.ndcg_at_10,
                    failures.len(),
                    0.0,
                    current_params.pagerank_alpha,
                );
                progress.display(episode_num + 1, args.episodes);
                println!();
            }
        }

        // Incremental save for crash recovery (every save_interval episodes)
        if (episode_num + 1) % args.save_interval == 0 {
            // Save scratchpad
            std::fs::create_dir_all(args.scratchpad.parent().unwrap_or(Path::new(".")))?;
            let scratchpad_file = File::create(&args.scratchpad)?;
            serde_json::to_writer_pretty(scratchpad_file, &scratchpad)?;

            // Save current params to checkpoints dir if using run_name, otherwise alongside output
            let checkpoint_path = if args.run_name.is_some() {
                args.output
                    .parent()
                    .unwrap()
                    .join("checkpoints")
                    .join(format!("ep{:03}.json", episode_num + 1))
            } else {
                args.output
                    .with_extension(format!("ep{}.json", episode_num + 1))
            };
            let checkpoint_file = File::create(&checkpoint_path)?;
            serde_json::to_writer_pretty(checkpoint_file, &current_params)?;

            // Generate interim chart if plotters enabled
            #[cfg(feature = "plotters")]
            if let Some(plot_path) = &args.plot {
                use ripmap::training::plots::plot_training_progress;
                let _ = plot_training_progress(
                    &scratchpad,
                    plot_path.to_str().unwrap_or("training.png"),
                );
            }

            println!("  [checkpoint saved at episode {}]", episode_num + 1);
        }

        println!();
    }

    // Final evaluation
    let final_metrics = evaluate_point(&current_params, cases, distractors);

    // Display final training summary with full sparklines
    progress.final_summary();

    println!("Final NDCG@10: {:.4}", final_metrics.ndcg_at_10);
    println!("Final MRR:     {:.4}", final_metrics.mrr);

    // Timing statistics
    let durations: Vec<f64> = scratchpad
        .episodes
        .iter()
        .map(|e| e.duration_secs)
        .filter(|&d| d > 0.0) // Filter out old episodes without timing
        .collect();
    if !durations.is_empty() {
        let total_agent_time: f64 = durations.iter().sum();
        let avg_time = total_agent_time / durations.len() as f64;
        let min_time = durations.iter().cloned().fold(f64::INFINITY, f64::min);
        let max_time = durations.iter().cloned().fold(0.0, f64::max);
        println!(
            "\n⏱ Agent timing ({} episodes with timing data):",
            durations.len()
        );
        println!(
            "  Total: {:.1}s ({:.1}m)  Avg: {:.1}s  Min: {:.1}s  Max: {:.1}s",
            total_agent_time,
            total_agent_time / 60.0,
            avg_time,
            min_time,
            max_time
        );
    }

    // Save scratchpad
    std::fs::create_dir_all(args.scratchpad.parent().unwrap_or(Path::new(".")))?;
    let scratchpad_file = File::create(&args.scratchpad)?;
    serde_json::to_writer_pretty(scratchpad_file, &scratchpad)?;
    println!("\nScratchpad saved to {}", args.scratchpad.display());

    // Print scratchpad summary
    print_scratchpad_summary(&scratchpad);

    // Save final params
    let config_path = args.output.with_extension("trained.json");
    let config_file = File::create(&config_path)?;
    serde_json::to_writer_pretty(config_file, &current_params)?;
    println!("\nTrained config saved to {}", config_path.display());

    // Generate training progress chart if requested
    if let Some(plot_path) = &args.plot {
        #[cfg(feature = "plotters")]
        {
            use ripmap::training::plots::plot_training_progress;
            match plot_training_progress(&scratchpad, plot_path.to_str().unwrap_or("training.png"))
            {
                Ok(()) => println!("Training chart saved to {}", plot_path.display()),
                Err(e) => eprintln!("Failed to generate chart: {}", e),
            }
        }
        #[cfg(not(feature = "plotters"))]
        {
            eprintln!("Chart generation requires: cargo build --features plotters");
            let _ = plot_path; // suppress unused warning
        }
    }

    Ok(())
}

/// Gather the cases the current parameters rank poorly, for the reasoning step.
///
/// Only the first 50 cases are examined. Each one is ranked with
/// [`simulate_ranking`]; when its NDCG@10 is below `threshold`, a
/// [`RankingFailure`] is recorded holding the top five expected and top five
/// actual files. Collection stops as soon as 10 failures have been found.
fn collect_failures(
    params: &ParameterPoint,
    cases: &[WeightedCase],
    distractors: Option<&DistractorLookup>,
    threshold: f64,
) -> Vec<RankingFailure> {
    cases
        .iter()
        // Look at no more than 50 cases.
        .take(50)
        .filter_map(|case| {
            let ranking = simulate_ranking(params, case, distractors);

            // Weighted (file, weight) ground truth for the metric computation.
            let ground_truth: Vec<_> = case.expected_related.clone();
            let ndcg = CaseMetrics::compute(&ranking, &ground_truth, 0.1).ndcg_at_10;

            (ndcg < threshold).then(|| RankingFailure {
                query: case.seed_file.clone(), // the seed doubles as the query
                seed_file: case.seed_file.clone(),
                // File names only; the weights are not reported.
                expected_top: case
                    .expected_related
                    .iter()
                    .take(5)
                    .map(|(file, _)| file.clone())
                    .collect(),
                actual_top: ranking.into_iter().take(5).collect(),
                ndcg,
                commit_context: format!("intent: {:?}", case.inferred_intent),
                repo_name: "curated".to_string(),
                repo_file_count: 100, // placeholder value
            })
        })
        // Report at most 10 failures per episode; the iterator is lazy, so no
        // further cases are ranked once the cap is reached.
        .take(10)
        .collect()
}

/// Run distillation to crystallize scratchpad into wisdom.
///
/// Loads the scratchpad from `args.scratchpad`, prints a summary of it, and
/// asks the configured agent to distill it. On success the result is printed
/// and written next to `args.output` with the extension `wisdom.json`.
///
/// If the scratchpad file does not exist, an error is printed and the process
/// exits with status 1. A failed distillation call is reported on stderr and
/// the function still returns `Ok(())`.
///
/// # Errors
///
/// Fails when the agent name cannot be parsed, when the scratchpad cannot be
/// opened or parsed, or when the wisdom file cannot be written.
fn run_distillation(args: &Args) -> anyhow::Result<()> {
    // Parse agent type
    let agent: Agent = args.agent.parse().map_err(|e: String| anyhow::anyhow!(e))?;
    let model = args.model.as_deref();

    println!("\n=== DISTILLING SCRATCHPAD ===\n");

    if !args.scratchpad.exists() {
        eprintln!(
            "Error: Scratchpad not found at {}",
            args.scratchpad.display()
        );
        std::process::exit(1);
    }

    let scratchpad_file = File::open(&args.scratchpad)?;
    let scratchpad: Scratchpad = serde_json::from_reader(scratchpad_file)?;

    println!(
        "Loaded {} episodes from scratchpad",
        scratchpad.episodes.len()
    );
    print_scratchpad_summary(&scratchpad);

    // Phase 1 agentic mode: pass run directory for file access
    let run_dir = args
        .scratchpad
        .parent()
        .and_then(|p| p.to_str())
        .map(|s| s.to_string());

    println!("\nCalling {} for distillation...", agent);
    match distill_scratchpad(&scratchpad, agent, model, run_dir.as_deref()) {
        Ok(wisdom) => {
            println!("\n=== DISTILLED WISDOM ===\n");
            println!("{}", wisdom);

            // Save wisdom. `fs::write` creates or truncates the file itself,
            // so no separate handle is opened beforehand.
            let wisdom_path = args.output.with_extension("wisdom.json");
            std::fs::write(&wisdom_path, &wisdom)?;
            println!("\nWisdom saved to {}", wisdom_path.display());
        }
        Err(e) => {
            eprintln!("Error: Distillation failed: {}", e);
        }
    }

    Ok(())
}