Skip to main content

batty_cli/team/
autoresearch.rs

1//! Autonomous evaluator-driven research missions.
2
3use std::fs::{self, OpenOptions};
4use std::io::{BufRead, BufReader, Write};
5use std::path::{Path, PathBuf};
6use std::time::Instant;
7
8use anyhow::{Context, Result, bail};
9use serde::{Deserialize, Serialize};
10
11use super::parity::ParityReport;
12
/// Directory under `.batty/` that holds all research mission state.
const RESEARCH_DIR: &str = "research";
/// File inside the research directory that names the currently active mission.
const CURRENT_MISSION_FILE: &str = "current.json";
/// Per-mission file holding the serialized `ResearchMission`.
const MISSION_STATE_FILE: &str = "mission.json";
/// Per-mission ledger file, one JSON-encoded `LedgerEntry` per line.
const LEDGER_FILE: &str = "ledger.jsonl";
17
18#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
19#[serde(rename_all = "snake_case")]
20pub enum EvaluatorFormat {
21    Json,
22    ExitCode,
23}
24
25#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
26#[serde(rename_all = "snake_case")]
27pub enum KeepPolicy {
28    PassOnly,
29    ScoreImprovement,
30    ParityImprovement,
31}
32
33#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
34pub struct EvaluationResult {
35    pub pass: bool,
36    pub score: Option<f64>,
37    pub parity_pct: Option<u32>,
38    pub exit_code: i32,
39    pub stdout: String,
40    pub stderr: String,
41    pub duration_secs: f64,
42}
43
44#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
45#[serde(rename_all = "snake_case")]
46pub enum ResearchDecision {
47    Baseline,
48    Keep,
49    Discard,
50    Error,
51}
52
53#[derive(Debug, Clone, Serialize, Deserialize)]
54pub struct ResearchMission {
55    pub id: String,
56    pub hypothesis: String,
57    pub evaluator_command: String,
58    pub evaluator_format: EvaluatorFormat,
59    pub keep_policy: KeepPolicy,
60    pub max_iterations: u32,
61    pub worktree_dir: PathBuf,
62    pub baseline: Option<EvaluationResult>,
63}
64
65#[derive(Debug, Clone, Serialize, Deserialize)]
66pub struct LedgerEntry {
67    pub iteration: u32,
68    pub decision: ResearchDecision,
69    pub evaluation: EvaluationResult,
70    pub commit: String,
71    pub timestamp: chrono::DateTime<chrono::Utc>,
72}
73
74#[derive(Debug, Clone)]
75pub struct ResearchLedger {
76    pub entries: Vec<LedgerEntry>,
77    pub path: PathBuf,
78}
79
80#[derive(Debug, Clone, Serialize, Deserialize)]
81struct MissionPointer {
82    mission_id: String,
83}
84
85#[derive(Debug, Clone)]
86pub struct ResearchStatus {
87    pub mission: ResearchMission,
88    pub latest_entry: Option<LedgerEntry>,
89}
90
91impl ResearchLedger {
92    pub fn load(path: PathBuf) -> Result<Self> {
93        if !path.exists() {
94            return Ok(Self {
95                entries: Vec::new(),
96                path,
97            });
98        }
99
100        let file =
101            fs::File::open(&path).with_context(|| format!("failed to open {}", path.display()))?;
102        let reader = BufReader::new(file);
103        let mut entries = Vec::new();
104        for line in reader.lines() {
105            let line = line.with_context(|| format!("failed to read {}", path.display()))?;
106            let trimmed = line.trim();
107            if trimmed.is_empty() {
108                continue;
109            }
110            entries.push(
111                serde_json::from_str(trimmed)
112                    .with_context(|| format!("failed to parse {}", path.display()))?,
113            );
114        }
115        Ok(Self { entries, path })
116    }
117
118    pub fn record(&mut self, entry: LedgerEntry) -> Result<()> {
119        if let Some(parent) = self.path.parent() {
120            fs::create_dir_all(parent)
121                .with_context(|| format!("failed to create {}", parent.display()))?;
122        }
123        let mut file = OpenOptions::new()
124            .create(true)
125            .append(true)
126            .open(&self.path)
127            .with_context(|| format!("failed to open {}", self.path.display()))?;
128        serde_json::to_writer(&mut file, &entry)
129            .with_context(|| format!("failed to write {}", self.path.display()))?;
130        writeln!(file).with_context(|| format!("failed to write {}", self.path.display()))?;
131        self.entries.push(entry);
132        Ok(())
133    }
134
135    pub fn len(&self) -> usize {
136        self.entries.len()
137    }
138
139    pub fn is_empty(&self) -> bool {
140        self.entries.is_empty()
141    }
142
143    pub fn last_kept_commit(&self) -> Option<&str> {
144        self.entries
145            .iter()
146            .rev()
147            .find(|entry| {
148                matches!(
149                    entry.decision,
150                    ResearchDecision::Baseline | ResearchDecision::Keep
151                )
152            })
153            .map(|entry| entry.commit.as_str())
154    }
155
156    pub fn latest(&self) -> Option<&LedgerEntry> {
157        self.entries.last()
158    }
159}
160
161#[derive(Debug, Clone)]
162pub struct StartResearchOptions {
163    pub hypothesis: String,
164    pub evaluator_command: String,
165    pub evaluator_format: EvaluatorFormat,
166    pub keep_policy: KeepPolicy,
167    pub max_iterations: u32,
168    pub worktree_dir: PathBuf,
169}
170
171pub fn start_research(
172    project_root: &Path,
173    options: StartResearchOptions,
174) -> Result<ResearchMission> {
175    if !options.worktree_dir.exists() {
176        bail!(
177            "research worktree does not exist: {}",
178            options.worktree_dir.display()
179        );
180    }
181
182    let mission_id = mission_id(&options.hypothesis);
183    let mission_dir = mission_dir(project_root, &mission_id);
184    fs::create_dir_all(&mission_dir)
185        .with_context(|| format!("failed to create {}", mission_dir.display()))?;
186
187    let mut mission = ResearchMission {
188        id: mission_id.clone(),
189        hypothesis: options.hypothesis,
190        evaluator_command: options.evaluator_command,
191        evaluator_format: options.evaluator_format,
192        keep_policy: options.keep_policy,
193        max_iterations: options.max_iterations.max(1),
194        worktree_dir: options.worktree_dir,
195        baseline: None,
196    };
197
198    let baseline = run_evaluator(
199        &mission.evaluator_command,
200        &mission.worktree_dir,
201        &mission.evaluator_format,
202    )?;
203    mission.baseline = Some(baseline.clone());
204    let mut ledger = ResearchLedger::load(ledger_path(project_root, &mission.id))?;
205    ledger.record(LedgerEntry {
206        iteration: 0,
207        decision: ResearchDecision::Baseline,
208        evaluation: baseline,
209        commit: git_head(&mission.worktree_dir)?,
210        timestamp: chrono::Utc::now(),
211    })?;
212    save_mission(project_root, &mission)?;
213    set_current_mission(project_root, &mission.id)?;
214    Ok(mission)
215}
216
217pub fn run_research_iteration(
218    mission: &mut ResearchMission,
219    ledger: &mut ResearchLedger,
220) -> Result<ResearchDecision> {
221    let result = run_evaluator(
222        &mission.evaluator_command,
223        &mission.worktree_dir,
224        &mission.evaluator_format,
225    )?;
226
227    let decision = match mission.keep_policy {
228        KeepPolicy::PassOnly => {
229            if result.pass {
230                ResearchDecision::Keep
231            } else {
232                ResearchDecision::Discard
233            }
234        }
235        KeepPolicy::ScoreImprovement => {
236            let baseline_score = mission
237                .baseline
238                .as_ref()
239                .and_then(|baseline| baseline.score)
240                .unwrap_or(0.0);
241            if result.score.unwrap_or(0.0) > baseline_score {
242                ResearchDecision::Keep
243            } else {
244                ResearchDecision::Discard
245            }
246        }
247        KeepPolicy::ParityImprovement => {
248            let baseline_parity = mission
249                .baseline
250                .as_ref()
251                .and_then(|baseline| baseline.parity_pct)
252                .unwrap_or(0);
253            if result.parity_pct.unwrap_or(0) > baseline_parity {
254                ResearchDecision::Keep
255            } else {
256                ResearchDecision::Discard
257            }
258        }
259    };
260
261    match decision {
262        ResearchDecision::Keep => {
263            git_commit_all(
264                &mission.worktree_dir,
265                &format!("research: iteration {}", ledger.len()),
266            )?;
267            mission.baseline = Some(result.clone());
268        }
269        ResearchDecision::Discard => {
270            let Some(commit) = ledger.last_kept_commit() else {
271                bail!("cannot discard without a kept or baseline commit");
272            };
273            git_reset_hard(&mission.worktree_dir, commit)?;
274        }
275        ResearchDecision::Baseline | ResearchDecision::Error => {}
276    }
277
278    let commit = git_head(&mission.worktree_dir)?;
279    ledger.record(LedgerEntry {
280        iteration: ledger.len() as u32,
281        decision: decision.clone(),
282        evaluation: result,
283        commit,
284        timestamp: chrono::Utc::now(),
285    })?;
286
287    Ok(decision)
288}
289
290pub fn current_status(project_root: &Path) -> Result<Option<ResearchStatus>> {
291    let Some(mission) = load_current_mission(project_root)? else {
292        return Ok(None);
293    };
294    let ledger = ResearchLedger::load(ledger_path(project_root, &mission.id))?;
295    Ok(Some(ResearchStatus {
296        mission,
297        latest_entry: ledger.latest().cloned(),
298    }))
299}
300
301pub fn read_current_ledger(project_root: &Path) -> Result<Option<ResearchLedger>> {
302    let Some(mission) = load_current_mission(project_root)? else {
303        return Ok(None);
304    };
305    Ok(Some(ResearchLedger::load(ledger_path(
306        project_root,
307        &mission.id,
308    ))?))
309}
310
311pub fn stop_current_research(project_root: &Path) -> Result<Option<ResearchMission>> {
312    let mission = load_current_mission(project_root)?;
313    let path = current_mission_path(project_root);
314    if path.exists() {
315        fs::remove_file(&path).with_context(|| format!("failed to remove {}", path.display()))?;
316    }
317    Ok(mission)
318}
319
320pub fn print_status(project_root: &Path) -> Result<()> {
321    let Some(status) = current_status(project_root)? else {
322        println!("No active research mission.");
323        return Ok(());
324    };
325    println!("Mission: {}", status.mission.id);
326    println!("Hypothesis: {}", status.mission.hypothesis);
327    println!("Worktree: {}", status.mission.worktree_dir.display());
328    println!(
329        "Keep policy: {}",
330        keep_policy_name(&status.mission.keep_policy)
331    );
332    println!(
333        "Baseline: {}",
334        status
335            .mission
336            .baseline
337            .as_ref()
338            .map(summary_line)
339            .unwrap_or_else(|| "none".to_string())
340    );
341    if let Some(entry) = status.latest_entry {
342        println!(
343            "Latest: iteration={} decision={} commit={} {}",
344            entry.iteration,
345            decision_name(&entry.decision),
346            entry.commit,
347            summary_line(&entry.evaluation)
348        );
349    }
350    Ok(())
351}
352
353pub fn print_ledger(project_root: &Path) -> Result<()> {
354    let Some(ledger) = read_current_ledger(project_root)? else {
355        println!("No active research mission.");
356        return Ok(());
357    };
358    println!("iteration  commit   pass  score  parity  decision");
359    for entry in ledger.entries {
360        println!(
361            "{:<10} {:<8} {:<5} {:<6} {:<7} {}",
362            entry.iteration,
363            shorten_commit(&entry.commit),
364            entry.evaluation.pass,
365            entry
366                .evaluation
367                .score
368                .map(|score| format!("{score:.2}"))
369                .unwrap_or_else(|| "-".to_string()),
370            entry
371                .evaluation
372                .parity_pct
373                .map(|pct| format!("{pct}%"))
374                .unwrap_or_else(|| "-".to_string()),
375            decision_name(&entry.decision),
376        );
377    }
378    Ok(())
379}
380
381fn summary_line(result: &EvaluationResult) -> String {
382    format!(
383        "pass={} score={} parity={} exit={}",
384        result.pass,
385        result
386            .score
387            .map(|score| format!("{score:.2}"))
388            .unwrap_or_else(|| "-".to_string()),
389        result
390            .parity_pct
391            .map(|pct| format!("{pct}%"))
392            .unwrap_or_else(|| "-".to_string()),
393        result.exit_code
394    )
395}
396
397fn run_evaluator(
398    command: &str,
399    worktree_dir: &Path,
400    format: &EvaluatorFormat,
401) -> Result<EvaluationResult> {
402    let started = Instant::now();
403    let output = std::process::Command::new("sh")
404        .args(["-lc", command])
405        .current_dir(worktree_dir)
406        .output()
407        .with_context(|| {
408            format!(
409                "failed to execute evaluator `{command}` in {}",
410                worktree_dir.display()
411            )
412        })?;
413    let duration_secs = started.elapsed().as_secs_f64();
414    let stdout = String::from_utf8_lossy(&output.stdout).to_string();
415    let stderr = String::from_utf8_lossy(&output.stderr).to_string();
416    let exit_code = output.status.code().unwrap_or(-1);
417
418    match format {
419        EvaluatorFormat::Json => {
420            #[derive(Deserialize)]
421            struct JsonEvaluation {
422                pass: Option<bool>,
423                score: Option<f64>,
424                parity_pct: Option<u32>,
425            }
426
427            let parsed: JsonEvaluation = serde_json::from_str(stdout.trim())
428                .with_context(|| format!("failed to parse evaluator JSON from `{command}`"))?;
429            Ok(EvaluationResult {
430                pass: parsed.pass.unwrap_or(output.status.success()),
431                score: parsed.score,
432                parity_pct: parsed
433                    .parity_pct
434                    .or_else(|| current_parity_pct(worktree_dir)),
435                exit_code,
436                stdout,
437                stderr,
438                duration_secs,
439            })
440        }
441        EvaluatorFormat::ExitCode => Ok(EvaluationResult {
442            pass: output.status.success(),
443            score: None,
444            parity_pct: current_parity_pct(worktree_dir),
445            exit_code,
446            stdout,
447            stderr,
448            duration_secs,
449        }),
450    }
451}
452
453fn current_parity_pct(project_root: &Path) -> Option<u32> {
454    ParityReport::load(project_root)
455        .ok()
456        .map(|report| report.summary().overall_parity_pct as u32)
457}
458
459fn git_head(worktree_dir: &Path) -> Result<String> {
460    let output = std::process::Command::new("git")
461        .args(["rev-parse", "HEAD"])
462        .current_dir(worktree_dir)
463        .output()
464        .with_context(|| format!("failed to read HEAD in {}", worktree_dir.display()))?;
465    if !output.status.success() {
466        bail!("failed to read HEAD in {}", worktree_dir.display());
467    }
468    Ok(String::from_utf8_lossy(&output.stdout).trim().to_string())
469}
470
471fn git_commit_all(worktree_dir: &Path, message: &str) -> Result<()> {
472    let add = std::process::Command::new("git")
473        .args(["add", "-A"])
474        .current_dir(worktree_dir)
475        .status()
476        .with_context(|| format!("failed to stage worktree {}", worktree_dir.display()))?;
477    if !add.success() {
478        bail!("failed to stage worktree {}", worktree_dir.display());
479    }
480
481    let commit = std::process::Command::new("git")
482        .args(["commit", "-m", message])
483        .current_dir(worktree_dir)
484        .status()
485        .with_context(|| format!("failed to commit worktree {}", worktree_dir.display()))?;
486    if !commit.success() {
487        bail!("failed to commit worktree {}", worktree_dir.display());
488    }
489    Ok(())
490}
491
492fn git_reset_hard(worktree_dir: &Path, target: &str) -> Result<()> {
493    let status = std::process::Command::new("git")
494        .args(["reset", "--hard", target])
495        .current_dir(worktree_dir)
496        .status()
497        .with_context(|| format!("failed to reset {}", worktree_dir.display()))?;
498    if !status.success() {
499        bail!("failed to reset {} to {}", worktree_dir.display(), target);
500    }
501    Ok(())
502}
503
504fn mission_dir(project_root: &Path, mission_id: &str) -> PathBuf {
505    project_root
506        .join(".batty")
507        .join(RESEARCH_DIR)
508        .join(mission_id)
509}
510
511fn ledger_path(project_root: &Path, mission_id: &str) -> PathBuf {
512    mission_dir(project_root, mission_id).join(LEDGER_FILE)
513}
514
515fn mission_state_path(project_root: &Path, mission_id: &str) -> PathBuf {
516    mission_dir(project_root, mission_id).join(MISSION_STATE_FILE)
517}
518
519fn current_mission_path(project_root: &Path) -> PathBuf {
520    project_root
521        .join(".batty")
522        .join(RESEARCH_DIR)
523        .join(CURRENT_MISSION_FILE)
524}
525
526fn save_mission(project_root: &Path, mission: &ResearchMission) -> Result<()> {
527    let path = mission_state_path(project_root, &mission.id);
528    if let Some(parent) = path.parent() {
529        fs::create_dir_all(parent)
530            .with_context(|| format!("failed to create {}", parent.display()))?;
531    }
532    let content = serde_json::to_vec_pretty(mission)
533        .with_context(|| format!("failed to serialize {}", mission.id))?;
534    fs::write(&path, content).with_context(|| format!("failed to write {}", path.display()))?;
535    Ok(())
536}
537
538fn load_current_mission(project_root: &Path) -> Result<Option<ResearchMission>> {
539    let current = current_mission_path(project_root);
540    if !current.exists() {
541        return Ok(None);
542    }
543    let pointer: MissionPointer = serde_json::from_slice(
544        &fs::read(&current).with_context(|| format!("failed to read {}", current.display()))?,
545    )
546    .with_context(|| format!("failed to parse {}", current.display()))?;
547    let state_path = mission_state_path(project_root, &pointer.mission_id);
548    let mission = serde_json::from_slice(
549        &fs::read(&state_path)
550            .with_context(|| format!("failed to read {}", state_path.display()))?,
551    )
552    .with_context(|| format!("failed to parse {}", state_path.display()))?;
553    Ok(Some(mission))
554}
555
556fn set_current_mission(project_root: &Path, mission_id: &str) -> Result<()> {
557    let path = current_mission_path(project_root);
558    if let Some(parent) = path.parent() {
559        fs::create_dir_all(parent)
560            .with_context(|| format!("failed to create {}", parent.display()))?;
561    }
562    let content = serde_json::to_vec_pretty(&MissionPointer {
563        mission_id: mission_id.to_string(),
564    })?;
565    fs::write(&path, content).with_context(|| format!("failed to write {}", path.display()))?;
566    Ok(())
567}
568
569fn mission_id(hypothesis: &str) -> String {
570    let slug: String = hypothesis
571        .chars()
572        .map(|ch| {
573            if ch.is_ascii_alphanumeric() {
574                ch.to_ascii_lowercase()
575            } else {
576                '-'
577            }
578        })
579        .collect();
580    let compact = slug
581        .split('-')
582        .filter(|part| !part.is_empty())
583        .take(6)
584        .collect::<Vec<_>>()
585        .join("-");
586    format!("{}-{}", compact, chrono::Utc::now().timestamp())
587}
588
/// Returns at most the first seven characters of `commit`.
fn shorten_commit(commit: &str) -> String {
    match commit.char_indices().nth(7) {
        // Byte offset of the eighth character: everything before it is the first seven.
        Some((end, _)) => commit[..end].to_string(),
        None => commit.to_string(),
    }
}
592
593fn decision_name(decision: &ResearchDecision) -> &'static str {
594    match decision {
595        ResearchDecision::Baseline => "baseline",
596        ResearchDecision::Keep => "keep",
597        ResearchDecision::Discard => "discard",
598        ResearchDecision::Error => "error",
599    }
600}
601
602fn keep_policy_name(policy: &KeepPolicy) -> &'static str {
603    match policy {
604        KeepPolicy::PassOnly => "pass-only",
605        KeepPolicy::ScoreImprovement => "score-improvement",
606        KeepPolicy::ParityImprovement => "parity-improvement",
607    }
608}
609
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `git <args>` in `dir` and panics unless it exits successfully.
    fn git(dir: &Path, args: &[&str]) {
        let status = std::process::Command::new("git")
            .args(args)
            .current_dir(dir)
            .status()
            .unwrap();
        assert!(status.success(), "git {:?} failed", args);
    }

    /// Creates a throwaway repository with one committed file, `note.txt`.
    fn repo_with_file() -> tempfile::TempDir {
        let tmp = tempfile::tempdir().unwrap();
        git(tmp.path(), &["init"]);
        git(tmp.path(), &["config", "user.email", "test@example.com"]);
        git(tmp.path(), &["config", "user.name", "Test User"]);
        // A user-wide `commit.gpgsign = true` would otherwise make every commit in these
        // tests depend on a usable signing key.
        git(tmp.path(), &["config", "commit.gpgsign", "false"]);
        fs::write(tmp.path().join("note.txt"), "baseline\n").unwrap();
        git(tmp.path(), &["add", "note.txt"]);
        git(tmp.path(), &["commit", "-m", "baseline"]);
        tmp
    }

    /// Builds an evaluation with the given verdict and no captured output.
    fn baseline_result(
        score: Option<f64>,
        parity_pct: Option<u32>,
        pass: bool,
    ) -> EvaluationResult {
        EvaluationResult {
            pass,
            score,
            parity_pct,
            exit_code: if pass { 0 } else { 1 },
            stdout: String::new(),
            stderr: String::new(),
            duration_secs: 0.0,
        }
    }

    /// Builds a JSON-evaluated mission over `worktree` with the given baseline.
    fn mission_in(
        worktree: &Path,
        hypothesis: &str,
        evaluator_command: &str,
        keep_policy: KeepPolicy,
        baseline: EvaluationResult,
    ) -> ResearchMission {
        ResearchMission {
            id: "mission".to_string(),
            hypothesis: hypothesis.to_string(),
            evaluator_command: evaluator_command.to_string(),
            evaluator_format: EvaluatorFormat::Json,
            keep_policy,
            max_iterations: 3,
            worktree_dir: worktree.to_path_buf(),
            baseline: Some(baseline),
        }
    }

    /// Builds an in-memory ledger holding only a baseline entry for `commit`.
    fn ledger_with_baseline(
        worktree: &Path,
        evaluation: EvaluationResult,
        commit: &str,
    ) -> ResearchLedger {
        ResearchLedger {
            entries: vec![LedgerEntry {
                iteration: 0,
                decision: ResearchDecision::Baseline,
                evaluation,
                commit: commit.to_string(),
                timestamp: chrono::Utc::now(),
            }],
            path: worktree.join("ledger.jsonl"),
        }
    }

    #[test]
    fn pass_only_policy_discards_failures() {
        let tmp = repo_with_file();
        let baseline_commit = git_head(tmp.path()).unwrap();
        fs::write(tmp.path().join("note.txt"), "candidate\n").unwrap();
        let mut mission = mission_in(
            tmp.path(),
            "pass only",
            "printf '{\"pass\":false}' && exit 1",
            KeepPolicy::PassOnly,
            baseline_result(None, None, true),
        );
        let mut ledger = ledger_with_baseline(
            tmp.path(),
            baseline_result(None, None, true),
            &baseline_commit,
        );

        let decision = run_research_iteration(&mut mission, &mut ledger).unwrap();
        assert_eq!(decision, ResearchDecision::Discard);
        assert_eq!(git_head(tmp.path()).unwrap(), baseline_commit);
    }

    #[test]
    fn start_research_records_baseline_entry() {
        let root = tempfile::tempdir().unwrap();
        let worktree = repo_with_file();

        let mission = start_research(
            root.path(),
            StartResearchOptions {
                hypothesis: "record baseline".to_string(),
                evaluator_command: "printf '{\"pass\":true,\"score\":1.0}'".to_string(),
                evaluator_format: EvaluatorFormat::Json,
                keep_policy: KeepPolicy::ScoreImprovement,
                max_iterations: 10,
                worktree_dir: worktree.path().to_path_buf(),
            },
        )
        .unwrap();

        let current = current_status(root.path()).unwrap().unwrap();
        assert_eq!(current.mission.id, mission.id);
        assert_eq!(
            current
                .latest_entry
                .as_ref()
                .map(|entry| entry.decision.clone()),
            Some(ResearchDecision::Baseline)
        );

        let ledger = read_current_ledger(root.path()).unwrap().unwrap();
        assert_eq!(ledger.entries.len(), 1);
        assert_eq!(ledger.entries[0].evaluation.score, Some(1.0));
    }

    #[test]
    fn score_improvement_keeps_and_commits() {
        let tmp = repo_with_file();
        let baseline_commit = git_head(tmp.path()).unwrap();
        fs::write(tmp.path().join("note.txt"), "candidate\n").unwrap();
        let mut mission = mission_in(
            tmp.path(),
            "improve score",
            "printf '{\"pass\":true,\"score\":2.0}'",
            KeepPolicy::ScoreImprovement,
            baseline_result(Some(1.0), None, true),
        );
        let mut ledger = ledger_with_baseline(
            tmp.path(),
            baseline_result(Some(1.0), None, true),
            &baseline_commit,
        );

        let decision = run_research_iteration(&mut mission, &mut ledger).unwrap();
        assert_eq!(decision, ResearchDecision::Keep);
        assert_eq!(
            mission.baseline.as_ref().and_then(|result| result.score),
            Some(2.0)
        );
        assert_eq!(ledger.entries.len(), 2);

        // Keeping must create a commit, and the ledger must point at it.
        let head = git_head(tmp.path()).unwrap();
        assert_ne!(head, baseline_commit);
        assert_eq!(ledger.entries[1].commit, head);
    }

    #[test]
    fn discard_resets_to_last_kept_commit() {
        let tmp = repo_with_file();
        let baseline_commit = git_head(tmp.path()).unwrap();
        fs::write(tmp.path().join("note.txt"), "bad candidate\n").unwrap();
        let mut mission = mission_in(
            tmp.path(),
            "avoid regression",
            "printf '{\"pass\":true,\"score\":1.0}'",
            KeepPolicy::ScoreImprovement,
            baseline_result(Some(2.0), None, true),
        );
        let mut ledger = ledger_with_baseline(
            tmp.path(),
            baseline_result(Some(2.0), None, true),
            &baseline_commit,
        );

        let decision = run_research_iteration(&mut mission, &mut ledger).unwrap();
        assert_eq!(decision, ResearchDecision::Discard);
        assert_eq!(
            fs::read_to_string(tmp.path().join("note.txt")).unwrap(),
            "baseline\n"
        );
        assert_eq!(git_head(tmp.path()).unwrap(), baseline_commit);
    }

    #[test]
    fn parity_improvement_uses_parity_report() {
        let tmp = repo_with_file();
        fs::write(
            tmp.path().join("PARITY.md"),
            concat!(
                "---\n",
                "project: trivial\n",
                "target: trivial.z80\n",
                "source_platform: zx-spectrum-z80\n",
                "target_language: rust\n",
                "last_verified: 2026-04-06\n",
                "overall_parity: 50%\n",
                "---\n\n",
                "| Behavior | Spec | Test | Implementation | Verified | Notes |\n",
                "| --- | --- | --- | --- | --- | --- |\n",
                "| Startup | complete | complete | complete | PASS | ok |\n",
                "| Errors | complete | complete | draft | -- | pending |\n",
            ),
        )
        .unwrap();

        let result = run_evaluator("true", tmp.path(), &EvaluatorFormat::ExitCode).unwrap();
        assert_eq!(result.parity_pct, Some(50));
    }
}