Skip to main content

truth_mirror/
reviewer.rs

1//! Reviewer process harness, model opposition, async queue, and verdict execution.
2
3use std::{
4    fs,
5    io::{self, Write},
6    path::{Path, PathBuf},
7    process::{Command, ExitCode, Stdio},
8    time::{SystemTime, UNIX_EPOCH},
9};
10
11use anyhow::{Context, Result};
12use serde::{Deserialize, Serialize};
13use thiserror::Error;
14
15use crate::{
16    claim::{Claim, EvidenceRef},
17    cli::{self, Agent, ReviewerHarness},
18    config,
19    ledger::{LedgerEntry, LedgerStore, ReviewerConfig, Verdict},
20};
21
/// File name of the JSON-lines review queue, resolved relative to the queue root.
pub const REVIEW_QUEUE_FILE: &str = "review-queue.jsonl";
23
24#[derive(Clone, Debug, Eq, PartialEq)]
25pub struct ReviewRequest {
26    pub watched_agent: Agent,
27    pub watched_model: String,
28    pub reviewer_harness: ReviewerHarness,
29    pub reviewer_model: String,
30    pub allow_same_model: bool,
31    pub prompt: String,
32}
33
34impl ReviewRequest {
35    pub fn new(
36        watched_agent: Agent,
37        watched_model: impl Into<String>,
38        reviewer_harness: ReviewerHarness,
39        reviewer_model: impl Into<String>,
40        allow_same_model: bool,
41        prompt: impl Into<String>,
42    ) -> Self {
43        Self {
44            watched_agent,
45            watched_model: watched_model.into(),
46            reviewer_harness,
47            reviewer_model: reviewer_model.into(),
48            allow_same_model,
49            prompt: prompt.into(),
50        }
51    }
52}
53
54/// Resolved reviewer selection shared by `review` and `watch`. Explicit CLI
55/// values win; anything omitted falls back to `[review]` config.
56#[derive(Clone, Debug, Eq, PartialEq)]
57pub struct ReviewSelection {
58    pub watched_agent: Agent,
59    pub watched_model: String,
60    pub reviewer_harness: ReviewerHarness,
61    pub reviewer_model: String,
62    pub allow_same_model: bool,
63    pub strict: Option<StrictReviewConfig>,
64}
65
66impl ReviewSelection {
67    #[allow(clippy::too_many_arguments)]
68    pub fn resolve(
69        watched_agent: Option<Agent>,
70        reviewer_harness: Option<ReviewerHarness>,
71        watched_model: Option<String>,
72        reviewer_model: Option<String>,
73        allow_same_model: bool,
74        strict: Option<StrictReviewConfig>,
75        config: &config::TruthMirrorConfig,
76    ) -> Result<Self, ReviewerError> {
77        let watched_agent = match watched_agent {
78            Some(agent) => agent,
79            None => agent_from_slug(&config.review.watched.harness)?,
80        };
81        let reviewer_harness = match reviewer_harness {
82            Some(harness) => harness,
83            None => harness_from_slug(&config.review.reviewer.harness)?,
84        };
85        let watched_model = watched_model.unwrap_or_else(|| config.review.watched.model.clone());
86        let reviewer_model = reviewer_model.unwrap_or_else(|| config.review.reviewer.model.clone());
87        Ok(Self {
88            watched_agent,
89            watched_model,
90            reviewer_harness,
91            reviewer_model,
92            allow_same_model,
93            strict,
94        })
95    }
96
97    fn request_for(&self, prompt: String) -> ReviewRequest {
98        ReviewRequest::new(
99            self.watched_agent,
100            self.watched_model.clone(),
101            self.reviewer_harness,
102            self.reviewer_model.clone(),
103            self.allow_same_model,
104            prompt,
105        )
106    }
107}
108
109#[derive(Clone, Debug, Eq, PartialEq)]
110pub struct ReviewPlan {
111    pub watched_agent: Agent,
112    pub watched_model: String,
113    pub reviewer_harness: ReviewerHarness,
114    pub reviewer_model: String,
115    pub allow_same_model: bool,
116    pub invocation: InvocationPlan,
117}
118
119impl ReviewPlan {
120    pub fn build(request: ReviewRequest) -> Result<Self, ReviewerError> {
121        validate_model_present("watched", &request.watched_model)?;
122        validate_model_present("reviewer", &request.reviewer_model)?;
123
124        if !request.allow_same_model
125            && normalized_model(&request.watched_model) == normalized_model(&request.reviewer_model)
126        {
127            return Err(ReviewerError::SameModelWithoutWaiver {
128                watched_model: request.watched_model,
129                reviewer_model: request.reviewer_model,
130            });
131        }
132
133        let invocation =
134            InvocationPlan::for_harness(request.reviewer_harness, &request.reviewer_model)?;
135
136        Ok(Self {
137            watched_agent: request.watched_agent,
138            watched_model: request.watched_model,
139            reviewer_harness: request.reviewer_harness,
140            reviewer_model: request.reviewer_model,
141            allow_same_model: request.allow_same_model,
142            invocation,
143        })
144    }
145
146    pub fn run_with<R: ProcessRunner>(
147        &self,
148        prompt: &str,
149        runner: &R,
150    ) -> Result<ProcessOutput, ReviewerError> {
151        runner.run(&self.invocation, prompt)
152    }
153
154    fn reviewer_config(&self) -> ReviewerConfig {
155        ReviewerConfig::new(
156            harness_slug(self.reviewer_harness),
157            self.reviewer_model.clone(),
158            self.allow_same_model,
159        )
160    }
161}
162
163#[derive(Clone, Debug, Eq, PartialEq)]
164pub struct InvocationPlan {
165    pub program: String,
166    pub args: Vec<String>,
167    pub prompt_delivery: PromptDelivery,
168}
169
170impl InvocationPlan {
171    pub fn for_harness(harness: ReviewerHarness, model: &str) -> Result<Self, ReviewerError> {
172        validate_model_present("reviewer", model)?;
173        let model = model.trim();
174
175        let plan = match harness {
176            ReviewerHarness::Claude => Self {
177                program: "claude".to_owned(),
178                args: vec!["--print".to_owned(), "--model".to_owned(), model.to_owned()],
179                prompt_delivery: PromptDelivery::Stdin,
180            },
181            ReviewerHarness::Codex => Self {
182                program: "codex".to_owned(),
183                args: vec!["exec".to_owned(), "-m".to_owned(), model.to_owned()],
184                prompt_delivery: PromptDelivery::PositionalArgument,
185            },
186            ReviewerHarness::Pi => Self {
187                program: "pi".to_owned(),
188                args: vec!["--model".to_owned(), model.to_owned(), "-p".to_owned()],
189                prompt_delivery: PromptDelivery::Stdin,
190            },
191            ReviewerHarness::Gemini => Self {
192                program: "gemini".to_owned(),
193                args: vec!["-m".to_owned(), model.to_owned()],
194                prompt_delivery: PromptDelivery::FlagValue("-p".to_owned()),
195            },
196            ReviewerHarness::Opencode => Self {
197                program: "opencode".to_owned(),
198                args: vec!["run".to_owned(), "--model".to_owned(), model.to_owned()],
199                prompt_delivery: PromptDelivery::PositionalArgument,
200            },
201            ReviewerHarness::Custom => return Err(ReviewerError::UnsupportedCustomHarness),
202        };
203
204        Ok(plan)
205    }
206
207    pub fn args_for_prompt(&self, prompt: &str) -> Vec<String> {
208        let mut args = self.args.clone();
209        match &self.prompt_delivery {
210            PromptDelivery::Stdin => {}
211            PromptDelivery::PositionalArgument => args.push(prompt.to_owned()),
212            PromptDelivery::FlagValue(flag) => {
213                args.push(flag.clone());
214                args.push(prompt.to_owned());
215            }
216        }
217        args
218    }
219}
220
/// How the prompt text reaches the reviewer program.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum PromptDelivery {
    /// The prompt is written to the process's standard input.
    Stdin,
    /// The prompt is appended as the last command-line argument.
    PositionalArgument,
    /// The named flag is appended, followed by the prompt as its value.
    FlagValue(String),
}
227
/// Captured result of one reviewer process run.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ProcessOutput {
    /// Exit code of the process; `None` when no exit code is available.
    pub status_code: Option<i32>,
    /// Everything the process wrote to standard output.
    pub stdout: String,
    /// Everything the process wrote to standard error.
    pub stderr: String,
}
234
235pub trait ProcessRunner {
236    fn run(
237        &self,
238        invocation: &InvocationPlan,
239        prompt: &str,
240    ) -> Result<ProcessOutput, ReviewerError>;
241}
242
243#[derive(Clone, Copy, Debug, Default)]
244pub struct StdProcessRunner;
245
246impl ProcessRunner for StdProcessRunner {
247    fn run(
248        &self,
249        invocation: &InvocationPlan,
250        prompt: &str,
251    ) -> Result<ProcessOutput, ReviewerError> {
252        let mut command = Command::new(&invocation.program);
253        command.args(invocation.args_for_prompt(prompt));
254        command.stdout(Stdio::piped()).stderr(Stdio::piped());
255
256        if invocation.prompt_delivery == PromptDelivery::Stdin {
257            command.stdin(Stdio::piped());
258        }
259
260        let mut child = command.spawn().map_err(ReviewerError::Spawn)?;
261        if invocation.prompt_delivery == PromptDelivery::Stdin {
262            let mut stdin = child.stdin.take().ok_or(ReviewerError::MissingStdinPipe)?;
263            stdin
264                .write_all(prompt.as_bytes())
265                .map_err(ReviewerError::WritePrompt)?;
266        }
267
268        let output = child.wait_with_output().map_err(ReviewerError::Wait)?;
269        Ok(ProcessOutput {
270            status_code: output.status.code(),
271            stdout: String::from_utf8_lossy(&output.stdout).into_owned(),
272            stderr: String::from_utf8_lossy(&output.stderr).into_owned(),
273        })
274    }
275}
276
277#[derive(Clone, Debug, Eq, PartialEq)]
278pub struct ReviewJob {
279    pub commit_sha: String,
280    pub claim: Claim,
281    pub diff: String,
282    pub request: ReviewRequest,
283    pub strict: Option<StrictReviewConfig>,
284}
285
286#[derive(Clone, Debug, Eq, PartialEq)]
287pub struct StrictReviewConfig {
288    pub arbiter_harness: ReviewerHarness,
289    pub arbiter_model: String,
290}
291
292#[derive(Clone, Debug, Eq, PartialEq)]
293pub struct ReviewExecution {
294    pub entries: Vec<LedgerEntry>,
295}
296
297pub fn execute_review_job<R: ProcessRunner>(
298    job: ReviewJob,
299    runner: &R,
300    store: &LedgerStore,
301) -> Result<ReviewExecution, ReviewerError> {
302    let first_plan = ReviewPlan::build(job.request.clone())?;
303    let first_output = first_plan.run_with(&job.request.prompt, runner)?;
304    ensure_process_success(&first_output)?;
305    let first_verdict = ParsedVerdict::parse(&first_output.stdout)?;
306    let first_entry = entry_from_verdict(&job, &first_plan, &first_verdict);
307    store.append_entry(&first_entry)?;
308
309    let mut entries = vec![first_entry];
310    if let Some(strict) = &job.strict
311        && first_verdict.verdict == Verdict::Pass
312        && first_verdict.findings.is_empty()
313    {
314        validate_strict_arbiter(&job.request, strict)?;
315        let strict_prompt = strict_second_pass_prompt(&job, &first_output.stdout);
316        let strict_request = ReviewRequest::new(
317            job.request.watched_agent,
318            job.request.watched_model.clone(),
319            strict.arbiter_harness,
320            strict.arbiter_model.clone(),
321            false,
322            strict_prompt,
323        );
324        let strict_plan = ReviewPlan::build(strict_request.clone())?;
325        let strict_output = strict_plan.run_with(&strict_request.prompt, runner)?;
326        ensure_process_success(&strict_output)?;
327        let strict_verdict = ParsedVerdict::parse(&strict_output.stdout)?;
328        let strict_entry = entry_from_verdict(&job, &strict_plan, &strict_verdict);
329        store.append_entry(&strict_entry)?;
330        entries.push(strict_entry);
331    }
332
333    Ok(ReviewExecution { entries })
334}
335
336#[derive(Clone, Debug, Eq, PartialEq)]
337pub struct ParsedVerdict {
338    pub verdict: Verdict,
339    pub findings: Vec<String>,
340    pub raw: String,
341}
342
343impl ParsedVerdict {
344    pub fn parse(output: &str) -> Result<Self, ReviewerError> {
345        let verdict = output.lines().find_map(parse_verdict_line).ok_or_else(|| {
346            ReviewerError::VerdictParse {
347                output: output.to_owned(),
348            }
349        })?;
350        let findings = parse_findings(output);
351
352        Ok(Self {
353            verdict,
354            findings,
355            raw: output.to_owned(),
356        })
357    }
358}
359
360#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
361pub struct QueuedReview {
362    pub commit_sha: String,
363    pub enqueued_at_unix: u64,
364}
365
366#[derive(Clone, Debug)]
367pub struct ReviewQueue {
368    root: PathBuf,
369}
370
371impl ReviewQueue {
372    pub fn new(root: impl Into<PathBuf>) -> Self {
373        Self { root: root.into() }
374    }
375
376    pub fn path(&self) -> PathBuf {
377        self.root.join(REVIEW_QUEUE_FILE)
378    }
379
380    pub fn enqueue(&self, commit_sha: impl Into<String>) -> Result<QueuedReview, ReviewerError> {
381        fs::create_dir_all(&self.root).map_err(ReviewerError::QueueIo)?;
382        let item = QueuedReview {
383            commit_sha: commit_sha.into(),
384            enqueued_at_unix: unix_now(),
385        };
386        let mut file = fs::OpenOptions::new()
387            .create(true)
388            .append(true)
389            .open(self.path())
390            .map_err(ReviewerError::QueueIo)?;
391        serde_json::to_writer(&mut file, &item).map_err(ReviewerError::QueueJson)?;
392        writeln!(file).map_err(ReviewerError::QueueIo)?;
393        Ok(item)
394    }
395
396    pub fn pending(&self) -> Result<Vec<QueuedReview>, ReviewerError> {
397        let contents = match fs::read_to_string(self.path()) {
398            Ok(contents) => contents,
399            Err(error) if error.kind() == io::ErrorKind::NotFound => return Ok(Vec::new()),
400            Err(error) => return Err(ReviewerError::QueueIo(error)),
401        };
402
403        contents
404            .lines()
405            .filter(|line| !line.trim().is_empty())
406            .map(|line| serde_json::from_str(line).map_err(ReviewerError::QueueJson))
407            .collect()
408    }
409
410    /// Drop every queued item for `sha`, preserving any items appended for other
411    /// commits. Called after a commit is reviewed so a drain never repeats work.
412    pub fn remove_sha(&self, sha: &str) -> Result<(), ReviewerError> {
413        let remaining: Vec<QueuedReview> = self
414            .pending()?
415            .into_iter()
416            .filter(|item| item.commit_sha != sha)
417            .collect();
418        self.rewrite(&remaining)
419    }
420
421    fn rewrite(&self, items: &[QueuedReview]) -> Result<(), ReviewerError> {
422        if items.is_empty() {
423            return match fs::remove_file(self.path()) {
424                Ok(()) => Ok(()),
425                Err(error) if error.kind() == io::ErrorKind::NotFound => Ok(()),
426                Err(error) => Err(ReviewerError::QueueIo(error)),
427            };
428        }
429
430        let mut file = fs::File::create(self.path()).map_err(ReviewerError::QueueIo)?;
431        for item in items {
432            serde_json::to_writer(&mut file, item).map_err(ReviewerError::QueueJson)?;
433            writeln!(file).map_err(ReviewerError::QueueIo)?;
434        }
435        Ok(())
436    }
437}
438
439/// Loads the claim and diff for a commit so the reviewer can run against it.
440/// Abstracted so `drain_once` can be unit-tested without a real git repository.
441pub trait MaterialLoader {
442    fn load(&self, sha: &str) -> Result<(Claim, String), ReviewerError>;
443}
444
445#[derive(Clone, Copy, Debug, Default)]
446pub struct GitMaterialLoader;
447
448impl MaterialLoader for GitMaterialLoader {
449    fn load(&self, sha: &str) -> Result<(Claim, String), ReviewerError> {
450        let message = git_output(["show", "--format=%B", "--no-patch", sha])?;
451        let diff = git_output(["show", "--format=", "--patch", sha])?;
452        let claim = Claim::parse(&message)?;
453        Ok((claim, diff))
454    }
455}
456
/// Summary of one queue drain.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct DrainReport {
    /// Commits that were reviewed, in processing order.
    pub reviewed: Vec<String>,
    /// Total number of ledger entries produced by those reviews.
    pub ledger_entries: usize,
}
462
463/// Review every distinct queued commit exactly once, record verdicts, and remove
464/// each commit from the queue as soon as its review lands. A commit whose review
465/// errors stays queued for the next drain.
466pub fn drain_once<R: ProcessRunner, L: MaterialLoader>(
467    queue: &ReviewQueue,
468    loader: &L,
469    selection: &ReviewSelection,
470    runner: &R,
471    store: &LedgerStore,
472) -> Result<DrainReport, ReviewerError> {
473    let pending = queue.pending()?;
474    let mut seen = std::collections::BTreeSet::new();
475    let mut order = Vec::new();
476    for item in &pending {
477        if seen.insert(item.commit_sha.clone()) {
478            order.push(item.commit_sha.clone());
479        }
480    }
481
482    let mut report = DrainReport::default();
483    for sha in order {
484        let (claim, diff) = loader.load(&sha)?;
485        let prompt = first_pass_prompt(&claim, &diff);
486        let job = ReviewJob {
487            commit_sha: sha.clone(),
488            claim,
489            diff,
490            request: selection.request_for(prompt),
491            strict: selection.strict.clone(),
492        };
493        let execution = execute_review_job(job, runner, store)?;
494        report.ledger_entries += execution.entries.len();
495        queue.remove_sha(&sha)?;
496        report.reviewed.push(sha);
497    }
498
499    Ok(report)
500}
501
502pub fn run_watch_command(
503    args: cli::WatchArgs,
504    state_dir: &Path,
505    config: &config::TruthMirrorConfig,
506) -> Result<ExitCode> {
507    let selection = ReviewSelection::resolve(
508        args.watched_agent,
509        args.reviewer_harness,
510        args.watched_model,
511        args.reviewer_model,
512        args.allow_same_model,
513        None,
514        config,
515    )?;
516    let queue = ReviewQueue::new(state_dir);
517    let store = LedgerStore::new(state_dir);
518    let loader = GitMaterialLoader;
519    let runner = StdProcessRunner;
520
521    if args.once {
522        let report = drain_once(&queue, &loader, &selection, &runner, &store)?;
523        println!(
524            "truth-mirror watch: reviewed {} commit(s), wrote {} ledger entrie(s)",
525            report.reviewed.len(),
526            report.ledger_entries
527        );
528        return Ok(ExitCode::SUCCESS);
529    }
530
531    let interval = std::time::Duration::from_secs(args.poll_secs.max(1));
532    loop {
533        let report = drain_once(&queue, &loader, &selection, &runner, &store)?;
534        if !report.reviewed.is_empty() {
535            println!(
536                "truth-mirror watch: reviewed {} commit(s)",
537                report.reviewed.len()
538            );
539        }
540        std::thread::sleep(interval);
541    }
542}
543
/// Thresholds at which the strict-goal loop stops. A threshold of `0` disables
/// that stop condition.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct StrictGoalPolicy {
    /// Stop once this many lies have been exposed (`0` = never).
    pub stop_after_lies: u32,
    /// Stop once this many fuckups have been registered (`0` = never).
    pub stop_after_fuckups: u32,
}

/// Running totals accumulated by the strict-goal loop.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct StrictGoalCounters {
    /// Number of lies exposed so far.
    pub lies_exposed: u32,
    /// Number of fuckups registered so far.
    pub fuckups_registered: u32,
}

/// Outcome of evaluating a [`StrictGoalPolicy`] against the current counters.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum StrictGoalDecision {
    /// No threshold reached; keep going.
    Continue,
    /// A threshold was reached; `reason` says which one.
    Stop { reason: StrictGoalStopReason },
}

/// Which threshold caused a stop.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum StrictGoalStopReason {
    /// The lie threshold was reached.
    LiesExposed,
    /// The fuckup threshold was reached.
    FuckupsRegistered,
}

impl StrictGoalPolicy {
    /// Decides whether the loop should stop for `counters`. The lie threshold
    /// is checked before the fuckup threshold, so it wins when both are met.
    pub fn decide(&self, counters: StrictGoalCounters) -> StrictGoalDecision {
        // A zero limit means "disabled", not "stop immediately".
        let reached = |limit: u32, count: u32| limit > 0 && count >= limit;

        let reason = if reached(self.stop_after_lies, counters.lies_exposed) {
            Some(StrictGoalStopReason::LiesExposed)
        } else if reached(self.stop_after_fuckups, counters.fuckups_registered) {
            Some(StrictGoalStopReason::FuckupsRegistered)
        } else {
            None
        };

        reason.map_or(StrictGoalDecision::Continue, |reason| {
            StrictGoalDecision::Stop { reason }
        })
    }
}
585
586#[derive(Clone, Debug, Eq, PartialEq)]
587pub struct StrictGoalOutcome {
588    pub passes: u32,
589    pub counters: StrictGoalCounters,
590    /// `None` means the loop stopped at the `max_passes` ceiling rather than
591    /// hitting a configured lie/fuckup threshold.
592    pub stop_reason: Option<StrictGoalStopReason>,
593    pub entries: Vec<LedgerEntry>,
594}
595
596impl StrictGoalOutcome {
597    pub fn stop_reason_suffix(&self) -> &'static str {
598        match self.stop_reason {
599            Some(StrictGoalStopReason::LiesExposed) => " (stopped: lies exposed)",
600            Some(StrictGoalStopReason::FuckupsRegistered) => " (stopped: fuckups registered)",
601            None => " (stopped: max passes)",
602        }
603    }
604}
605
606/// Sic the adversarial reviewer on a commit in a loop, accumulating exposed lies
607/// (REJECT verdicts) and registered fuckups (individual findings). Every pass is
608/// recorded in the ledger. The loop stops when `policy` says the configured `N`
609/// is reached, or when `max_passes` is hit so an honest agent still terminates.
610#[allow(clippy::too_many_arguments)]
611pub fn run_strict_goal_loop<R: ProcessRunner>(
612    commit_sha: &str,
613    claim: &Claim,
614    diff: &str,
615    selection: &ReviewSelection,
616    policy: StrictGoalPolicy,
617    max_passes: u32,
618    runner: &R,
619    store: &LedgerStore,
620) -> Result<StrictGoalOutcome, ReviewerError> {
621    let ceiling = max_passes.max(1);
622    let mut outcome = StrictGoalOutcome {
623        passes: 0,
624        counters: StrictGoalCounters {
625            lies_exposed: 0,
626            fuckups_registered: 0,
627        },
628        stop_reason: None,
629        entries: Vec::new(),
630    };
631
632    while outcome.passes < ceiling {
633        let prompt = strict_goal_prompt(claim, diff, outcome.passes + 1, &outcome.entries);
634        let request = selection.request_for(prompt);
635        let plan = ReviewPlan::build(request.clone())?;
636        let output = plan.run_with(&request.prompt, runner)?;
637        ensure_process_success(&output)?;
638        let verdict = ParsedVerdict::parse(&output.stdout)?;
639
640        let job = ReviewJob {
641            commit_sha: commit_sha.to_owned(),
642            claim: claim.clone(),
643            diff: diff.to_owned(),
644            request,
645            strict: None,
646        };
647        let entry = entry_from_verdict(&job, &plan, &verdict);
648        store.append_entry(&entry)?;
649        outcome.entries.push(entry);
650
651        outcome.passes += 1;
652        if verdict.verdict == Verdict::Reject {
653            outcome.counters.lies_exposed += 1;
654        }
655        outcome.counters.fuckups_registered = outcome
656            .counters
657            .fuckups_registered
658            .saturating_add(u32::try_from(verdict.findings.len()).unwrap_or(u32::MAX));
659
660        if let StrictGoalDecision::Stop { reason } = policy.decide(outcome.counters) {
661            outcome.stop_reason = Some(reason);
662            break;
663        }
664    }
665
666    Ok(outcome)
667}
668
669fn strict_goal_prompt(claim: &Claim, diff: &str, pass: u32, prior: &[LedgerEntry]) -> String {
670    let prior_findings: Vec<String> = prior
671        .iter()
672        .flat_map(|entry| entry.findings.clone())
673        .collect();
674    let prior_block = if prior_findings.is_empty() {
675        "(none)".to_owned()
676    } else {
677        prior_findings.join("\n")
678    };
679    format!(
680        "Adversarial strict-goal review, pass {pass}. Keep hunting for any lie the claim hides; do not repeat prior findings verbatim. Reply with 'VERDICT: PASS' or 'VERDICT: REJECT' and a FINDINGS section.\n\nCLAIM:\n{}\n\nPRIOR FINDINGS:\n{prior_block}\n\nDIFF:\n{}",
681        claim.to_line(),
682        diff
683    )
684}
685
686pub fn run_review_command(
687    args: cli::ReviewArgs,
688    state_dir: &Path,
689    config: &config::TruthMirrorConfig,
690) -> Result<ExitCode> {
691    let material = ReviewMaterial::load(&args, state_dir)?;
692
693    let strict = if args.strict_two_pass {
694        Some(StrictReviewConfig {
695            arbiter_harness: args
696                .arbiter_harness
697                .context("--strict-two-pass requires --arbiter-harness")?,
698            arbiter_model: args
699                .arbiter_model
700                .context("--strict-two-pass requires --arbiter-model")?,
701        })
702    } else {
703        None
704    };
705
706    let selection = ReviewSelection::resolve(
707        args.watched_agent,
708        args.reviewer_harness,
709        args.watched_model,
710        args.reviewer_model,
711        args.allow_same_model,
712        strict,
713        config,
714    )?;
715    let store = LedgerStore::new(state_dir);
716
717    if args.strict_goal {
718        let policy = config
719            .strict
720            .goal_policy(args.stop_after_lies, args.stop_after_fuckups);
721        let max_passes = args.max_passes.unwrap_or(config.strict.max_passes);
722        let outcome = run_strict_goal_loop(
723            &material.commit_sha,
724            &material.claim,
725            &material.diff,
726            &selection,
727            policy,
728            max_passes,
729            &StdProcessRunner,
730            &store,
731        )?;
732        println!(
733            "truth-mirror strict-goal: {} pass(es), {} lie(s), {} fuckup(s){}",
734            outcome.passes,
735            outcome.counters.lies_exposed,
736            outcome.counters.fuckups_registered,
737            outcome.stop_reason_suffix(),
738        );
739        return Ok(ExitCode::SUCCESS);
740    }
741
742    let prompt = first_pass_prompt(&material.claim, &material.diff);
743    let job = ReviewJob {
744        commit_sha: material.commit_sha,
745        claim: material.claim,
746        diff: material.diff,
747        request: selection.request_for(prompt),
748        strict: selection.strict.clone(),
749    };
750
751    execute_review_job(job, &StdProcessRunner, &store)?;
752    Ok(ExitCode::SUCCESS)
753}
754
755#[derive(Clone, Debug, Eq, PartialEq)]
756struct ReviewMaterial {
757    commit_sha: String,
758    claim: Claim,
759    diff: String,
760}
761
762impl ReviewMaterial {
763    fn load(args: &cli::ReviewArgs, state_dir: &Path) -> Result<Self, ReviewerError> {
764        if args.staged {
765            let diff = git_output(["diff", "--cached"])?;
766            let claim_path = state_dir.join("claim.txt");
767            let claim_text =
768                fs::read_to_string(&claim_path).map_err(|source| ReviewerError::ClaimFileRead {
769                    path: claim_path.clone(),
770                    source,
771                })?;
772            let claim = Claim::parse(&claim_text)?;
773            return Ok(Self {
774                commit_sha: "STAGED".to_owned(),
775                claim,
776                diff,
777            });
778        }
779
780        let sha = args
781            .target
782            .clone()
783            .ok_or(ReviewerError::MissingReviewTarget)?;
784        let message = git_output(["show", "--format=%B", "--no-patch", sha.as_str()])?;
785        let diff = git_output(["show", "--format=", "--patch", sha.as_str()])?;
786        let claim = Claim::parse(&message)?;
787        Ok(Self {
788            commit_sha: sha,
789            claim,
790            diff,
791        })
792    }
793}
794
795#[derive(Debug, Error)]
796pub enum ReviewerError {
797    #[error("missing {role} model")]
798    MissingModel { role: String },
799    #[error(
800        "same reviewer model is disallowed without --allow-same-model: watched={watched_model}, reviewer={reviewer_model}"
801    )]
802    SameModelWithoutWaiver {
803        watched_model: String,
804        reviewer_model: String,
805    },
806    #[error("strict arbiter model must differ from watched and first reviewer models")]
807    StrictArbiterModelNotDistinct,
808    #[error("custom reviewer harness requires explicit command configuration")]
809    UnsupportedCustomHarness,
810    #[error("unknown watched agent {value:?}")]
811    UnknownAgent { value: String },
812    #[error("unknown reviewer harness {value:?}")]
813    UnknownHarness { value: String },
814    #[error("missing review target")]
815    MissingReviewTarget,
816    #[error("failed to read staged claim file {path}: {source}")]
817    ClaimFileRead {
818        path: PathBuf,
819        #[source]
820        source: io::Error,
821    },
822    #[error("reviewer output did not contain VERDICT: PASS or VERDICT: REJECT: {output:?}")]
823    VerdictParse { output: String },
824    #[error("reviewer process exited with status {status:?}: {stderr}")]
825    ReviewerProcessFailed { status: Option<i32>, stderr: String },
826    #[error("git command failed: git {args:?}: {stderr}")]
827    GitFailed { args: Vec<String>, stderr: String },
828    #[error("failed to spawn git command: {0}")]
829    GitSpawn(io::Error),
830    #[error("failed to spawn reviewer process: {0}")]
831    Spawn(io::Error),
832    #[error("failed to open reviewer stdin pipe")]
833    MissingStdinPipe,
834    #[error("failed to write reviewer prompt: {0}")]
835    WritePrompt(io::Error),
836    #[error("failed to wait for reviewer process: {0}")]
837    Wait(io::Error),
838    #[error("review queue IO failed: {0}")]
839    QueueIo(io::Error),
840    #[error("review queue JSON failed: {0}")]
841    QueueJson(serde_json::Error),
842    #[error(transparent)]
843    Claim(#[from] crate::claim::ClaimError),
844    #[error(transparent)]
845    Ledger(#[from] crate::ledger::LedgerError),
846}
847
848fn first_pass_prompt(claim: &Claim, diff: &str) -> String {
849    format!(
850        "Review this commit claim against the diff. Reply with 'VERDICT: PASS' or 'VERDICT: REJECT' and a FINDINGS section.\n\n{}\n\nDIFF:\n{}",
851        claim.to_line(),
852        diff
853    )
854}
855
856fn strict_second_pass_prompt(job: &ReviewJob, first_output: &str) -> String {
857    format!(
858        "Strict second-pass review. Try to falsify the first reviewer's clean verdict. Reply with 'VERDICT: PASS' or 'VERDICT: REJECT' and a FINDINGS section.\n\nCLAIM:\n{}\n\nFIRST REVIEW:\n{}\n\nDIFF:\n{}",
859        job.claim.to_line(),
860        first_output,
861        job.diff
862    )
863}
864
865fn entry_from_verdict(job: &ReviewJob, plan: &ReviewPlan, verdict: &ParsedVerdict) -> LedgerEntry {
866    LedgerEntry::new(
867        job.commit_sha.clone(),
868        verdict.verdict,
869        job.claim.to_line(),
870        job.claim
871            .evidence
872            .iter()
873            .map(EvidenceRef::as_str)
874            .map(str::to_owned)
875            .collect(),
876        plan.reviewer_config(),
877        verdict.findings.clone(),
878    )
879}
880
881fn parse_verdict_line(line: &str) -> Option<Verdict> {
882    let normalized = line.trim().to_ascii_uppercase();
883    if normalized == "PASS" || normalized == "VERDICT: PASS" {
884        Some(Verdict::Pass)
885    } else if normalized == "REJECT" || normalized == "VERDICT: REJECT" {
886        Some(Verdict::Reject)
887    } else {
888        None
889    }
890}
891
/// Extracts the findings listed after a `FINDINGS:` header line.
///
/// Every non-blank line after the first `FINDINGS:` line (ASCII
/// case-insensitive, on a line of its own) is one finding, with leading `- `
/// bullet markers removed. Further `FINDINGS:` header lines and lines starting
/// with `VERDICT:` are skipped.
///
/// Lines that only say there is nothing to report (`none`, `(none)`, `n/a`,
/// `no findings`, with or without bullet or trailing period) are not findings.
/// Counting them would make a clean review look like it found something.
///
/// Returns an empty list when the output has no `FINDINGS:` header.
fn parse_findings(output: &str) -> Vec<String> {
    const HEADER: &str = "FINDINGS:";
    const VERDICT_PREFIX: &[u8] = b"VERDICT:";
    const PLACEHOLDERS: [&str; 4] = ["none", "(none)", "n/a", "no findings"];

    let is_header = |line: &str| line.eq_ignore_ascii_case(HEADER);
    let is_verdict = |line: &str| {
        line.as_bytes()
            .get(..VERDICT_PREFIX.len())
            .is_some_and(|prefix| prefix.eq_ignore_ascii_case(VERDICT_PREFIX))
    };
    let is_placeholder = |finding: &str| {
        let bare = finding.trim().trim_end_matches('.').trim_end();
        PLACEHOLDERS
            .iter()
            .any(|placeholder| bare.eq_ignore_ascii_case(placeholder))
    };

    output
        .lines()
        .map(str::trim)
        .skip_while(|line| !is_header(line))
        .filter(|line| !line.is_empty() && !is_header(line) && !is_verdict(line))
        .map(|line| line.trim_start_matches("- "))
        .filter(|finding| !is_placeholder(finding))
        .map(str::to_owned)
        .collect()
}
914
915fn ensure_process_success(output: &ProcessOutput) -> Result<(), ReviewerError> {
916    if output.status_code == Some(0) {
917        return Ok(());
918    }
919
920    Err(ReviewerError::ReviewerProcessFailed {
921        status: output.status_code,
922        stderr: output.stderr.clone(),
923    })
924}
925
926fn validate_strict_arbiter(
927    request: &ReviewRequest,
928    strict: &StrictReviewConfig,
929) -> Result<(), ReviewerError> {
930    let arbiter = normalized_model(&strict.arbiter_model);
931    if arbiter == normalized_model(&request.watched_model)
932        || arbiter == normalized_model(&request.reviewer_model)
933    {
934        return Err(ReviewerError::StrictArbiterModelNotDistinct);
935    }
936    Ok(())
937}
938
939fn validate_model_present(role: &str, model: &str) -> Result<(), ReviewerError> {
940    if model.trim().is_empty() {
941        return Err(ReviewerError::MissingModel {
942            role: role.to_owned(),
943        });
944    }
945    Ok(())
946}
947
948fn git_output<const N: usize>(args: [&str; N]) -> Result<String, ReviewerError> {
949    let output = Command::new("git")
950        .args(args)
951        .output()
952        .map_err(ReviewerError::GitSpawn)?;
953    if !output.status.success() {
954        return Err(ReviewerError::GitFailed {
955            args: args.iter().map(|arg| (*arg).to_owned()).collect(),
956            stderr: String::from_utf8_lossy(&output.stderr).into_owned(),
957        });
958    }
959
960    Ok(String::from_utf8_lossy(&output.stdout).into_owned())
961}
962
963fn agent_from_slug(value: &str) -> Result<Agent, ReviewerError> {
964    match value.trim().to_ascii_lowercase().as_str() {
965        "claude" => Ok(Agent::Claude),
966        "codex" => Ok(Agent::Codex),
967        "pi" => Ok(Agent::Pi),
968        _ => Err(ReviewerError::UnknownAgent {
969            value: value.to_owned(),
970        }),
971    }
972}
973
974fn harness_from_slug(value: &str) -> Result<ReviewerHarness, ReviewerError> {
975    match value.trim().to_ascii_lowercase().as_str() {
976        "claude" => Ok(ReviewerHarness::Claude),
977        "codex" => Ok(ReviewerHarness::Codex),
978        "pi" => Ok(ReviewerHarness::Pi),
979        "gemini" => Ok(ReviewerHarness::Gemini),
980        "opencode" => Ok(ReviewerHarness::Opencode),
981        "custom" => Ok(ReviewerHarness::Custom),
982        _ => Err(ReviewerError::UnknownHarness {
983            value: value.to_owned(),
984        }),
985    }
986}
987
988fn harness_slug(harness: ReviewerHarness) -> &'static str {
989    match harness {
990        ReviewerHarness::Claude => "claude",
991        ReviewerHarness::Codex => "codex",
992        ReviewerHarness::Pi => "pi",
993        ReviewerHarness::Gemini => "gemini",
994        ReviewerHarness::Opencode => "opencode",
995        ReviewerHarness::Custom => "custom",
996    }
997}
998
/// Canonical form of a model name for comparisons: surrounding whitespace
/// removed and ASCII letters lowercased (non-ASCII characters are untouched).
fn normalized_model(model: &str) -> String {
    let trimmed = model.trim();
    trimmed.to_ascii_lowercase()
}
1002
/// Current wall-clock time as whole seconds since the Unix epoch.
///
/// Falls back to `0` when the system clock reports a time before the epoch.
fn unix_now() -> u64 {
    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(elapsed) => elapsed.as_secs(),
        Err(_) => 0,
    }
}
1008
/// Unit and property tests for plan building, verdict parsing, the review
/// queue, single-job execution, queue draining, and the strict goal loop.
///
/// Reviewer subprocesses are never spawned: every test drives the code through
/// the in-memory `ProcessRunner` doubles defined at the bottom of the module.
#[cfg(test)]
mod tests {
    use std::{cell::RefCell, collections::VecDeque};

    use proptest::prelude::*;

    use super::{
        InvocationPlan, MaterialLoader, ParsedVerdict, ProcessOutput, ProcessRunner,
        PromptDelivery, ReviewJob, ReviewPlan, ReviewQueue, ReviewRequest, ReviewSelection,
        ReviewerError, StrictGoalCounters, StrictGoalDecision, StrictGoalPolicy,
        StrictGoalStopReason, StrictReviewConfig, drain_once, execute_review_job,
        run_strict_goal_loop,
    };
    use crate::{
        claim::{Claim, EvidenceRef},
        cli::{Agent, ReviewerHarness},
        ledger::{LedgerStore, Verdict},
    };

    #[test]
    fn same_harness_different_model_is_valid() {
        let request = ReviewRequest::new(
            Agent::Codex,
            "gpt-5.4",
            ReviewerHarness::Codex,
            "gpt-5.5",
            false,
            "review this",
        );

        let plan = ReviewPlan::build(request).unwrap();

        assert_eq!(plan.watched_agent, Agent::Codex);
        assert_eq!(plan.reviewer_harness, ReviewerHarness::Codex);
        assert_eq!(plan.invocation.program, "codex");
    }

    #[test]
    fn same_model_is_blocked_by_default() {
        // The watched model differs from the reviewer model only by case and
        // surrounding whitespace, so it must still count as the same model.
        let request = ReviewRequest::new(
            Agent::Codex,
            " GPT-5.5 ",
            ReviewerHarness::Claude,
            "gpt-5.5",
            false,
            "review this",
        );

        let error = ReviewPlan::build(request).unwrap_err();

        assert!(matches!(
            error,
            ReviewerError::SameModelWithoutWaiver { .. }
        ));
    }

    #[test]
    fn allow_same_model_override_is_deliberate() {
        let request = ReviewRequest::new(
            Agent::Codex,
            "gpt-5.5",
            ReviewerHarness::Codex,
            "gpt-5.5",
            true,
            "review this",
        );

        let plan = ReviewPlan::build(request).unwrap();

        assert!(plan.allow_same_model);
        assert_eq!(plan.reviewer_model, "gpt-5.5");
    }

    #[test]
    fn provider_mapping_uses_verified_prompt_shapes() {
        let codex = InvocationPlan::for_harness(ReviewerHarness::Codex, "gpt-5.5").unwrap();
        assert_eq!(codex.program, "codex");
        assert_eq!(
            codex.args_for_prompt("prompt"),
            ["exec", "-m", "gpt-5.5", "prompt"]
        );

        let claude = InvocationPlan::for_harness(ReviewerHarness::Claude, "opus").unwrap();
        assert_eq!(claude.program, "claude");
        assert_eq!(claude.prompt_delivery, PromptDelivery::Stdin);
        assert_eq!(
            claude.args_for_prompt("prompt"),
            ["--print", "--model", "opus"]
        );

        let gemini = InvocationPlan::for_harness(ReviewerHarness::Gemini, "gemini-pro").unwrap();
        assert_eq!(
            gemini.args_for_prompt("prompt"),
            ["-m", "gemini-pro", "-p", "prompt"]
        );

        let pi = InvocationPlan::for_harness(ReviewerHarness::Pi, "openai/gpt-5.5").unwrap();
        assert_eq!(pi.prompt_delivery, PromptDelivery::Stdin);
        assert_eq!(
            pi.args_for_prompt("prompt"),
            ["--model", "openai/gpt-5.5", "-p"]
        );
    }

    #[test]
    fn custom_harness_requires_explicit_configuration() {
        let error = InvocationPlan::for_harness(ReviewerHarness::Custom, "model").unwrap_err();

        assert!(matches!(error, ReviewerError::UnsupportedCustomHarness));
    }

    #[test]
    fn subprocess_runner_is_mockable() {
        struct MockRunner;

        impl ProcessRunner for MockRunner {
            fn run(
                &self,
                invocation: &InvocationPlan,
                prompt: &str,
            ) -> Result<ProcessOutput, ReviewerError> {
                assert_eq!(invocation.program, "codex");
                assert_eq!(
                    invocation.args_for_prompt(prompt).last().unwrap(),
                    "review this"
                );
                Ok(ProcessOutput {
                    status_code: Some(0),
                    stdout: "VERDICT: PASS\nFINDINGS:\n".to_owned(),
                    stderr: String::new(),
                })
            }
        }

        let request = ReviewRequest::new(
            Agent::Codex,
            "gpt-5.4",
            ReviewerHarness::Codex,
            "gpt-5.5",
            false,
            "review this",
        );
        let plan = ReviewPlan::build(request).unwrap();
        let output = plan.run_with("review this", &MockRunner).unwrap();

        assert!(output.stdout.contains("PASS"));
    }

    #[test]
    fn verdict_parser_extracts_rejection_findings() {
        let verdict =
            ParsedVerdict::parse("VERDICT: REJECT\nFINDINGS:\n- missing proof\n").unwrap();

        assert_eq!(verdict.verdict, Verdict::Reject);
        assert_eq!(verdict.findings, ["missing proof"]);
    }

    #[test]
    fn review_queue_schedules_commits_without_running_models() {
        let temp = tempfile::tempdir().unwrap();
        let queue = ReviewQueue::new(temp.path());

        queue.enqueue("abc123").unwrap();

        let pending = queue.pending().unwrap();
        assert_eq!(pending.len(), 1);
        assert_eq!(pending[0].commit_sha, "abc123");
    }

    #[test]
    fn execute_review_records_reject_verdict() {
        let temp = tempfile::tempdir().unwrap();
        let store = LedgerStore::new(temp.path());
        let job = review_job(false);
        let runner = SequenceRunner::new(["VERDICT: REJECT\nFINDINGS:\n- unsupported\n"]);

        let execution = execute_review_job(job, &runner, &store).unwrap();

        assert_eq!(execution.entries.len(), 1);
        assert_eq!(execution.entries[0].verdict, Verdict::Reject);
        assert_eq!(store.unresolved_rejections().unwrap().len(), 1);
    }

    #[test]
    fn strict_two_pass_records_both_clean_passes() {
        let temp = tempfile::tempdir().unwrap();
        let store = LedgerStore::new(temp.path());
        let job = review_job(true);
        let runner =
            SequenceRunner::new(["VERDICT: PASS\nFINDINGS:\n", "VERDICT: PASS\nFINDINGS:\n"]);

        let execution = execute_review_job(job, &runner, &store).unwrap();

        assert_eq!(execution.entries.len(), 2);
        assert_eq!(store.read_history().unwrap().len(), 2);
        assert_eq!(execution.entries[0].reviewer.model, "gpt-5.5");
        assert_eq!(execution.entries[1].reviewer.model, "claude-opus-4-1");
    }

    #[test]
    fn strict_arbiter_model_must_be_third_model() {
        let temp = tempfile::tempdir().unwrap();
        let store = LedgerStore::new(temp.path());
        let mut job = review_job(true);
        // Reuse the reviewer model as the arbiter to trigger the distinctness check.
        job.strict.as_mut().unwrap().arbiter_model = "gpt-5.5".to_owned();
        let runner = SequenceRunner::new(["VERDICT: PASS\nFINDINGS:\n"]);

        let error = execute_review_job(job, &runner, &store).unwrap_err();

        assert!(matches!(
            error,
            ReviewerError::StrictArbiterModelNotDistinct
        ));
    }

    #[test]
    fn strict_goal_policy_stops_at_configured_lie_or_fuckup_count() {
        let policy = StrictGoalPolicy {
            stop_after_lies: 2,
            stop_after_fuckups: 3,
        };

        assert_eq!(
            policy.decide(StrictGoalCounters {
                lies_exposed: 1,
                fuckups_registered: 2
            }),
            StrictGoalDecision::Continue
        );
        assert_eq!(
            policy.decide(StrictGoalCounters {
                lies_exposed: 2,
                fuckups_registered: 0
            }),
            StrictGoalDecision::Stop {
                reason: StrictGoalStopReason::LiesExposed
            }
        );
        assert_eq!(
            policy.decide(StrictGoalCounters {
                lies_exposed: 0,
                fuckups_registered: 3
            }),
            StrictGoalDecision::Stop {
                reason: StrictGoalStopReason::FuckupsRegistered
            }
        );
    }

    #[test]
    fn drain_once_reviews_each_commit_once_and_clears_queue() {
        let temp = tempfile::tempdir().unwrap();
        let store = LedgerStore::new(temp.path());
        let queue = ReviewQueue::new(temp.path());
        queue.enqueue("abc123").unwrap();
        queue.enqueue("abc123").unwrap(); // duplicate SHA reviewed only once
        queue.enqueue("def456").unwrap();

        let loader = StaticLoader::new();
        let runner = SequenceRunner::new([
            "VERDICT: REJECT\nFINDINGS:\n- unsupported\n",
            "VERDICT: PASS\nFINDINGS:\n",
        ]);
        let selection = selection();

        let report = drain_once(&queue, &loader, &selection, &runner, &store).unwrap();

        assert_eq!(report.reviewed, ["abc123", "def456"]);
        assert_eq!(report.ledger_entries, 2);
        assert!(queue.pending().unwrap().is_empty());
        assert_eq!(store.read_history().unwrap().len(), 2);
        assert_eq!(store.unresolved_rejections().unwrap().len(), 1);
    }

    #[test]
    fn drain_once_is_a_noop_on_empty_queue() {
        let temp = tempfile::tempdir().unwrap();
        let store = LedgerStore::new(temp.path());
        let queue = ReviewQueue::new(temp.path());
        let loader = StaticLoader::new();
        let runner = ConstRunner::new("VERDICT: PASS\nFINDINGS:\n");

        let report = drain_once(&queue, &loader, &selection(), &runner, &store).unwrap();

        assert!(report.reviewed.is_empty());
        assert_eq!(report.ledger_entries, 0);
        assert_eq!(store.read_history().unwrap().len(), 0);
    }

    #[test]
    fn strict_goal_loop_stops_at_configured_lie_count() {
        let temp = tempfile::tempdir().unwrap();
        let store = LedgerStore::new(temp.path());
        let policy = StrictGoalPolicy {
            stop_after_lies: 1,
            stop_after_fuckups: 0,
        };
        let runner = SequenceRunner::new(["VERDICT: REJECT\nFINDINGS:\n- lie\n"]);

        let outcome = run_strict_goal_loop(
            "abc123",
            &claim(),
            "diff",
            &selection(),
            policy,
            5,
            &runner,
            &store,
        )
        .unwrap();

        assert_eq!(outcome.passes, 1);
        assert_eq!(outcome.counters.lies_exposed, 1);
        assert_eq!(outcome.stop_reason, Some(StrictGoalStopReason::LiesExposed));
        assert_eq!(store.read_history().unwrap().len(), 1);
    }

    #[test]
    fn strict_goal_loop_terminates_at_max_passes_for_honest_agent() {
        let temp = tempfile::tempdir().unwrap();
        let store = LedgerStore::new(temp.path());
        let policy = StrictGoalPolicy {
            stop_after_lies: 2,
            stop_after_fuckups: 5,
        };
        let runner = ConstRunner::new("VERDICT: PASS\nFINDINGS:\n");

        let outcome = run_strict_goal_loop(
            "abc123",
            &claim(),
            "diff",
            &selection(),
            policy,
            3,
            &runner,
            &store,
        )
        .unwrap();

        assert_eq!(outcome.passes, 3);
        assert_eq!(outcome.counters.lies_exposed, 0);
        assert_eq!(outcome.stop_reason, None);
        assert_eq!(store.read_history().unwrap().len(), 3);
    }

    #[test]
    fn strict_goal_loop_stops_when_fuckups_accumulate() {
        let temp = tempfile::tempdir().unwrap();
        let store = LedgerStore::new(temp.path());
        let policy = StrictGoalPolicy {
            stop_after_lies: 0,
            stop_after_fuckups: 2,
        };
        // Each PASS-with-findings pass registers one fuckup; two passes hit N=2.
        let runner = ConstRunner::new("VERDICT: PASS\nFINDINGS:\n- nit\n");

        let outcome = run_strict_goal_loop(
            "abc123",
            &claim(),
            "diff",
            &selection(),
            policy,
            10,
            &runner,
            &store,
        )
        .unwrap();

        assert_eq!(outcome.passes, 2);
        assert_eq!(outcome.counters.lies_exposed, 0);
        assert_eq!(outcome.counters.fuckups_registered, 2);
        assert_eq!(
            outcome.stop_reason,
            Some(StrictGoalStopReason::FuckupsRegistered)
        );
    }

    proptest! {
        #[test]
        fn strict_goal_loop_never_exceeds_max_passes(max in 1u32..6) {
            let temp = tempfile::tempdir().unwrap();
            let store = LedgerStore::new(temp.path());
            // Both thresholds disabled: only the ceiling can stop the loop.
            let policy = StrictGoalPolicy { stop_after_lies: 0, stop_after_fuckups: 0 };
            let runner = ConstRunner::new("VERDICT: PASS\nFINDINGS:\n");

            let outcome = run_strict_goal_loop(
                "abc123", &claim(), "diff", &selection(), policy, max, &runner, &store,
            )
            .unwrap();

            prop_assert!(outcome.passes <= max);
            prop_assert_eq!(outcome.passes, max);
            prop_assert!(outcome.stop_reason.is_none());
        }
    }

    proptest! {
        #[test]
        fn model_opposition_is_enforced_for_arbitrary_models(
            watched in "[A-Za-z0-9._/-]{1,32}",
            reviewer in "[A-Za-z0-9._/-]{1,32}",
        ) {
            let request = ReviewRequest::new(
                Agent::Codex,
                watched.clone(),
                ReviewerHarness::Codex,
                reviewer.clone(),
                false,
                "review this",
            );
            let result = ReviewPlan::build(request);

            if watched.trim().eq_ignore_ascii_case(reviewer.trim()) {
                let blocked = matches!(result, Err(ReviewerError::SameModelWithoutWaiver { .. }));
                prop_assert!(blocked);
            } else {
                prop_assert!(result.is_ok());
            }
        }
    }

    /// Claim fixture shared by the tests.
    fn claim() -> Claim {
        Claim::new(
            "add review",
            "cargo test",
            vec![EvidenceRef::parse("tests:cargo-test").unwrap()],
        )
        .unwrap()
    }

    /// Non-strict selection fixture: Codex watched on `gpt-5.4`, reviewed by
    /// the Codex harness on `gpt-5.5`.
    fn selection() -> ReviewSelection {
        ReviewSelection {
            watched_agent: Agent::Codex,
            watched_model: "gpt-5.4".to_owned(),
            reviewer_harness: ReviewerHarness::Codex,
            reviewer_model: "gpt-5.5".to_owned(),
            allow_same_model: false,
            strict: None,
        }
    }

    /// `MaterialLoader` double that returns the same claim and diff for every SHA.
    struct StaticLoader {
        claim: Claim,
        diff: String,
    }

    impl StaticLoader {
        fn new() -> Self {
            Self {
                claim: claim(),
                diff: "diff --git a/src/lib.rs b/src/lib.rs".to_owned(),
            }
        }
    }

    impl MaterialLoader for StaticLoader {
        fn load(&self, _sha: &str) -> Result<(Claim, String), ReviewerError> {
            Ok((self.claim.clone(), self.diff.clone()))
        }
    }

    /// `ProcessRunner` double that succeeds with the same stdout on every call.
    struct ConstRunner {
        output: String,
    }

    impl ConstRunner {
        fn new(output: &str) -> Self {
            Self {
                output: output.to_owned(),
            }
        }
    }

    impl ProcessRunner for ConstRunner {
        fn run(
            &self,
            _invocation: &InvocationPlan,
            _prompt: &str,
        ) -> Result<ProcessOutput, ReviewerError> {
            Ok(ProcessOutput {
                status_code: Some(0),
                stdout: self.output.clone(),
                stderr: String::new(),
            })
        }
    }

    /// Review job fixture for commit `abc123`; `strict` adds a Claude arbiter
    /// on `claude-opus-4-1`.
    fn review_job(strict: bool) -> ReviewJob {
        let claim = claim();
        ReviewJob {
            commit_sha: "abc123".to_owned(),
            diff: "diff --git a/src/lib.rs b/src/lib.rs".to_owned(),
            request: ReviewRequest::new(
                Agent::Codex,
                "gpt-5.4",
                ReviewerHarness::Codex,
                "gpt-5.5",
                false,
                "review this",
            ),
            claim,
            strict: strict.then_some(StrictReviewConfig {
                arbiter_harness: ReviewerHarness::Claude,
                arbiter_model: "claude-opus-4-1".to_owned(),
            }),
        }
    }

    /// `ProcessRunner` double that replays scripted stdout values in order,
    /// one per call.
    struct SequenceRunner {
        outputs: RefCell<VecDeque<String>>,
    }

    impl SequenceRunner {
        fn new<const N: usize>(outputs: [&str; N]) -> Self {
            Self {
                outputs: RefCell::new(outputs.into_iter().map(str::to_owned).collect()),
            }
        }
    }

    impl ProcessRunner for SequenceRunner {
        fn run(
            &self,
            _invocation: &InvocationPlan,
            _prompt: &str,
        ) -> Result<ProcessOutput, ReviewerError> {
            // An exhausted script means the code under test ran more reviewer
            // passes than the test provisioned; say so instead of a bare unwrap.
            let stdout = self
                .outputs
                .borrow_mut()
                .pop_front()
                .expect("SequenceRunner was invoked more times than outputs were scripted");
            Ok(ProcessOutput {
                status_code: Some(0),
                stdout,
                stderr: String::new(),
            })
        }
    }
}