Skip to main content

mdx_rust_core/
safety_pipeline.rs

//! Candidate safety pipeline.
//!
//! This module owns the acceptance-critical path:
//! hook checks, isolated validation, patched scoring, final landing,
//! final validation, and rollback.
6
7use crate::hooks::{evaluate_builtin_hook, HookContext, HookDecision, HookPolicy, HookStage};
8use crate::registry::{AgentContract, RegisteredAgent};
9use crate::runner::AgentRunResult;
10use mdx_rust_analysis::editing::{ProposedEdit, ValidationCommandRecord};
11use schemars::JsonSchema;
12use serde::{Deserialize, Serialize};
13use std::path::Path;
14use std::time::{Duration, Instant};
15
16#[derive(Debug, Clone, Copy)]
17pub struct CandidateExecutionConfig<'a> {
18    pub hook_policy: &'a HookPolicy,
19    pub review_before_apply: bool,
20    pub quiet: bool,
21    pub candidate_timeout: Duration,
22}
23
24#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, thiserror::Error, PartialEq, Eq)]
25pub enum SafetyRejectionKind {
26    #[error("edit scope rejected")]
27    EditScope,
28    #[error("hook denied candidate")]
29    HookDenied,
30    #[error("validation failed")]
31    ValidationFailed,
32    #[error("candidate was not net positive")]
33    NetNegative,
34    #[error("review mode prevented landing")]
35    ReviewOnly,
36    #[error("snapshot failed")]
37    SnapshotFailed,
38    #[error("landing failed")]
39    LandingFailed,
40    #[error("final validation failed")]
41    FinalValidationFailed,
42    #[error("candidate timed out")]
43    Timeout,
44}
45
46#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
47pub struct SafetyRejection {
48    pub kind: SafetyRejectionKind,
49    pub message: String,
50}
51
52#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
53pub struct CandidateExecutionOutcome {
54    pub validated: u32,
55    pub landed: u32,
56    pub accepted: u32,
57    pub accepted_diff: Option<String>,
58    pub patched_score: Option<f32>,
59    pub holdout_score: Option<f32>,
60    pub delta: Option<f32>,
61    pub note: String,
62    pub hook_decisions: Vec<HookDecision>,
63    pub validation_commands: Vec<ValidationCommandRecord>,
64    pub final_validation_commands: Vec<ValidationCommandRecord>,
65    pub rollback_succeeded: Option<bool>,
66    pub rollback_error: Option<String>,
67    pub timed_out: bool,
68    #[serde(default)]
69    pub rejection: Option<SafetyRejection>,
70}
71
72impl CandidateExecutionOutcome {
73    fn empty(note: impl Into<String>, hook_decisions: Vec<HookDecision>) -> Self {
74        Self {
75            validated: 0,
76            landed: 0,
77            accepted: 0,
78            accepted_diff: None,
79            patched_score: None,
80            holdout_score: None,
81            delta: None,
82            note: note.into(),
83            hook_decisions,
84            validation_commands: Vec::new(),
85            final_validation_commands: Vec::new(),
86            rollback_succeeded: None,
87            rollback_error: None,
88            timed_out: false,
89            rejection: None,
90        }
91    }
92
93    fn rejected(
94        kind: SafetyRejectionKind,
95        message: impl Into<String>,
96        hook_decisions: Vec<HookDecision>,
97    ) -> Self {
98        let message = message.into();
99        Self {
100            rejection: Some(SafetyRejection {
101                kind,
102                message: message.clone(),
103            }),
104            ..Self::empty(message, hook_decisions)
105        }
106    }
107}
108
109struct ScopedEdit<'a> {
110    edit: &'a ProposedEdit,
111}
112
113struct IsolatedValidatedEdit<'a> {
114    scoped: ScopedEdit<'a>,
115    validation_commands: Vec<ValidationCommandRecord>,
116}
117
118struct NetPositiveEdit<'a> {
119    validated: IsolatedValidatedEdit<'a>,
120    patched_score: f32,
121    delta: f32,
122}
123
124pub struct CandidateExecutionContext<'a> {
125    pub agent: &'a RegisteredAgent,
126    pub config: CandidateExecutionConfig<'a>,
127    pub iteration: u32,
128    pub candidate_index: usize,
129    pub edit: &'a ProposedEdit,
130    pub test_inputs: &'a [serde_json::Value],
131    pub holdout_inputs: &'a [serde_json::Value],
132    pub baseline_score: f32,
133    pub scorer: fn(&AgentRunResult) -> f32,
134}
135
136pub async fn execute_candidate_edit(
137    context: CandidateExecutionContext<'_>,
138) -> CandidateExecutionOutcome {
139    let timeout = context.config.candidate_timeout;
140    match tokio::time::timeout(timeout, execute_candidate_edit_inner(context)).await {
141        Ok(outcome) => outcome,
142        Err(_) => CandidateExecutionOutcome {
143            timed_out: true,
144            ..CandidateExecutionOutcome::rejected(
145                SafetyRejectionKind::Timeout,
146                format!(" (candidate timed out after {}s)", timeout.as_secs()),
147                Vec::new(),
148            )
149        },
150    }
151}
152
153async fn execute_candidate_edit_inner(
154    context: CandidateExecutionContext<'_>,
155) -> CandidateExecutionOutcome {
156    let agent = context.agent;
157    let edit = context.edit;
158    let mut hook_decisions = Vec::new();
159    let deadline_start = Instant::now();
160
161    if let Err(err) = ensure_single_file_patch_scope(&agent.path, edit) {
162        return CandidateExecutionOutcome::rejected(
163            SafetyRejectionKind::EditScope,
164            format!(" (edit scope rejected: {err})"),
165            hook_decisions,
166        );
167    }
168    let scoped_edit = ScopedEdit { edit };
169
170    if deadline_start.elapsed() >= context.config.candidate_timeout {
171        return timed_out_outcome(context.config.candidate_timeout, hook_decisions);
172    }
173
174    let pre_edit = evaluate_builtin_hook(
175        context.config.hook_policy,
176        &HookContext {
177            stage: HookStage::PreEdit,
178            agent_name: agent.name.clone(),
179            edit_description: Some(edit.description.clone()),
180            patch_bytes: edit.patch.len(),
181            command: None,
182            validation_passed: None,
183            score_delta: None,
184        },
185    );
186    let denied = pre_edit.denied();
187    hook_decisions.push(pre_edit);
188    if denied {
189        return CandidateExecutionOutcome::rejected(
190            SafetyRejectionKind::HookDenied,
191            " (pre-edit hook denied candidate)",
192            hook_decisions,
193        );
194    }
195
196    let pre_command = evaluate_builtin_hook(
197        context.config.hook_policy,
198        &HookContext {
199            stage: HookStage::PreCommand,
200            agent_name: agent.name.clone(),
201            edit_description: Some(edit.description.clone()),
202            patch_bytes: edit.patch.len(),
203            command: Some("cargo check && cargo clippy -- -D warnings".to_string()),
204            validation_passed: None,
205            score_delta: None,
206        },
207    );
208    let denied = pre_command.denied();
209    hook_decisions.push(pre_command);
210    if denied {
211        return CandidateExecutionOutcome::rejected(
212            SafetyRejectionKind::HookDenied,
213            " (pre-command hook denied validation)",
214            hook_decisions,
215        );
216    }
217
218    let wt_name = format!("opt-{}-{}", context.iteration, context.candidate_index);
219    let Some(validation_budget) =
220        remaining_budget(deadline_start, context.config.candidate_timeout)
221    else {
222        return timed_out_outcome(context.config.candidate_timeout, hook_decisions);
223    };
224    let validation_result = mdx_rust_analysis::editing::apply_and_validate_with_budget(
225        &agent.path,
226        edit,
227        &wt_name,
228        validation_budget,
229    );
230
231    let Ok(validation) = validation_result else {
232        if !context.config.quiet {
233            println!("     [Safe Apply] Validation in isolated workspace failed to run.");
234        }
235        return CandidateExecutionOutcome::rejected(
236            SafetyRejectionKind::ValidationFailed,
237            " (validation failed to run)",
238            hook_decisions,
239        );
240    };
241    if !validation.passed {
242        let validation_commands = validation.command_records;
243        let validation_timed_out = validation_commands.iter().any(|record| record.timed_out);
244        let decision = evaluate_builtin_hook(
245            context.config.hook_policy,
246            &HookContext {
247                stage: HookStage::PostValidation,
248                agent_name: agent.name.clone(),
249                edit_description: Some(edit.description.clone()),
250                patch_bytes: edit.patch.len(),
251                command: None,
252                validation_passed: Some(false),
253                score_delta: None,
254            },
255        );
256        hook_decisions.push(decision);
257        if !context.config.quiet {
258            println!("     [Safe Apply] Validation in isolated workspace failed.");
259        }
260        return CandidateExecutionOutcome {
261            validation_commands,
262            timed_out: validation_timed_out,
263            ..CandidateExecutionOutcome::rejected(
264                SafetyRejectionKind::ValidationFailed,
265                format!(
266                    " (validation rejected candidate: {})",
267                    validation
268                        .cargo_check_output
269                        .lines()
270                        .last()
271                        .unwrap_or("no output")
272                ),
273                hook_decisions,
274            )
275        };
276    }
277    let validation_commands = validation.command_records;
278    let validated_edit = IsolatedValidatedEdit {
279        scoped: scoped_edit,
280        validation_commands,
281    };
282    if deadline_start.elapsed() >= context.config.candidate_timeout {
283        let validation_commands = validated_edit.validation_commands;
284        return CandidateExecutionOutcome {
285            validated: 1,
286            validation_commands,
287            ..timed_out_outcome(context.config.candidate_timeout, hook_decisions)
288        };
289    }
290
291    let post_validation = evaluate_builtin_hook(
292        context.config.hook_policy,
293        &HookContext {
294            stage: HookStage::PostValidation,
295            agent_name: agent.name.clone(),
296            edit_description: Some(edit.description.clone()),
297            patch_bytes: edit.patch.len(),
298            command: None,
299            validation_passed: Some(true),
300            score_delta: None,
301        },
302    );
303    let denied = post_validation.denied();
304    hook_decisions.push(post_validation);
305    if denied {
306        let validation_commands = validated_edit.validation_commands;
307        return CandidateExecutionOutcome {
308            validated: 1,
309            validation_commands,
310            ..CandidateExecutionOutcome::rejected(
311                SafetyRejectionKind::HookDenied,
312                " (post-validation hook denied candidate)",
313                hook_decisions,
314            )
315        };
316    }
317
318    if !context.config.quiet {
319        println!(
320            "     [Safe Apply] Edit validated in isolated workspace (cargo check + clippy OK)."
321        );
322    }
323
324    let patched_score = {
325        let score_name = format!("score-{}-{}", context.iteration, context.candidate_index);
326        match mdx_rust_analysis::editing::create_isolated_workspace(&agent.path, &score_name) {
327            Ok(isolated) => {
328                let score = if mdx_rust_analysis::editing::apply_edit(&agent.path, &isolated, edit)
329                    .is_ok()
330                {
331                    evaluate_workspace(&isolated, context.test_inputs, context.scorer)
332                        .await
333                        .unwrap_or(context.baseline_score)
334                } else {
335                    context.baseline_score
336                };
337                mdx_rust_analysis::editing::cleanup_isolated_workspace(&agent.path, &isolated);
338                score
339            }
340            Err(_) => context.baseline_score,
341        }
342    };
343    if deadline_start.elapsed() >= context.config.candidate_timeout {
344        let validation_commands = validated_edit.validation_commands;
345        return CandidateExecutionOutcome {
346            validated: 1,
347            patched_score: Some(patched_score),
348            delta: Some(patched_score - context.baseline_score),
349            validation_commands,
350            ..timed_out_outcome(context.config.candidate_timeout, hook_decisions)
351        };
352    }
353
354    let delta = patched_score - context.baseline_score;
355    let pre_accept = evaluate_builtin_hook(
356        context.config.hook_policy,
357        &HookContext {
358            stage: HookStage::PreAccept,
359            agent_name: agent.name.clone(),
360            edit_description: Some(edit.description.clone()),
361            patch_bytes: edit.patch.len(),
362            command: None,
363            validation_passed: Some(true),
364            score_delta: Some(delta),
365        },
366    );
367    let denied = pre_accept.denied();
368    hook_decisions.push(pre_accept);
369    if denied {
370        let validation_commands = validated_edit.validation_commands;
371        return CandidateExecutionOutcome {
372            validated: 1,
373            patched_score: Some(patched_score),
374            delta: Some(delta),
375            validation_commands,
376            ..CandidateExecutionOutcome::rejected(
377                SafetyRejectionKind::HookDenied,
378                format!(" (pre-accept hook denied delta {delta:.2})"),
379                hook_decisions,
380            )
381        };
382    }
383
384    if delta <= 0.0 {
385        let validation_commands = validated_edit.validation_commands;
386        if !context.config.quiet {
387            println!(
388                "     [Net-Negative] Patched score {:.2} vs baseline {:.2} (delta {:.2}) - change rejected.",
389                patched_score, context.baseline_score, delta
390            );
391        }
392        return CandidateExecutionOutcome {
393            validated: 1,
394            patched_score: Some(patched_score),
395            delta: Some(delta),
396            validation_commands,
397            ..CandidateExecutionOutcome::rejected(
398                SafetyRejectionKind::NetNegative,
399                format!(
400                    " (net-negative {:.2}->{:.2})",
401                    context.baseline_score, patched_score
402                ),
403                hook_decisions,
404            )
405        };
406    }
407    let net_positive_edit = NetPositiveEdit {
408        validated: validated_edit,
409        patched_score,
410        delta,
411    };
412
413    if context.config.review_before_apply {
414        let validation_commands = net_positive_edit.validated.validation_commands;
415        if !context.config.quiet {
416            println!("     [Review] Change validated in isolation but not applied (--review).");
417        }
418        return CandidateExecutionOutcome {
419            validated: 1,
420            patched_score: Some(patched_score),
421            delta: Some(delta),
422            validation_commands,
423            ..CandidateExecutionOutcome::rejected(
424                SafetyRejectionKind::ReviewOnly,
425                " (review mode: validated in isolation, not applied)",
426                hook_decisions,
427            )
428        };
429    }
430
431    let edit = net_positive_edit.validated.scoped.edit;
432    let validation_commands = net_positive_edit.validated.validation_commands;
433    let patched_score = net_positive_edit.patched_score;
434    let delta = net_positive_edit.delta;
435
436    let snapshot = match mdx_rust_analysis::editing::snapshot_file(&edit.file) {
437        Ok(snapshot) => snapshot,
438        Err(err) => {
439            return CandidateExecutionOutcome {
440                validated: 1,
441                patched_score: Some(patched_score),
442                delta: Some(delta),
443                validation_commands,
444                ..CandidateExecutionOutcome::rejected(
445                    SafetyRejectionKind::SnapshotFailed,
446                    format!(" (snapshot failed: {err})"),
447                    hook_decisions,
448                )
449            };
450        }
451    };
452
453    if let Err(err) = mdx_rust_analysis::editing::apply_edit_to_agent(&agent.path, edit) {
454        if !context.config.quiet {
455            println!(
456                "     [Land Failed] Could not apply validated patch to real source: {}",
457                err
458            );
459        }
460        return CandidateExecutionOutcome {
461            validated: 1,
462            patched_score: Some(patched_score),
463            delta: Some(delta),
464            validation_commands,
465            ..CandidateExecutionOutcome::rejected(
466                SafetyRejectionKind::LandingFailed,
467                " (landing failed)",
468                hook_decisions,
469            )
470        };
471    }
472
473    let final_budget = remaining_budget(deadline_start, context.config.candidate_timeout)
474        .unwrap_or_else(|| Duration::from_secs(0));
475    let final_report =
476        mdx_rust_analysis::editing::validate_build_detailed_with_budget(&agent.path, final_budget);
477    let final_ok = final_report.passed;
478    let final_validation_commands = final_report.command_records;
479    let final_validation_timed_out = final_validation_commands
480        .iter()
481        .any(|record| record.timed_out);
482    if deadline_start.elapsed() >= context.config.candidate_timeout || final_validation_timed_out {
483        let rollback_result = mdx_rust_analysis::editing::restore_file(&snapshot);
484        let rollback_error = rollback_result.as_ref().err().map(ToString::to_string);
485        let rollback_succeeded = rollback_result.is_ok();
486        return CandidateExecutionOutcome {
487            validated: 1,
488            landed: 0,
489            accepted: 0,
490            accepted_diff: None,
491            patched_score: Some(patched_score),
492            holdout_score: None,
493            delta: Some(delta),
494            note: format!(
495                " (candidate timed out after {}s and was rolled back)",
496                context.config.candidate_timeout.as_secs()
497            ),
498            hook_decisions,
499            validation_commands,
500            final_validation_commands,
501            rollback_succeeded: Some(rollback_succeeded),
502            rollback_error,
503            timed_out: true,
504            rejection: Some(SafetyRejection {
505                kind: SafetyRejectionKind::Timeout,
506                message: format!(
507                    "candidate timed out after {}s and was rolled back",
508                    context.config.candidate_timeout.as_secs()
509                ),
510            }),
511        };
512    }
513
514    if final_ok {
515        let holdout_score = if context.holdout_inputs.is_empty() {
516            None
517        } else {
518            evaluate_workspace(&agent.path, context.holdout_inputs, context.scorer)
519                .await
520                .ok()
521        };
522
523        if !context.config.quiet {
524            println!(
525                "     [Accepted] Landed + final validation OK (score {:.2} -> {:.2}, delta {:.2}).",
526                context.baseline_score, patched_score, delta
527            );
528        }
529
530        CandidateExecutionOutcome {
531            validated: 1,
532            landed: 1,
533            accepted: 1,
534            accepted_diff: Some(edit.patch.clone()),
535            patched_score: Some(patched_score),
536            holdout_score,
537            delta: Some(delta),
538            note: format!(" (accepted +{delta:.2})"),
539            hook_decisions,
540            validation_commands,
541            final_validation_commands,
542            rollback_succeeded: None,
543            rollback_error: None,
544            timed_out: false,
545            rejection: None,
546        }
547    } else {
548        let rollback_result = mdx_rust_analysis::editing::restore_file(&snapshot);
549        let rollback_error = rollback_result.as_ref().err().map(ToString::to_string);
550        let rollback_succeeded = rollback_result.is_ok();
551        let _ = mdx_rust_analysis::editing::validate_build(&agent.path);
552        if !context.config.quiet {
553            println!(
554                "     [Final Validation Failed] Change rolled back after re-validation failed."
555            );
556        }
557        CandidateExecutionOutcome {
558            validated: 1,
559            landed: 0,
560            accepted: 0,
561            accepted_diff: None,
562            patched_score: Some(patched_score),
563            holdout_score: None,
564            delta: Some(delta),
565            note: " (final validation failed and rolled back)".to_string(),
566            hook_decisions,
567            validation_commands,
568            final_validation_commands,
569            rollback_succeeded: Some(rollback_succeeded),
570            rollback_error,
571            timed_out: false,
572            rejection: Some(SafetyRejection {
573                kind: SafetyRejectionKind::FinalValidationFailed,
574                message: "final validation failed and rolled back".to_string(),
575            }),
576        }
577    }
578}
579
580fn timed_out_outcome(
581    timeout: Duration,
582    hook_decisions: Vec<HookDecision>,
583) -> CandidateExecutionOutcome {
584    CandidateExecutionOutcome {
585        timed_out: true,
586        ..CandidateExecutionOutcome::rejected(
587            SafetyRejectionKind::Timeout,
588            format!(" (candidate timed out after {}s)", timeout.as_secs()),
589            hook_decisions,
590        )
591    }
592}
593
/// Returns how much of `total` is left since `start`.
///
/// Yields `None` when the budget is exhausted, i.e. when the elapsed time is
/// greater than or equal to `total`.
fn remaining_budget(start: Instant, total: Duration) -> Option<Duration> {
    let remaining = total.checked_sub(start.elapsed())?;
    (!remaining.is_zero()).then_some(remaining)
}
599
600fn ensure_single_file_patch_scope(agent_root: &Path, edit: &ProposedEdit) -> anyhow::Result<()> {
601    let expected = if edit.file.is_absolute() {
602        edit.file.strip_prefix(agent_root).map_err(|_| {
603            anyhow::anyhow!("edit file is outside agent root: {}", edit.file.display())
604        })?
605    } else {
606        edit.file.as_path()
607    };
608
609    for line in edit.patch.lines() {
610        for path in diff_paths_from_line(line) {
611            if path == "/dev/null" {
612                continue;
613            }
614
615            if Path::new(&path) != expected {
616                anyhow::bail!(
617                    "patch touches {}, but ProposedEdit.file is {}",
618                    path,
619                    expected.display()
620                );
621            }
622        }
623    }
624
625    Ok(())
626}
627
628fn diff_paths_from_line(line: &str) -> Vec<String> {
629    if let Some(path) = line
630        .strip_prefix("+++ ")
631        .or_else(|| line.strip_prefix("--- "))
632    {
633        return normalize_diff_path(path).into_iter().collect();
634    }
635
636    if let Some(rest) = line.strip_prefix("diff --git ") {
637        return rest
638            .split_whitespace()
639            .filter_map(normalize_diff_path)
640            .collect();
641    }
642
643    for prefix in ["rename from ", "rename to ", "copy from ", "copy to "] {
644        if let Some(path) = line.strip_prefix(prefix) {
645            return normalize_diff_path(path).into_iter().collect();
646        }
647    }
648
649    if let Some(rest) = line.strip_prefix("Binary files ") {
650        if let Some((left, right_with_suffix)) = rest.split_once(" and ") {
651            let right = right_with_suffix
652                .strip_suffix(" differ")
653                .unwrap_or(right_with_suffix);
654            return [left, right]
655                .into_iter()
656                .filter_map(normalize_diff_path)
657                .collect();
658        }
659    }
660
661    Vec::new()
662}
663
/// Normalises a raw path taken from a patch line.
///
/// Surrounding whitespace and double quotes are trimmed. `/dev/null` is
/// returned unchanged; otherwise one leading `a/` or `b/` prefix is removed
/// if present. Always returns `Some`; the `Option` return type lets callers
/// use this directly with `filter_map`.
fn normalize_diff_path(raw: &str) -> Option<String> {
    let path = raw.trim().trim_matches('"');
    let normalized = if path == "/dev/null" {
        path
    } else {
        path.strip_prefix("a/")
            .or_else(|| path.strip_prefix("b/"))
            .unwrap_or(path)
    };
    Some(normalized.to_string())
}
675
676async fn evaluate_workspace(
677    dir: &std::path::Path,
678    inputs: &[serde_json::Value],
679    scorer: fn(&AgentRunResult) -> f32,
680) -> anyhow::Result<f32> {
681    let temp_agent = RegisteredAgent {
682        name: "isolated-eval".to_string(),
683        path: dir.to_path_buf(),
684        contract: AgentContract::Process,
685        registered_at: "".to_string(),
686    };
687
688    let mut scores = vec![];
689    for input in inputs {
690        let res = crate::runner::run_agent(&temp_agent, input.clone()).await?;
691        scores.push(scorer(&res));
692    }
693    if scores.is_empty() {
694        return Ok(0.0);
695    }
696    Ok(scores.iter().sum::<f32>() / scores.len() as f32)
697}
698
699#[cfg(test)]
700mod tests {
701    use super::*;
702    use crate::optimizer::mechanical_score;
703    use proptest::prelude::*;
704    use tempfile::tempdir;
705
706    fn temp_agent_source(answer_suffix: &str) -> String {
707        r#"use std::io::BufRead;
708
709fn main() {
710    let mut input = String::new();
711    std::io::stdin().lock().read_line(&mut input).unwrap();
712    println!("{{\"answer\":\"A stable useful answer __SUFFIX__\",\"confidence\":0.70,\"reasoning\":\"Think step by step.\"}}");
713}
714"#
715        .replace("__SUFFIX__", answer_suffix)
716    }
717
718    fn write_temp_agent(with_final_failure_marker: bool) -> (tempfile::TempDir, RegisteredAgent) {
719        let dir = tempdir().unwrap();
720        std::fs::create_dir_all(dir.path().join("src")).unwrap();
721        std::fs::write(
722            dir.path().join("Cargo.toml"),
723            "[package]\nname=\"safety-agent\"\nversion=\"0.1.0\"\nedition=\"2021\"\n",
724        )
725        .unwrap();
726        std::fs::write(dir.path().join("src/main.rs"), temp_agent_source("before")).unwrap();
727
728        if with_final_failure_marker {
729            std::fs::write(
730                dir.path().join("build.rs"),
731                r#"
732fn main() {
733    if std::path::Path::new(".mdx-rust/fail-final").exists() {
734        panic!("intentional final validation failure");
735    }
736}
737"#,
738            )
739            .unwrap();
740            std::fs::create_dir_all(dir.path().join(".mdx-rust")).unwrap();
741            std::fs::write(dir.path().join(".mdx-rust/fail-final"), "1").unwrap();
742        }
743
744        let agent = RegisteredAgent {
745            name: "safety-agent".to_string(),
746            path: dir.path().to_path_buf(),
747            contract: AgentContract::Process,
748            registered_at: "test".to_string(),
749        };
750
751        (dir, agent)
752    }
753
754    fn comment_patch() -> String {
755        "diff --git a/src/main.rs b/src/main.rs\n--- a/src/main.rs\n+++ b/src/main.rs\n@@ -1,5 +1,6 @@\n use std::io::BufRead;\n+// mdx safety invariant test\n \n fn main() {\n     let mut input = String::new();\n     std::io::stdin().lock().read_line(&mut input).unwrap();\n"
756            .to_string()
757    }
758
759    fn improved_patch() -> String {
760        "diff --git a/src/main.rs b/src/main.rs\n--- a/src/main.rs\n+++ b/src/main.rs\n@@ -2,6 +2,6 @@ use std::io::BufRead;\n \n fn main() {\n     let mut input = String::new();\n     std::io::stdin().lock().read_line(&mut input).unwrap();\n-    println!(\"{{\\\"answer\\\":\\\"A stable useful answer before\\\",\\\"confidence\\\":0.70,\\\"reasoning\\\":\\\"Think step by step.\\\"}}\");\n+    println!(\"{{\\\"answer\\\":\\\"A stable useful answer after with much more useful detail\\\",\\\"confidence\\\":0.70,\\\"reasoning\\\":\\\"Think step by step.\\\"}}\");\n }\n"
761            .to_string()
762    }
763
764    fn execution_config<'a>(policy: &'a HookPolicy) -> CandidateExecutionConfig<'a> {
765        CandidateExecutionConfig {
766            hook_policy: policy,
767            review_before_apply: false,
768            quiet: true,
769            candidate_timeout: Duration::from_secs(30),
770        }
771    }
772
773    #[tokio::test]
774    async fn deny_hook_cannot_accept_or_validate() {
775        let (_dir, agent) = write_temp_agent(false);
776        let policy = HookPolicy {
777            max_patch_bytes: 1,
778            require_positive_delta: true,
779        };
780        let edit = ProposedEdit {
781            file: agent.path.join("src/main.rs"),
782            description: "too large".to_string(),
783            patch: comment_patch(),
784        };
785
786        let outcome = execute_candidate_edit(CandidateExecutionContext {
787            agent: &agent,
788            config: execution_config(&policy),
789            iteration: 0,
790            candidate_index: 0,
791            edit: &edit,
792            test_inputs: &[serde_json::json!({"query":"hi"})],
793            holdout_inputs: &[],
794            baseline_score: 0.0,
795            scorer: mechanical_score,
796        })
797        .await;
798
799        assert_eq!(outcome.validated, 0);
800        assert_eq!(outcome.landed, 0);
801        assert_eq!(outcome.accepted, 0);
802        assert!(outcome
803            .hook_decisions
804            .iter()
805            .any(|decision| decision.denied()));
806    }
807
808    #[tokio::test]
809    async fn net_negative_candidate_is_rejected_before_landing() {
810        let (_dir, agent) = write_temp_agent(false);
811        let before = std::fs::read_to_string(agent.path.join("src/main.rs")).unwrap();
812        let policy = HookPolicy::default();
813        let edit = ProposedEdit {
814            file: agent.path.join("src/main.rs"),
815            description: "comment only".to_string(),
816            patch: comment_patch(),
817        };
818
819        let outcome = execute_candidate_edit(CandidateExecutionContext {
820            agent: &agent,
821            config: execution_config(&policy),
822            iteration: 0,
823            candidate_index: 0,
824            edit: &edit,
825            test_inputs: &[serde_json::json!({"query":"hi"})],
826            holdout_inputs: &[],
827            baseline_score: 0.95,
828            scorer: mechanical_score,
829        })
830        .await;
831
832        let after = std::fs::read_to_string(agent.path.join("src/main.rs")).unwrap();
833        assert!(
834            outcome.note.is_empty() || !outcome.note.contains("validation rejected"),
835            "{}",
836            outcome.note
837        );
838        assert_eq!(outcome.validated, 1, "{}", outcome.note);
839        assert_eq!(outcome.landed, 0);
840        assert_eq!(outcome.accepted, 0);
841        assert_eq!(before, after);
842    }
843
844    #[tokio::test]
845    async fn final_validation_failure_rolls_back_and_does_not_accept() {
846        let (_dir, agent) = write_temp_agent(true);
847        let before = std::fs::read_to_string(agent.path.join("src/main.rs")).unwrap();
848        let policy = HookPolicy::default();
849        let edit = ProposedEdit {
850            file: agent.path.join("src/main.rs"),
851            description: "improve answer".to_string(),
852            patch: improved_patch(),
853        };
854
855        let outcome = execute_candidate_edit(CandidateExecutionContext {
856            agent: &agent,
857            config: execution_config(&policy),
858            iteration: 0,
859            candidate_index: 0,
860            edit: &edit,
861            test_inputs: &[serde_json::json!({"query":"hi"})],
862            holdout_inputs: &[],
863            baseline_score: 0.40,
864            scorer: mechanical_score,
865        })
866        .await;
867
868        let after = std::fs::read_to_string(agent.path.join("src/main.rs")).unwrap();
869        assert!(
870            outcome.note.is_empty() || !outcome.note.contains("validation rejected"),
871            "{}",
872            outcome.note
873        );
874        assert_eq!(outcome.validated, 1, "{}", outcome.note);
875        assert_eq!(outcome.landed, 0);
876        assert_eq!(outcome.accepted, 0);
877        assert_eq!(before, after);
878    }
879
880    #[tokio::test]
881    async fn patch_scope_mismatch_is_rejected_before_validation() {
882        let (_dir, agent) = write_temp_agent(false);
883        let policy = HookPolicy::default();
884        let before = std::fs::read_to_string(agent.path.join("src/main.rs")).unwrap();
885        let edit = ProposedEdit {
886            file: agent.path.join("src/main.rs"),
887            description: "bad multi-file patch".to_string(),
888            patch: "diff --git a/src/lib.rs b/src/lib.rs\n--- a/src/lib.rs\n+++ b/src/lib.rs\n@@ -1,1 +1,1 @@\n-a\n+b\n".to_string(),
889        };
890
891        let outcome = execute_candidate_edit(CandidateExecutionContext {
892            agent: &agent,
893            config: execution_config(&policy),
894            iteration: 0,
895            candidate_index: 0,
896            edit: &edit,
897            test_inputs: &[serde_json::json!({"query":"hi"})],
898            holdout_inputs: &[],
899            baseline_score: 0.40,
900            scorer: mechanical_score,
901        })
902        .await;
903
904        assert_eq!(outcome.validated, 0);
905        assert_eq!(outcome.landed, 0);
906        assert_eq!(outcome.accepted, 0);
907        assert!(outcome.note.contains("edit scope rejected"));
908        assert_eq!(
909            std::fs::read_to_string(agent.path.join("src/main.rs")).unwrap(),
910            before
911        );
912    }
913
914    #[tokio::test]
915    async fn diff_git_scope_mismatch_is_rejected_before_validation() {
916        let (_dir, agent) = write_temp_agent(false);
917        let policy = HookPolicy::default();
918        let edit = ProposedEdit {
919            file: agent.path.join("src/main.rs"),
920            description: "bad diff header".to_string(),
921            patch: "diff --git a/src/main.rs b/src/lib.rs\n--- a/src/main.rs\n+++ b/src/lib.rs\n@@ -1,1 +1,1 @@\n-a\n+b\n".to_string(),
922        };
923
924        let outcome = execute_candidate_edit(CandidateExecutionContext {
925            agent: &agent,
926            config: execution_config(&policy),
927            iteration: 0,
928            candidate_index: 0,
929            edit: &edit,
930            test_inputs: &[serde_json::json!({"query":"hi"})],
931            holdout_inputs: &[],
932            baseline_score: 0.40,
933            scorer: mechanical_score,
934        })
935        .await;
936
937        assert_eq!(outcome.validated, 0);
938        assert_eq!(outcome.landed, 0);
939        assert_eq!(outcome.accepted, 0);
940        assert!(outcome.note.contains("edit scope rejected"));
941    }
942
943    #[tokio::test]
944    async fn rename_scope_mismatch_is_rejected_before_validation() {
945        let (_dir, agent) = write_temp_agent(false);
946        let policy = HookPolicy::default();
947        let edit = ProposedEdit {
948            file: agent.path.join("src/main.rs"),
949            description: "bad rename".to_string(),
950            patch: "diff --git a/src/main.rs b/src/lib.rs\nsimilarity index 100%\nrename from src/main.rs\nrename to src/lib.rs\n".to_string(),
951        };
952
953        let outcome = execute_candidate_edit(CandidateExecutionContext {
954            agent: &agent,
955            config: execution_config(&policy),
956            iteration: 0,
957            candidate_index: 0,
958            edit: &edit,
959            test_inputs: &[serde_json::json!({"query":"hi"})],
960            holdout_inputs: &[],
961            baseline_score: 0.40,
962            scorer: mechanical_score,
963        })
964        .await;
965
966        assert_eq!(outcome.validated, 0);
967        assert_eq!(outcome.landed, 0);
968        assert_eq!(outcome.accepted, 0);
969        assert!(outcome.note.contains("edit scope rejected"));
970    }
971
972    #[tokio::test]
973    async fn exhausted_candidate_timeout_stops_before_validation() {
974        let (_dir, agent) = write_temp_agent(false);
975        let policy = HookPolicy::default();
976        let edit = ProposedEdit {
977            file: agent.path.join("src/main.rs"),
978            description: "comment only".to_string(),
979            patch: comment_patch(),
980        };
981        let config = CandidateExecutionConfig {
982            hook_policy: &policy,
983            review_before_apply: false,
984            quiet: true,
985            candidate_timeout: Duration::from_secs(0),
986        };
987
988        let outcome = execute_candidate_edit(CandidateExecutionContext {
989            agent: &agent,
990            config,
991            iteration: 0,
992            candidate_index: 0,
993            edit: &edit,
994            test_inputs: &[serde_json::json!({"query":"hi"})],
995            holdout_inputs: &[],
996            baseline_score: 0.40,
997            scorer: mechanical_score,
998        })
999        .await;
1000
1001        assert!(outcome.timed_out);
1002        assert_eq!(outcome.validated, 0);
1003        assert_eq!(outcome.landed, 0);
1004        assert_eq!(outcome.accepted, 0);
1005        assert_eq!(
1006            outcome.rejection.as_ref().map(|rejection| &rejection.kind),
1007            Some(&SafetyRejectionKind::Timeout)
1008        );
1009    }
1010
1011    proptest! {
1012        #[test]
1013        fn normalized_diff_paths_remove_only_diff_side_prefixes(path in "[a-zA-Z0-9_./-]{1,64}") {
1014            let line = format!("diff --git a/{path} b/{path}");
1015            let paths = diff_paths_from_line(&line);
1016
1017            prop_assert_eq!(paths, vec![path.clone(), path]);
1018        }
1019
1020        #[test]
1021        fn pre_accept_policy_denies_all_non_positive_deltas(delta in -10.0f32..=0.0f32) {
1022            let decision = evaluate_builtin_hook(
1023                &HookPolicy::default(),
1024                &HookContext {
1025                    stage: HookStage::PreAccept,
1026                    agent_name: "agent".to_string(),
1027                    edit_description: None,
1028                    patch_bytes: 0,
1029                    command: None,
1030                    validation_passed: Some(true),
1031                    score_delta: Some(delta),
1032                },
1033            );
1034
1035            prop_assert!(decision.denied());
1036        }
1037    }
1038}