1use crate::hooks::{evaluate_builtin_hook, HookContext, HookDecision, HookPolicy, HookStage};
8use crate::registry::{AgentContract, RegisteredAgent};
9use crate::runner::AgentRunResult;
10use mdx_rust_analysis::editing::{ProposedEdit, ValidationCommandRecord};
11use schemars::JsonSchema;
12use serde::{Deserialize, Serialize};
13use std::path::Path;
14use std::time::{Duration, Instant};
15
/// Settings shared by every candidate edit that is run through the safety pipeline.
#[derive(Debug, Clone, Copy)]
pub struct CandidateExecutionConfig<'a> {
    /// Policy passed to `evaluate_builtin_hook` at each hook stage.
    pub hook_policy: &'a HookPolicy,
    /// When `true`, a validated, net-positive candidate is reported with
    /// `SafetyRejectionKind::ReviewOnly` instead of being applied to the agent source.
    pub review_before_apply: bool,
    /// When `true`, the progress lines normally printed to stdout are suppressed.
    pub quiet: bool,
    /// Overall time budget for one candidate; it bounds the whole run and is also
    /// split into the budgets handed to the validation steps.
    pub candidate_timeout: Duration,
}
23
// Machine-readable reason why a candidate edit did not land.
//
// Plain `//` comments are used on purpose: this type derives `JsonSchema`, and doc
// comments may be copied into the generated schema (presumably as descriptions —
// confirm against the schemars version in use before converting these to `///`).
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, thiserror::Error, PartialEq, Eq)]
pub enum SafetyRejectionKind {
    // The patch failed the single-file scope check.
    #[error("edit scope rejected")]
    EditScope,
    // A builtin hook (pre-edit, pre-command, post-validation or pre-accept) denied the candidate.
    #[error("hook denied candidate")]
    HookDenied,
    // Validation in the isolated workspace could not run or did not pass.
    #[error("validation failed")]
    ValidationFailed,
    // The patched score did not exceed the baseline score (delta <= 0).
    #[error("candidate was not net positive")]
    NetNegative,
    // `review_before_apply` was set, so the candidate was deliberately not applied.
    #[error("review mode prevented landing")]
    ReviewOnly,
    // The target file could not be snapshotted before landing.
    #[error("snapshot failed")]
    SnapshotFailed,
    // Applying the validated patch to the real agent source failed.
    #[error("landing failed")]
    LandingFailed,
    // Validation after landing failed; a rollback of the target file was attempted.
    #[error("final validation failed")]
    FinalValidationFailed,
    // The candidate exhausted its time budget.
    #[error("candidate timed out")]
    Timeout,
}
45
// Rejection reason attached to a `CandidateExecutionOutcome`.
// (Plain comments: the type derives `JsonSchema`; see `SafetyRejectionKind`.)
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct SafetyRejection {
    // Category of the rejection.
    pub kind: SafetyRejectionKind,
    // Human-readable detail for the rejection.
    pub message: String,
}
51
// Result of running one candidate edit through the safety pipeline.
// (Plain comments: the type derives `JsonSchema`, so doc comments may alter the schema.)
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct CandidateExecutionOutcome {
    // 1 once validation in the isolated workspace passed, otherwise 0.
    pub validated: u32,
    // 1 only when the edit was applied to the real source and kept, otherwise 0.
    pub landed: u32,
    // 1 only when the edit was accepted, otherwise 0.
    pub accepted: u32,
    // The accepted patch text; `None` unless the edit was accepted.
    pub accepted_diff: Option<String>,
    // Score of the patched code on the test inputs, once scoring was reached.
    pub patched_score: Option<f32>,
    // Score on the holdout inputs after landing; `None` when there are no holdout
    // inputs, the holdout evaluation failed, or the edit was not accepted.
    pub holdout_score: Option<f32>,
    // `patched_score` minus the baseline score, once scoring was reached.
    pub delta: Option<f32>,
    // Short human-readable summary, e.g. " (accepted +0.10)".
    pub note: String,
    // Hook decisions in the order they were evaluated.
    pub hook_decisions: Vec<HookDecision>,
    // Command records from validation in the isolated workspace.
    pub validation_commands: Vec<ValidationCommandRecord>,
    // Command records from validation after landing on the real source.
    pub final_validation_commands: Vec<ValidationCommandRecord>,
    // `Some(..)` only when a rollback of the landed file was attempted.
    pub rollback_succeeded: Option<bool>,
    // Error text from a failed rollback attempt.
    pub rollback_error: Option<String>,
    // Whether the candidate ran out of time.
    pub timed_out: bool,
    // Why the candidate did not land; defaults to `None` when absent from serialized input.
    #[serde(default)]
    pub rejection: Option<SafetyRejection>,
}
71
72impl CandidateExecutionOutcome {
73 fn empty(note: impl Into<String>, hook_decisions: Vec<HookDecision>) -> Self {
74 Self {
75 validated: 0,
76 landed: 0,
77 accepted: 0,
78 accepted_diff: None,
79 patched_score: None,
80 holdout_score: None,
81 delta: None,
82 note: note.into(),
83 hook_decisions,
84 validation_commands: Vec::new(),
85 final_validation_commands: Vec::new(),
86 rollback_succeeded: None,
87 rollback_error: None,
88 timed_out: false,
89 rejection: None,
90 }
91 }
92
93 fn rejected(
94 kind: SafetyRejectionKind,
95 message: impl Into<String>,
96 hook_decisions: Vec<HookDecision>,
97 ) -> Self {
98 let message = message.into();
99 Self {
100 rejection: Some(SafetyRejection {
101 kind,
102 message: message.clone(),
103 }),
104 ..Self::empty(message, hook_decisions)
105 }
106 }
107}
108
/// An edit that has passed the single-file patch scope check.
struct ScopedEdit<'a> {
    /// The proposed edit that was checked.
    edit: &'a ProposedEdit,
}
112
/// A scoped edit whose validation in an isolated workspace passed.
struct IsolatedValidatedEdit<'a> {
    /// The edit, already scope-checked.
    scoped: ScopedEdit<'a>,
    /// Records of the commands run during that validation.
    validation_commands: Vec<ValidationCommandRecord>,
}
117
/// A validated edit whose patched score is strictly above the baseline score.
struct NetPositiveEdit<'a> {
    /// The edit together with its isolated-validation records.
    validated: IsolatedValidatedEdit<'a>,
    /// Score measured with the patch applied.
    patched_score: f32,
    /// `patched_score` minus the baseline score (always > 0 here).
    delta: f32,
}
123
/// Everything needed to evaluate a single candidate edit.
pub struct CandidateExecutionContext<'a> {
    /// Agent whose source is being edited; `agent.path` is used as the agent root.
    pub agent: &'a RegisteredAgent,
    /// Hook policy, review/quiet switches and the time budget.
    pub config: CandidateExecutionConfig<'a>,
    /// Optimisation iteration; used (with `candidate_index`) to name isolated workspaces.
    pub iteration: u32,
    /// Index of this candidate within the iteration.
    pub candidate_index: usize,
    /// The proposed edit to evaluate.
    pub edit: &'a ProposedEdit,
    /// Inputs used to score the patched code in an isolated workspace.
    pub test_inputs: &'a [serde_json::Value],
    /// Inputs scored against the real agent directory after landing; may be empty.
    pub holdout_inputs: &'a [serde_json::Value],
    /// Score the patched code has to beat.
    pub baseline_score: f32,
    /// Maps one agent run to a score; scores are averaged over the inputs.
    pub scorer: fn(&AgentRunResult) -> f32,
}
135
136pub async fn execute_candidate_edit(
137 context: CandidateExecutionContext<'_>,
138) -> CandidateExecutionOutcome {
139 let timeout = context.config.candidate_timeout;
140 match tokio::time::timeout(timeout, execute_candidate_edit_inner(context)).await {
141 Ok(outcome) => outcome,
142 Err(_) => CandidateExecutionOutcome {
143 timed_out: true,
144 ..CandidateExecutionOutcome::rejected(
145 SafetyRejectionKind::Timeout,
146 format!(" (candidate timed out after {}s)", timeout.as_secs()),
147 Vec::new(),
148 )
149 },
150 }
151}
152
153async fn execute_candidate_edit_inner(
154 context: CandidateExecutionContext<'_>,
155) -> CandidateExecutionOutcome {
156 let agent = context.agent;
157 let edit = context.edit;
158 let mut hook_decisions = Vec::new();
159 let deadline_start = Instant::now();
160
161 if let Err(err) = ensure_single_file_patch_scope(&agent.path, edit) {
162 return CandidateExecutionOutcome::rejected(
163 SafetyRejectionKind::EditScope,
164 format!(" (edit scope rejected: {err})"),
165 hook_decisions,
166 );
167 }
168 let scoped_edit = ScopedEdit { edit };
169
170 if deadline_start.elapsed() >= context.config.candidate_timeout {
171 return timed_out_outcome(context.config.candidate_timeout, hook_decisions);
172 }
173
174 let pre_edit = evaluate_builtin_hook(
175 context.config.hook_policy,
176 &HookContext {
177 stage: HookStage::PreEdit,
178 agent_name: agent.name.clone(),
179 edit_description: Some(edit.description.clone()),
180 patch_bytes: edit.patch.len(),
181 command: None,
182 validation_passed: None,
183 score_delta: None,
184 },
185 );
186 let denied = pre_edit.denied();
187 hook_decisions.push(pre_edit);
188 if denied {
189 return CandidateExecutionOutcome::rejected(
190 SafetyRejectionKind::HookDenied,
191 " (pre-edit hook denied candidate)",
192 hook_decisions,
193 );
194 }
195
196 let pre_command = evaluate_builtin_hook(
197 context.config.hook_policy,
198 &HookContext {
199 stage: HookStage::PreCommand,
200 agent_name: agent.name.clone(),
201 edit_description: Some(edit.description.clone()),
202 patch_bytes: edit.patch.len(),
203 command: Some("cargo check && cargo clippy -- -D warnings".to_string()),
204 validation_passed: None,
205 score_delta: None,
206 },
207 );
208 let denied = pre_command.denied();
209 hook_decisions.push(pre_command);
210 if denied {
211 return CandidateExecutionOutcome::rejected(
212 SafetyRejectionKind::HookDenied,
213 " (pre-command hook denied validation)",
214 hook_decisions,
215 );
216 }
217
218 let wt_name = format!("opt-{}-{}", context.iteration, context.candidate_index);
219 let Some(validation_budget) =
220 remaining_budget(deadline_start, context.config.candidate_timeout)
221 else {
222 return timed_out_outcome(context.config.candidate_timeout, hook_decisions);
223 };
224 let validation_result = mdx_rust_analysis::editing::apply_and_validate_with_budget(
225 &agent.path,
226 edit,
227 &wt_name,
228 validation_budget,
229 );
230
231 let Ok(validation) = validation_result else {
232 if !context.config.quiet {
233 println!(" [Safe Apply] Validation in isolated workspace failed to run.");
234 }
235 return CandidateExecutionOutcome::rejected(
236 SafetyRejectionKind::ValidationFailed,
237 " (validation failed to run)",
238 hook_decisions,
239 );
240 };
241 if !validation.passed {
242 let validation_commands = validation.command_records;
243 let validation_timed_out = validation_commands.iter().any(|record| record.timed_out);
244 let decision = evaluate_builtin_hook(
245 context.config.hook_policy,
246 &HookContext {
247 stage: HookStage::PostValidation,
248 agent_name: agent.name.clone(),
249 edit_description: Some(edit.description.clone()),
250 patch_bytes: edit.patch.len(),
251 command: None,
252 validation_passed: Some(false),
253 score_delta: None,
254 },
255 );
256 hook_decisions.push(decision);
257 if !context.config.quiet {
258 println!(" [Safe Apply] Validation in isolated workspace failed.");
259 }
260 return CandidateExecutionOutcome {
261 validation_commands,
262 timed_out: validation_timed_out,
263 ..CandidateExecutionOutcome::rejected(
264 SafetyRejectionKind::ValidationFailed,
265 format!(
266 " (validation rejected candidate: {})",
267 validation
268 .cargo_check_output
269 .lines()
270 .last()
271 .unwrap_or("no output")
272 ),
273 hook_decisions,
274 )
275 };
276 }
277 let validation_commands = validation.command_records;
278 let validated_edit = IsolatedValidatedEdit {
279 scoped: scoped_edit,
280 validation_commands,
281 };
282 if deadline_start.elapsed() >= context.config.candidate_timeout {
283 let validation_commands = validated_edit.validation_commands;
284 return CandidateExecutionOutcome {
285 validated: 1,
286 validation_commands,
287 ..timed_out_outcome(context.config.candidate_timeout, hook_decisions)
288 };
289 }
290
291 let post_validation = evaluate_builtin_hook(
292 context.config.hook_policy,
293 &HookContext {
294 stage: HookStage::PostValidation,
295 agent_name: agent.name.clone(),
296 edit_description: Some(edit.description.clone()),
297 patch_bytes: edit.patch.len(),
298 command: None,
299 validation_passed: Some(true),
300 score_delta: None,
301 },
302 );
303 let denied = post_validation.denied();
304 hook_decisions.push(post_validation);
305 if denied {
306 let validation_commands = validated_edit.validation_commands;
307 return CandidateExecutionOutcome {
308 validated: 1,
309 validation_commands,
310 ..CandidateExecutionOutcome::rejected(
311 SafetyRejectionKind::HookDenied,
312 " (post-validation hook denied candidate)",
313 hook_decisions,
314 )
315 };
316 }
317
318 if !context.config.quiet {
319 println!(
320 " [Safe Apply] Edit validated in isolated workspace (cargo check + clippy OK)."
321 );
322 }
323
324 let patched_score = {
325 let score_name = format!("score-{}-{}", context.iteration, context.candidate_index);
326 match mdx_rust_analysis::editing::create_isolated_workspace(&agent.path, &score_name) {
327 Ok(isolated) => {
328 let score = if mdx_rust_analysis::editing::apply_edit(&agent.path, &isolated, edit)
329 .is_ok()
330 {
331 evaluate_workspace(&isolated, context.test_inputs, context.scorer)
332 .await
333 .unwrap_or(context.baseline_score)
334 } else {
335 context.baseline_score
336 };
337 mdx_rust_analysis::editing::cleanup_isolated_workspace(&agent.path, &isolated);
338 score
339 }
340 Err(_) => context.baseline_score,
341 }
342 };
343 if deadline_start.elapsed() >= context.config.candidate_timeout {
344 let validation_commands = validated_edit.validation_commands;
345 return CandidateExecutionOutcome {
346 validated: 1,
347 patched_score: Some(patched_score),
348 delta: Some(patched_score - context.baseline_score),
349 validation_commands,
350 ..timed_out_outcome(context.config.candidate_timeout, hook_decisions)
351 };
352 }
353
354 let delta = patched_score - context.baseline_score;
355 let pre_accept = evaluate_builtin_hook(
356 context.config.hook_policy,
357 &HookContext {
358 stage: HookStage::PreAccept,
359 agent_name: agent.name.clone(),
360 edit_description: Some(edit.description.clone()),
361 patch_bytes: edit.patch.len(),
362 command: None,
363 validation_passed: Some(true),
364 score_delta: Some(delta),
365 },
366 );
367 let denied = pre_accept.denied();
368 hook_decisions.push(pre_accept);
369 if denied {
370 let validation_commands = validated_edit.validation_commands;
371 return CandidateExecutionOutcome {
372 validated: 1,
373 patched_score: Some(patched_score),
374 delta: Some(delta),
375 validation_commands,
376 ..CandidateExecutionOutcome::rejected(
377 SafetyRejectionKind::HookDenied,
378 format!(" (pre-accept hook denied delta {delta:.2})"),
379 hook_decisions,
380 )
381 };
382 }
383
384 if delta <= 0.0 {
385 let validation_commands = validated_edit.validation_commands;
386 if !context.config.quiet {
387 println!(
388 " [Net-Negative] Patched score {:.2} vs baseline {:.2} (delta {:.2}) - change rejected.",
389 patched_score, context.baseline_score, delta
390 );
391 }
392 return CandidateExecutionOutcome {
393 validated: 1,
394 patched_score: Some(patched_score),
395 delta: Some(delta),
396 validation_commands,
397 ..CandidateExecutionOutcome::rejected(
398 SafetyRejectionKind::NetNegative,
399 format!(
400 " (net-negative {:.2}->{:.2})",
401 context.baseline_score, patched_score
402 ),
403 hook_decisions,
404 )
405 };
406 }
407 let net_positive_edit = NetPositiveEdit {
408 validated: validated_edit,
409 patched_score,
410 delta,
411 };
412
413 if context.config.review_before_apply {
414 let validation_commands = net_positive_edit.validated.validation_commands;
415 if !context.config.quiet {
416 println!(" [Review] Change validated in isolation but not applied (--review).");
417 }
418 return CandidateExecutionOutcome {
419 validated: 1,
420 patched_score: Some(patched_score),
421 delta: Some(delta),
422 validation_commands,
423 ..CandidateExecutionOutcome::rejected(
424 SafetyRejectionKind::ReviewOnly,
425 " (review mode: validated in isolation, not applied)",
426 hook_decisions,
427 )
428 };
429 }
430
431 let edit = net_positive_edit.validated.scoped.edit;
432 let validation_commands = net_positive_edit.validated.validation_commands;
433 let patched_score = net_positive_edit.patched_score;
434 let delta = net_positive_edit.delta;
435
436 let snapshot = match mdx_rust_analysis::editing::snapshot_file(&edit.file) {
437 Ok(snapshot) => snapshot,
438 Err(err) => {
439 return CandidateExecutionOutcome {
440 validated: 1,
441 patched_score: Some(patched_score),
442 delta: Some(delta),
443 validation_commands,
444 ..CandidateExecutionOutcome::rejected(
445 SafetyRejectionKind::SnapshotFailed,
446 format!(" (snapshot failed: {err})"),
447 hook_decisions,
448 )
449 };
450 }
451 };
452
453 if let Err(err) = mdx_rust_analysis::editing::apply_edit_to_agent(&agent.path, edit) {
454 if !context.config.quiet {
455 println!(
456 " [Land Failed] Could not apply validated patch to real source: {}",
457 err
458 );
459 }
460 return CandidateExecutionOutcome {
461 validated: 1,
462 patched_score: Some(patched_score),
463 delta: Some(delta),
464 validation_commands,
465 ..CandidateExecutionOutcome::rejected(
466 SafetyRejectionKind::LandingFailed,
467 " (landing failed)",
468 hook_decisions,
469 )
470 };
471 }
472
473 let final_budget = remaining_budget(deadline_start, context.config.candidate_timeout)
474 .unwrap_or_else(|| Duration::from_secs(0));
475 let final_report =
476 mdx_rust_analysis::editing::validate_build_detailed_with_budget(&agent.path, final_budget);
477 let final_ok = final_report.passed;
478 let final_validation_commands = final_report.command_records;
479 let final_validation_timed_out = final_validation_commands
480 .iter()
481 .any(|record| record.timed_out);
482 if deadline_start.elapsed() >= context.config.candidate_timeout || final_validation_timed_out {
483 let rollback_result = mdx_rust_analysis::editing::restore_file(&snapshot);
484 let rollback_error = rollback_result.as_ref().err().map(ToString::to_string);
485 let rollback_succeeded = rollback_result.is_ok();
486 return CandidateExecutionOutcome {
487 validated: 1,
488 landed: 0,
489 accepted: 0,
490 accepted_diff: None,
491 patched_score: Some(patched_score),
492 holdout_score: None,
493 delta: Some(delta),
494 note: format!(
495 " (candidate timed out after {}s and was rolled back)",
496 context.config.candidate_timeout.as_secs()
497 ),
498 hook_decisions,
499 validation_commands,
500 final_validation_commands,
501 rollback_succeeded: Some(rollback_succeeded),
502 rollback_error,
503 timed_out: true,
504 rejection: Some(SafetyRejection {
505 kind: SafetyRejectionKind::Timeout,
506 message: format!(
507 "candidate timed out after {}s and was rolled back",
508 context.config.candidate_timeout.as_secs()
509 ),
510 }),
511 };
512 }
513
514 if final_ok {
515 let holdout_score = if context.holdout_inputs.is_empty() {
516 None
517 } else {
518 evaluate_workspace(&agent.path, context.holdout_inputs, context.scorer)
519 .await
520 .ok()
521 };
522
523 if !context.config.quiet {
524 println!(
525 " [Accepted] Landed + final validation OK (score {:.2} -> {:.2}, delta {:.2}).",
526 context.baseline_score, patched_score, delta
527 );
528 }
529
530 CandidateExecutionOutcome {
531 validated: 1,
532 landed: 1,
533 accepted: 1,
534 accepted_diff: Some(edit.patch.clone()),
535 patched_score: Some(patched_score),
536 holdout_score,
537 delta: Some(delta),
538 note: format!(" (accepted +{delta:.2})"),
539 hook_decisions,
540 validation_commands,
541 final_validation_commands,
542 rollback_succeeded: None,
543 rollback_error: None,
544 timed_out: false,
545 rejection: None,
546 }
547 } else {
548 let rollback_result = mdx_rust_analysis::editing::restore_file(&snapshot);
549 let rollback_error = rollback_result.as_ref().err().map(ToString::to_string);
550 let rollback_succeeded = rollback_result.is_ok();
551 let _ = mdx_rust_analysis::editing::validate_build(&agent.path);
552 if !context.config.quiet {
553 println!(
554 " [Final Validation Failed] Change rolled back after re-validation failed."
555 );
556 }
557 CandidateExecutionOutcome {
558 validated: 1,
559 landed: 0,
560 accepted: 0,
561 accepted_diff: None,
562 patched_score: Some(patched_score),
563 holdout_score: None,
564 delta: Some(delta),
565 note: " (final validation failed and rolled back)".to_string(),
566 hook_decisions,
567 validation_commands,
568 final_validation_commands,
569 rollback_succeeded: Some(rollback_succeeded),
570 rollback_error,
571 timed_out: false,
572 rejection: Some(SafetyRejection {
573 kind: SafetyRejectionKind::FinalValidationFailed,
574 message: "final validation failed and rolled back".to_string(),
575 }),
576 }
577 }
578}
579
580fn timed_out_outcome(
581 timeout: Duration,
582 hook_decisions: Vec<HookDecision>,
583) -> CandidateExecutionOutcome {
584 CandidateExecutionOutcome {
585 timed_out: true,
586 ..CandidateExecutionOutcome::rejected(
587 SafetyRejectionKind::Timeout,
588 format!(" (candidate timed out after {}s)", timeout.as_secs()),
589 hook_decisions,
590 )
591 }
592}
593
/// Returns how much of `total` is left since `start`, or `None` once the budget
/// is fully used (an exactly-zero remainder counts as used up).
fn remaining_budget(start: Instant, total: Duration) -> Option<Duration> {
    let spent = start.elapsed();
    if spent < total {
        Some(total - spent)
    } else {
        None
    }
}
599
600fn ensure_single_file_patch_scope(agent_root: &Path, edit: &ProposedEdit) -> anyhow::Result<()> {
601 let expected = if edit.file.is_absolute() {
602 edit.file.strip_prefix(agent_root).map_err(|_| {
603 anyhow::anyhow!("edit file is outside agent root: {}", edit.file.display())
604 })?
605 } else {
606 edit.file.as_path()
607 };
608
609 for line in edit.patch.lines() {
610 for path in diff_paths_from_line(line) {
611 if path == "/dev/null" {
612 continue;
613 }
614
615 if Path::new(&path) != expected {
616 anyhow::bail!(
617 "patch touches {}, but ProposedEdit.file is {}",
618 path,
619 expected.display()
620 );
621 }
622 }
623 }
624
625 Ok(())
626}
627
/// Extracts the file paths named by one line of a unified/git diff.
///
/// Recognised lines:
/// - `--- <path>` / `+++ <path>`: one path, side prefix removed;
/// - `diff --git <a> <b>`: one path per whitespace-separated token, side prefix removed;
/// - `rename from|to <path>` / `copy from|to <path>`: one path, taken verbatim;
/// - `Binary files <a> and <b> differ`: two paths, side prefix removed.
///
/// Any other line yields an empty vector.
///
/// Matching is purely prefix-based, so a hunk body line that happens to start with
/// `--- ` or `+++ ` is also treated as a header; callers that reject unknown paths
/// therefore err on the side of rejection.
fn diff_paths_from_line(line: &str) -> Vec<String> {
    if let Some(path) = line
        .strip_prefix("+++ ")
        .or_else(|| line.strip_prefix("--- "))
    {
        return normalize_diff_path(path).into_iter().collect();
    }

    if let Some(rest) = line.strip_prefix("diff --git ") {
        return rest
            .split_whitespace()
            .filter_map(normalize_diff_path)
            .collect();
    }

    for prefix in ["rename from ", "rename to ", "copy from ", "copy to "] {
        if let Some(path) = line.strip_prefix(prefix) {
            // Extended header lines carry the bare path with no `a/` or `b/` side
            // prefix, so a leading `a/` or `b/` here is a real directory and must
            // be kept for the scope comparison to be correct.
            return vec![unquote_diff_path(path).to_string()];
        }
    }

    if let Some(rest) = line.strip_prefix("Binary files ") {
        if let Some((left, right_with_suffix)) = rest.split_once(" and ") {
            let right = right_with_suffix
                .strip_suffix(" differ")
                .unwrap_or(right_with_suffix);
            return [left, right]
                .into_iter()
                .filter_map(normalize_diff_path)
                .collect();
        }
    }

    Vec::new()
}

/// Trims surrounding whitespace and surrounding double quotes from a diff path.
fn unquote_diff_path(raw: &str) -> &str {
    raw.trim().trim_matches('"')
}

/// Cleans a path taken from a side-prefixed diff header.
///
/// Removes surrounding whitespace and double quotes, then strips a single leading
/// `a/` or `b/` side prefix if present. `/dev/null` is returned unchanged. The
/// result is always `Some`; the `Option` lets the function be used with `filter_map`.
fn normalize_diff_path(raw: &str) -> Option<String> {
    let path = unquote_diff_path(raw);
    if path == "/dev/null" {
        return Some(path.to_string());
    }

    let without_side = path
        .strip_prefix("a/")
        .or_else(|| path.strip_prefix("b/"))
        .unwrap_or(path);
    Some(without_side.to_string())
}
675
676async fn evaluate_workspace(
677 dir: &std::path::Path,
678 inputs: &[serde_json::Value],
679 scorer: fn(&AgentRunResult) -> f32,
680) -> anyhow::Result<f32> {
681 let temp_agent = RegisteredAgent {
682 name: "isolated-eval".to_string(),
683 path: dir.to_path_buf(),
684 contract: AgentContract::Process,
685 registered_at: "".to_string(),
686 };
687
688 let mut scores = vec![];
689 for input in inputs {
690 let res = crate::runner::run_agent(&temp_agent, input.clone()).await?;
691 scores.push(scorer(&res));
692 }
693 if scores.is_empty() {
694 return Ok(0.0);
695 }
696 Ok(scores.iter().sum::<f32>() / scores.len() as f32)
697}
698
// Safety-invariant tests for the candidate pipeline. Most of them create a throwaway
// cargo project on disk and run the real pipeline against it.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::optimizer::mechanical_score;
    use proptest::prelude::*;
    use tempfile::tempdir;

    // Source of the throwaway agent: reads one line from stdin and prints a fixed JSON
    // answer whose text ends in `answer_suffix`.
    // NOTE(review): the patches below contain context lines that must match this text
    // exactly, including leading whitespace — keep the two in sync.
    fn temp_agent_source(answer_suffix: &str) -> String {
        r#"use std::io::BufRead;

fn main() {
 let mut input = String::new();
 std::io::stdin().lock().read_line(&mut input).unwrap();
 println!("{{\"answer\":\"A stable useful answer __SUFFIX__\",\"confidence\":0.70,\"reasoning\":\"Think step by step.\"}}");
}
"#
        .replace("__SUFFIX__", answer_suffix)
    }

    // Writes a minimal cargo project into a temp dir and returns it with a matching
    // `RegisteredAgent`. The `TempDir` must be kept alive by the caller.
    //
    // With `with_final_failure_marker`, a `build.rs` is added that panics whenever
    // `.mdx-rust/fail-final` exists, and that marker file is created.
    // NOTE(review): presumably the marker is absent from isolated workspaces, so only
    // the final validation in the real directory fails — confirm.
    fn write_temp_agent(with_final_failure_marker: bool) -> (tempfile::TempDir, RegisteredAgent) {
        let dir = tempdir().unwrap();
        std::fs::create_dir_all(dir.path().join("src")).unwrap();
        std::fs::write(
            dir.path().join("Cargo.toml"),
            "[package]\nname=\"safety-agent\"\nversion=\"0.1.0\"\nedition=\"2021\"\n",
        )
        .unwrap();
        std::fs::write(dir.path().join("src/main.rs"), temp_agent_source("before")).unwrap();

        if with_final_failure_marker {
            std::fs::write(
                dir.path().join("build.rs"),
                r#"
fn main() {
 if std::path::Path::new(".mdx-rust/fail-final").exists() {
 panic!("intentional final validation failure");
 }
}
"#,
            )
            .unwrap();
            std::fs::create_dir_all(dir.path().join(".mdx-rust")).unwrap();
            std::fs::write(dir.path().join(".mdx-rust/fail-final"), "1").unwrap();
        }

        let agent = RegisteredAgent {
            name: "safety-agent".to_string(),
            path: dir.path().to_path_buf(),
            contract: AgentContract::Process,
            registered_at: "test".to_string(),
        };

        (dir, agent)
    }

    // Patch that only adds a comment line to `src/main.rs`.
    fn comment_patch() -> String {
        "diff --git a/src/main.rs b/src/main.rs\n--- a/src/main.rs\n+++ b/src/main.rs\n@@ -1,5 +1,6 @@\n use std::io::BufRead;\n+// mdx safety invariant test\n \n fn main() {\n let mut input = String::new();\n std::io::stdin().lock().read_line(&mut input).unwrap();\n"
            .to_string()
    }

    // Patch that replaces the printed answer in `src/main.rs` with a longer one.
    fn improved_patch() -> String {
        "diff --git a/src/main.rs b/src/main.rs\n--- a/src/main.rs\n+++ b/src/main.rs\n@@ -2,6 +2,6 @@ use std::io::BufRead;\n \n fn main() {\n let mut input = String::new();\n std::io::stdin().lock().read_line(&mut input).unwrap();\n- println!(\"{{\\\"answer\\\":\\\"A stable useful answer before\\\",\\\"confidence\\\":0.70,\\\"reasoning\\\":\\\"Think step by step.\\\"}}\");\n+ println!(\"{{\\\"answer\\\":\\\"A stable useful answer after with much more useful detail\\\",\\\"confidence\\\":0.70,\\\"reasoning\\\":\\\"Think step by step.\\\"}}\");\n }\n"
            .to_string()
    }

    // Default test configuration: apply for real, quiet output, 30 second budget.
    fn execution_config<'a>(policy: &'a HookPolicy) -> CandidateExecutionConfig<'a> {
        CandidateExecutionConfig {
            hook_policy: policy,
            review_before_apply: false,
            quiet: true,
            candidate_timeout: Duration::from_secs(30),
        }
    }

    // A policy allowing at most 1 patch byte must produce a denying hook decision
    // and stop the candidate before validation.
    #[tokio::test]
    async fn deny_hook_cannot_accept_or_validate() {
        let (_dir, agent) = write_temp_agent(false);
        let policy = HookPolicy {
            max_patch_bytes: 1,
            require_positive_delta: true,
        };
        let edit = ProposedEdit {
            file: agent.path.join("src/main.rs"),
            description: "too large".to_string(),
            patch: comment_patch(),
        };

        let outcome = execute_candidate_edit(CandidateExecutionContext {
            agent: &agent,
            config: execution_config(&policy),
            iteration: 0,
            candidate_index: 0,
            edit: &edit,
            test_inputs: &[serde_json::json!({"query":"hi"})],
            holdout_inputs: &[],
            baseline_score: 0.0,
            scorer: mechanical_score,
        })
        .await;

        assert_eq!(outcome.validated, 0);
        assert_eq!(outcome.landed, 0);
        assert_eq!(outcome.accepted, 0);
        assert!(outcome
            .hook_decisions
            .iter()
            .any(|decision| decision.denied()));
    }

    // A comment-only patch against a high baseline validates but must not land, and
    // the real source file must be left unchanged.
    #[tokio::test]
    async fn net_negative_candidate_is_rejected_before_landing() {
        let (_dir, agent) = write_temp_agent(false);
        let before = std::fs::read_to_string(agent.path.join("src/main.rs")).unwrap();
        let policy = HookPolicy::default();
        let edit = ProposedEdit {
            file: agent.path.join("src/main.rs"),
            description: "comment only".to_string(),
            patch: comment_patch(),
        };

        let outcome = execute_candidate_edit(CandidateExecutionContext {
            agent: &agent,
            config: execution_config(&policy),
            iteration: 0,
            candidate_index: 0,
            edit: &edit,
            test_inputs: &[serde_json::json!({"query":"hi"})],
            holdout_inputs: &[],
            baseline_score: 0.95,
            scorer: mechanical_score,
        })
        .await;

        let after = std::fs::read_to_string(agent.path.join("src/main.rs")).unwrap();
        assert!(
            outcome.note.is_empty() || !outcome.note.contains("validation rejected"),
            "{}",
            outcome.note
        );
        assert_eq!(outcome.validated, 1, "{}", outcome.note);
        assert_eq!(outcome.landed, 0);
        assert_eq!(outcome.accepted, 0);
        assert_eq!(before, after);
    }

    // With the failure marker in place the candidate validates but is neither landed
    // nor accepted, and the source file ends up identical to its original content.
    #[tokio::test]
    async fn final_validation_failure_rolls_back_and_does_not_accept() {
        let (_dir, agent) = write_temp_agent(true);
        let before = std::fs::read_to_string(agent.path.join("src/main.rs")).unwrap();
        let policy = HookPolicy::default();
        let edit = ProposedEdit {
            file: agent.path.join("src/main.rs"),
            description: "improve answer".to_string(),
            patch: improved_patch(),
        };

        let outcome = execute_candidate_edit(CandidateExecutionContext {
            agent: &agent,
            config: execution_config(&policy),
            iteration: 0,
            candidate_index: 0,
            edit: &edit,
            test_inputs: &[serde_json::json!({"query":"hi"})],
            holdout_inputs: &[],
            baseline_score: 0.40,
            scorer: mechanical_score,
        })
        .await;

        let after = std::fs::read_to_string(agent.path.join("src/main.rs")).unwrap();
        assert!(
            outcome.note.is_empty() || !outcome.note.contains("validation rejected"),
            "{}",
            outcome.note
        );
        assert_eq!(outcome.validated, 1, "{}", outcome.note);
        assert_eq!(outcome.landed, 0);
        assert_eq!(outcome.accepted, 0);
        assert_eq!(before, after);
    }

    // The edit targets `src/main.rs` but every header in the patch names `src/lib.rs`.
    #[tokio::test]
    async fn patch_scope_mismatch_is_rejected_before_validation() {
        let (_dir, agent) = write_temp_agent(false);
        let policy = HookPolicy::default();
        let before = std::fs::read_to_string(agent.path.join("src/main.rs")).unwrap();
        let edit = ProposedEdit {
            file: agent.path.join("src/main.rs"),
            description: "bad multi-file patch".to_string(),
            patch: "diff --git a/src/lib.rs b/src/lib.rs\n--- a/src/lib.rs\n+++ b/src/lib.rs\n@@ -1,1 +1,1 @@\n-a\n+b\n".to_string(),
        };

        let outcome = execute_candidate_edit(CandidateExecutionContext {
            agent: &agent,
            config: execution_config(&policy),
            iteration: 0,
            candidate_index: 0,
            edit: &edit,
            test_inputs: &[serde_json::json!({"query":"hi"})],
            holdout_inputs: &[],
            baseline_score: 0.40,
            scorer: mechanical_score,
        })
        .await;

        assert_eq!(outcome.validated, 0);
        assert_eq!(outcome.landed, 0);
        assert_eq!(outcome.accepted, 0);
        assert!(outcome.note.contains("edit scope rejected"));
        assert_eq!(
            std::fs::read_to_string(agent.path.join("src/main.rs")).unwrap(),
            before
        );
    }

    // Only the new-side paths (`b/src/lib.rs`) differ from the edit's target file.
    #[tokio::test]
    async fn diff_git_scope_mismatch_is_rejected_before_validation() {
        let (_dir, agent) = write_temp_agent(false);
        let policy = HookPolicy::default();
        let edit = ProposedEdit {
            file: agent.path.join("src/main.rs"),
            description: "bad diff header".to_string(),
            patch: "diff --git a/src/main.rs b/src/lib.rs\n--- a/src/main.rs\n+++ b/src/lib.rs\n@@ -1,1 +1,1 @@\n-a\n+b\n".to_string(),
        };

        let outcome = execute_candidate_edit(CandidateExecutionContext {
            agent: &agent,
            config: execution_config(&policy),
            iteration: 0,
            candidate_index: 0,
            edit: &edit,
            test_inputs: &[serde_json::json!({"query":"hi"})],
            holdout_inputs: &[],
            baseline_score: 0.40,
            scorer: mechanical_score,
        })
        .await;

        assert_eq!(outcome.validated, 0);
        assert_eq!(outcome.landed, 0);
        assert_eq!(outcome.accepted, 0);
        assert!(outcome.note.contains("edit scope rejected"));
    }

    // A pure rename patch (no hunks) that moves the target file to another path.
    #[tokio::test]
    async fn rename_scope_mismatch_is_rejected_before_validation() {
        let (_dir, agent) = write_temp_agent(false);
        let policy = HookPolicy::default();
        let edit = ProposedEdit {
            file: agent.path.join("src/main.rs"),
            description: "bad rename".to_string(),
            patch: "diff --git a/src/main.rs b/src/lib.rs\nsimilarity index 100%\nrename from src/main.rs\nrename to src/lib.rs\n".to_string(),
        };

        let outcome = execute_candidate_edit(CandidateExecutionContext {
            agent: &agent,
            config: execution_config(&policy),
            iteration: 0,
            candidate_index: 0,
            edit: &edit,
            test_inputs: &[serde_json::json!({"query":"hi"})],
            holdout_inputs: &[],
            baseline_score: 0.40,
            scorer: mechanical_score,
        })
        .await;

        assert_eq!(outcome.validated, 0);
        assert_eq!(outcome.landed, 0);
        assert_eq!(outcome.accepted, 0);
        assert!(outcome.note.contains("edit scope rejected"));
    }

    // A zero-second budget must yield a `Timeout` rejection before any validation.
    #[tokio::test]
    async fn exhausted_candidate_timeout_stops_before_validation() {
        let (_dir, agent) = write_temp_agent(false);
        let policy = HookPolicy::default();
        let edit = ProposedEdit {
            file: agent.path.join("src/main.rs"),
            description: "comment only".to_string(),
            patch: comment_patch(),
        };
        let config = CandidateExecutionConfig {
            hook_policy: &policy,
            review_before_apply: false,
            quiet: true,
            candidate_timeout: Duration::from_secs(0),
        };

        let outcome = execute_candidate_edit(CandidateExecutionContext {
            agent: &agent,
            config,
            iteration: 0,
            candidate_index: 0,
            edit: &edit,
            test_inputs: &[serde_json::json!({"query":"hi"})],
            holdout_inputs: &[],
            baseline_score: 0.40,
            scorer: mechanical_score,
        })
        .await;

        assert!(outcome.timed_out);
        assert_eq!(outcome.validated, 0);
        assert_eq!(outcome.landed, 0);
        assert_eq!(outcome.accepted, 0);
        assert_eq!(
            outcome.rejection.as_ref().map(|rejection| &rejection.kind),
            Some(&SafetyRejectionKind::Timeout)
        );
    }

    proptest! {
        // For a `diff --git a/<path> b/<path>` header, both extracted paths equal
        // `<path>`: only the single leading side prefix is removed.
        #[test]
        fn normalized_diff_paths_remove_only_diff_side_prefixes(path in "[a-zA-Z0-9_./-]{1,64}") {
            let line = format!("diff --git a/{path} b/{path}");
            let paths = diff_paths_from_line(&line);

            prop_assert_eq!(paths, vec![path.clone(), path]);
        }

        // The default policy denies every delta in [-10, 0] at the pre-accept stage.
        #[test]
        fn pre_accept_policy_denies_all_non_positive_deltas(delta in -10.0f32..=0.0f32) {
            let decision = evaluate_builtin_hook(
                &HookPolicy::default(),
                &HookContext {
                    stage: HookStage::PreAccept,
                    agent_name: "agent".to_string(),
                    edit_description: None,
                    patch_bytes: 0,
                    command: None,
                    validation_passed: Some(true),
                    score_delta: Some(delta),
                },
            );

            prop_assert!(decision.denied());
        }
    }
}