car-server-core 0.24.1

Transport-neutral library for the CAR daemon JSON-RPC dispatcher (used by car-server and tokhn-daemon)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
//! The native coding loop: plan → edit → verify → repair, on CAR inference.
//!
//! Shape mirrors `car-bench`'s `InferenceAgentRunner` (multi-turn tool-use
//! conversation) wrapped in `car-builder`'s repair-loop philosophy: each
//! iteration is a fresh conversation seeded with the intent plus the previous
//! iteration's failing check output, and the loop only exits green when
//! [`evaluate_contract`] — not the model — says so.

use std::sync::atomic::Ordering;

use async_trait::async_trait;
use car_engine::ToolExecutor;
use car_inference::tasks::generate::Message;
use car_inference::{GenerateParams, GenerateRequest, InferenceEngine, InferenceResult};
use serde_json::Value;

use super::contract::{evaluate_contract, CheckResult, OutcomeContract};
use super::session::{CancelFlag, CoderEventKind, EventSink};
use super::shell_tool::WorktreeExecutor;
use super::skill_memory::{FailureSignature, RepairMemory};

/// The model seam: one turn of generation. Implemented by
/// [`InferenceEngine`] for production; test harnesses script it (the same
/// injected-generation philosophy as `car-builder`).
///
/// The native loop reaches the model only through this trait, so any
/// implementor can stand in for real inference.
#[async_trait]
pub trait TurnGenerator: Send + Sync {
    /// Run a single generation turn for `req`.
    ///
    /// # Errors
    ///
    /// Returns the failure rendered as a `String`. The native loop reports it
    /// as an error event and tries again, giving up after three consecutive
    /// failures.
    async fn generate(&self, req: GenerateRequest) -> Result<InferenceResult, String>;
}

#[async_trait]
impl TurnGenerator for InferenceEngine {
    /// Production implementation: delegate to the engine's tracked generation
    /// call and flatten its error into a plain `String`.
    async fn generate(&self, req: GenerateRequest) -> Result<InferenceResult, String> {
        let tracked = self.generate_tracked(req).await;
        tracked.map_err(|err| err.to_string())
    }
}

/// The mid-session user-input seam: the loop hands a prompt to the host, which
/// surfaces it (emit `UserInputRequested`), blocks for the user's reply (while
/// respecting cancellation and a bound), and returns the text — or an `Err`
/// describing why no answer is coming (timeout, cancellation, no listener). An
/// `Err` is fed back to the model as a tool error so it can proceed without the
/// answer rather than the loop wedging.
///
/// Optional by design: when no asker is wired (most tests, the foreman/external
/// fallbacks), the `ask_user` tool is simply not offered to the model.
#[async_trait]
pub trait AskUser: Send + Sync {
    /// Put `prompt` to the user and wait for their reply.
    ///
    /// # Errors
    ///
    /// Returns a description of why no answer is coming. The native loop
    /// prefixes it with `ERROR: ` and hands it to the model as the tool result.
    async fn ask(&self, prompt: &str) -> Result<String, String>;
}

/// The name of the model-invokable mid-session question tool. Recognized by the
/// loop (not the `WorktreeExecutor`) so the channel plumbing stays with the
/// sink + cancel flag the loop already holds.
///
/// A tool call whose name equals this constant is dispatched to the
/// [`AskUser`] handler; every other tool name goes to the worktree executor.
pub const ASK_USER_TOOL: &str = "ask_user";

/// Build the JSON tool definition advertised to the model for
/// [`ASK_USER_TOOL`]. The native loop appends it to the model-visible tool
/// list only when an [`AskUser`] handler is wired.
fn ask_user_tool_def() -> Value {
    // Guidance for the model: when asking is appropriate and what a timeout
    // looks like from its side.
    let description = "Ask the human user a question and wait for their reply. \
                       Use ONLY when you genuinely cannot proceed without a \
                       decision or missing fact the user alone can supply (an \
                       ambiguous requirement, a destructive choice, a missing \
                       credential). Do not use it for things you can determine \
                       by reading the repo or running commands. The call blocks \
                       until the user answers or a timeout elapses; on timeout \
                       you receive an error and should proceed with your best \
                       judgment.";

    // Schema for the single required string argument.
    let parameters = serde_json::json!({
        "type": "object",
        "properties": {
            "prompt": {
                "type": "string",
                "description": "The question to show the user, phrased so a short reply answers it."
            }
        },
        "required": ["prompt"]
    });

    serde_json::json!({
        "name": ASK_USER_TOOL,
        "description": description,
        "parameters": parameters
    })
}

/// Knobs controlling how long and how hard the native loop tries.
#[derive(Debug, Clone)]
pub struct NativeLoopConfig {
    /// Model id to pin every turn to. `None` leaves the choice to adaptive
    /// routing (requests are tagged `TaskHint::Code`).
    pub model: Option<String>,
    /// Upper bound on contract-evaluation rounds before the loop gives up.
    pub max_iterations: u32,
    /// Upper bound on model turns inside one iteration; once reached, the
    /// contract is evaluated regardless.
    pub max_turns_per_iteration: u32,
    /// Token budget handed to the model for each turn.
    pub max_tokens_per_turn: usize,
}

impl Default for NativeLoopConfig {
    /// Adaptive model routing, 8 iterations, 24 turns per iteration and
    /// 4096 tokens per turn.
    fn default() -> Self {
        NativeLoopConfig {
            max_tokens_per_turn: 4096,
            max_turns_per_iteration: 24,
            max_iterations: 8,
            model: None,
        }
    }
}

/// How a loop run ended.
///
/// Returned by [`run_native_loop`] on every exit path: green contract,
/// cancellation, repeated inference failure, or iteration exhaustion.
#[derive(Debug, Clone)]
pub struct LoopOutcome {
    /// Every contract check passed.
    pub passed: bool,
    /// Iterations actually executed. When cancellation is noticed before an
    /// iteration begins, that iteration is not counted.
    pub iterations: u32,
    /// Check results from the final evaluation. Empty if the run ended before
    /// any evaluation took place.
    pub last_results: Vec<CheckResult>,
    /// Terminal error (cancellation, repeated inference failure). `None` for
    /// a clean pass OR a clean exhaustion of iterations.
    pub error: Option<String>,
}

/// Truncate `s` to at most `max` bytes without splitting a UTF-8 character.
///
/// Returns `s` unchanged (as an owned `String`) when it already fits.
/// Otherwise the cut point moves backwards from `max` to the nearest character
/// boundary, so the result can be a few bytes shorter than `max`. No
/// truncation marker is appended.
fn preview(s: &str, max: usize) -> String {
    if s.len() <= max {
        return s.to_owned();
    }
    // Here `max < s.len()`, so `max` is a valid index to probe. Index 0 is
    // always a char boundary, so the walk stops before `end` can underflow.
    let mut end = max;
    while !s.is_char_boundary(end) {
        end -= 1;
    }
    s[..end].to_owned()
}

/// Compose the system message for every iteration: the agent's role and
/// ground rules, followed by the rendered outcome contract.
fn system_prompt(contract: &OutcomeContract) -> String {
    // Static part of the prompt; the rendered contract is appended after it.
    const PREAMBLE: &str =
        "You are CAR Coder, an autonomous coding agent working in an isolated git worktree \
         of the user's repository. The worktree root is your working directory; all relative \
         paths resolve against it.\n\n\
         Rules:\n\
         - Inspect before you edit: read the relevant files first.\n\
         - Use the shell tool for builds and tests. Policy denies git push, sudo, and \
           destructive operations outside the worktree — do not attempt them.\n\
         - Do not git commit; the runtime handles version control.\n\
         - When you believe the work is complete, reply with a brief plain-text summary and \
           STOP calling tools. The runtime then verifies the outcome contract itself; if \
           checks fail you will be re-invoked with their output.\n\n\
         OUTCOME CONTRACT (the runtime runs these to decide done):\n";

    format!("{}{}", PREAMBLE, contract.render())
}

/// Render the repair prompt for the next iteration: a fixed instruction
/// header followed by one section per failing check (name, exit code and the
/// tail of its output). Passing checks are left out.
fn failure_feedback(results: &[CheckResult]) -> String {
    let header =
        "The outcome contract was evaluated and some checks FAILED. Fix the code so they pass.\n\n";
    let failed_sections: String = results
        .iter()
        .filter(|check| !check.passed)
        .map(|check| {
            format!(
                "FAILED {} (exit {:?}):\n{}\n\n",
                check.name, check.exit_code, check.output_tail
            )
        })
        .collect();
    format!("{header}{failed_sections}")
}

/// Pick the failure that a repair round is attributed to: the first check in
/// `results` that did not pass, turned into a [`FailureSignature`]. Returns
/// `None` when nothing failed. Using a single signature per round keeps the
/// learning attributable.
fn primary_failure(results: &[CheckResult]) -> Option<FailureSignature> {
    let first_failed = results.iter().find(|check| !check.passed)?;
    Some(FailureSignature::from_check(first_failed))
}

/// Tack a recalled repair hint onto the end of the repair prompt.
///
/// The hint is introduced by a short banner that marks it as a heuristic
/// carried over from an earlier session, so the model treats it as a lead to
/// verify rather than as an instruction. A trailing newline closes the block.
fn append_recall_hint(prompt: &mut String, hint: &str) {
    const BANNER: &str =
        "\nHINT — a prior session resolved this same failure signature with this approach; \
         use it as a lead, verify it still applies:\n";
    prompt.extend([BANNER, hint, "\n"]);
}

/// Produce the approach summary stored for a failure signature once a repair
/// turned the contract green.
///
/// The model's own closing text is preferred (trimmed and capped at 1024
/// bytes). If the model said nothing, a generic sentence naming the error
/// class and check is used so the skill still records that something fixed
/// this signature.
fn winning_approach(sig: &FailureSignature, plan_text: &str) -> String {
    let summary = plan_text.trim();
    if !summary.is_empty() {
        return preview(summary, 1024);
    }
    format!(
        "Re-attempted the edit; the '{}' failure of check '{}' cleared after repair.",
        sig.error_class, sig.check
    )
}

/// Drive the native loop to completion, cancellation, or exhaustion.
///
/// Each outer iteration starts a fresh conversation (system prompt + task,
/// plus the previous round's failing check output and any recalled repair
/// hint), lets the model call tools for up to `cfg.max_turns_per_iteration`
/// turns, then runs [`evaluate_contract`]. Only a fully passing evaluation
/// ends the run green.
///
/// When `ask` is `Some`, the model is additionally offered the [`ASK_USER_TOOL`]
/// to request mid-session input; the loop routes that one tool to the handler
/// (not the worktree executor) so a question blocks on the user-input gate while
/// honoring cancellation and the gate's timeout.
///
/// # Returns
///
/// A [`LoopOutcome`] with:
/// - `passed: true` once every check in an evaluation passes;
/// - `error: Some("cancelled")` when `cancel` is observed set at the top of an
///   iteration or of a turn;
/// - `error: Some("inference failed repeatedly: …")` after three consecutive
///   generation errors;
/// - `passed: false, error: None` when all `cfg.max_iterations` rounds end red.
#[allow(clippy::too_many_arguments)]
pub async fn run_native_loop(
    inference: &dyn TurnGenerator,
    executor: &WorktreeExecutor,
    intent: &str,
    contract: &OutcomeContract,
    sink: &EventSink,
    cancel: &CancelFlag,
    cfg: &NativeLoopConfig,
    memory: &RepairMemory,
    ask: Option<&dyn AskUser>,
) -> LoopOutcome {
    // Model-visible tools: the worktree tools, plus `ask_user` only when a
    // handler exists to answer it.
    let mut tools = WorktreeExecutor::tool_defs();
    if ask.is_some() {
        tools.push(ask_user_tool_def());
    }
    let system = system_prompt(contract);
    // Failing-check text from the previous iteration, if any.
    let mut feedback: Option<String> = None;
    let mut last_results: Vec<CheckResult> = Vec::new();
    // Reset on every successful generation; never reset between iterations.
    let mut consecutive_inference_failures = 0u32;
    // The signature of the failure carried into THIS iteration's repair, if
    // any. Drives skill recall (inject a prior fix) and outcome attribution
    // (credit/penalize that signature's skill once we know if the repair held).
    let mut prior_sig: Option<FailureSignature> = None;

    for iteration in 1..=cfg.max_iterations {
        // Cancelled before this iteration did any work: report the number of
        // iterations that actually ran.
        if cancel.load(Ordering::SeqCst) {
            return LoopOutcome {
                passed: false,
                iterations: iteration - 1,
                last_results,
                error: Some("cancelled".into()),
            };
        }
        sink.emit(CoderEventKind::IterationStarted {
            n: iteration,
            max: cfg.max_iterations,
        });

        // Fresh conversation per iteration: keeps context bounded for small
        // models and makes each repair attempt self-contained. The failing
        // check output is the only state carried forward.
        let mut user = format!("Task:\n{intent}\n");
        if let Some(fb) = &feedback {
            user.push_str("\n");
            user.push_str(fb);
        }
        // Durable recall: if this iteration is repairing a known failure
        // signature and a prior session learned a fix for it, inject that fix
        // as a lead. No-op when learning is disabled or nothing matches.
        if let Some(sig) = &prior_sig {
            if let Some(hint) = memory.recall(sig).await {
                append_recall_hint(&mut user, &hint);
            }
        }
        let mut messages = vec![
            Message::System {
                content: system.clone(),
            },
            Message::User { content: user },
        ];

        // The model's closing summary for this iteration — captured to distill
        // a durable repair skill if this iteration turns the contract green.
        let mut closing_plan = String::new();
        // Inner tool-use conversation.
        let mut turn = 0;
        while turn < cfg.max_turns_per_iteration {
            turn += 1;
            // Cancelled mid-iteration: the current iteration counts as run.
            if cancel.load(Ordering::SeqCst) {
                return LoopOutcome {
                    passed: false,
                    iterations: iteration,
                    last_results,
                    error: Some("cancelled".into()),
                };
            }

            let req = GenerateRequest {
                prompt: intent.to_string(), // ignored when messages are set
                model: cfg.model.clone(),
                params: GenerateParams {
                    temperature: 0.0,
                    max_tokens: cfg.max_tokens_per_turn,
                    ..Default::default()
                },
                tools: Some(tools.clone()),
                messages: Some(messages.clone()),
                intent: Some(car_inference::IntentHint {
                    task: Some(car_inference::TaskHint::Code),
                    ..Default::default()
                }),
                ..Default::default()
            };

            let result = match inference.generate(req).await {
                Ok(r) => {
                    consecutive_inference_failures = 0;
                    r
                }
                Err(e) => {
                    consecutive_inference_failures += 1;
                    sink.emit(CoderEventKind::Error {
                        message: format!("inference failed (turn {turn}): {e}"),
                    });
                    // Three failures in a row end the whole run.
                    if consecutive_inference_failures >= 3 {
                        return LoopOutcome {
                            passed: false,
                            iterations: iteration,
                            last_results,
                            error: Some(format!("inference failed repeatedly: {e}")),
                        };
                    }
                    // Retry with the same conversation. `turn` has already
                    // advanced, so the failed attempt still spends turn budget.
                    continue;
                }
            };

            if result.tool_calls.is_empty() {
                // Model says done — break to contract evaluation.
                if !result.text.trim().is_empty() {
                    closing_plan = result.text.clone();
                    sink.emit(CoderEventKind::PlanText {
                        text: result.text.clone(),
                    });
                }
                break;
            }

            // Assign ids so ToolResult replies correlate (local models may
            // omit them), then execute sequentially in emitted order.
            let mut calls = result.tool_calls.clone();
            for (i, call) in calls.iter_mut().enumerate() {
                if call.id.is_none() {
                    call.id = Some(format!("call_{iteration}_{turn}_{i}"));
                }
            }
            // Record the assistant turn (with the id-completed calls) before
            // appending the tool results that answer it.
            messages.push(Message::Assistant {
                content: result.text.clone(),
                tool_calls: calls.clone(),
            });

            for call in &calls {
                let params = Value::Object(call.arguments.clone().into_iter().collect());
                sink.emit(CoderEventKind::ToolCall {
                    tool: call.name.clone(),
                    params_preview: preview(&params.to_string(), 400),
                });
                let (ok, content) = if call.name == ASK_USER_TOOL {
                    // Route to the user-input handler, not the worktree executor.
                    // The handler emits UserInputRequested, blocks on the gate
                    // (honoring cancel + timeout), and returns the user's text or
                    // an error the model can recover from.
                    match ask {
                        Some(asker) => {
                            // A missing or non-string `prompt` argument is
                            // passed to the handler as an empty prompt.
                            let prompt = params
                                .get("prompt")
                                .and_then(Value::as_str)
                                .unwrap_or("")
                                .to_string();
                            match asker.ask(&prompt).await {
                                Ok(answer) => (true, answer),
                                Err(e) => (false, format!("ERROR: {e}")),
                            }
                        }
                        // The model called `ask_user` although it was never
                        // offered: answer with a tool error instead of failing.
                        None => (
                            false,
                            "ERROR: ask_user is not available in this session".to_string(),
                        ),
                    }
                } else {
                    match executor.execute(&call.name, &params).await {
                        Ok(v) => (true, v.to_string()),
                        Err(e) => (false, format!("ERROR: {e}")),
                    }
                };
                // Events carry a short preview (400 bytes); the model sees up
                // to 16 KiB of the tool output.
                sink.emit(CoderEventKind::ToolResult {
                    tool: call.name.clone(),
                    ok,
                    preview: preview(&content, 400),
                });
                messages.push(Message::ToolResult {
                    tool_use_id: call.id.clone().expect("assigned above"),
                    content: preview(&content, 16 * 1024),
                });
            }
        }

        // Verify. The contract — not the model's self-report — decides.
        last_results = evaluate_contract(contract, executor, sink).await;
        if last_results.iter().all(|r| r.passed) {
            // Durable learning: if THIS green came after a prior failure, the
            // repair held — credit (or ingest) the skill for that signature so
            // the next occurrence can recall the winning approach.
            if let Some(sig) = &prior_sig {
                memory
                    .record_success(sig, &winning_approach(sig, &closing_plan))
                    .await;
            }
            return LoopOutcome {
                passed: true,
                iterations: iteration,
                last_results,
                error: None,
            };
        }
        // Still red. Record the failure against its signature (penalizing any
        // recalled approach that didn't hold) and carry it into the next
        // repair round for recall + attribution.
        feedback = Some(failure_feedback(&last_results));
        if let Some(sig) = primary_failure(&last_results) {
            memory.record_failure(&sig).await;
            prior_sig = Some(sig);
        } else {
            prior_sig = None;
        }
    }

    // Every iteration ended red: a clean exhaustion, not an error.
    LoopOutcome {
        passed: false,
        iterations: cfg.max_iterations,
        last_results,
        error: None,
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::coder::contract::ContractCheck;
    use std::sync::atomic::AtomicUsize;
    use std::sync::Arc;

    /// Scripted generator: pops pre-canned turns in order.
    struct Script {
        /// The canned model responses, in the order they are handed out.
        turns: Vec<InferenceResult>,
        /// Index of the next turn to hand out; atomic because `generate`
        /// only receives `&self`.
        cursor: AtomicUsize,
    }

    /// Build one scripted [`InferenceResult`] from reply text and a JSON array
    /// of tool calls, filling the remaining fields with fixed placeholder
    /// values.
    fn turn(text: &str, tool_calls: serde_json::Value) -> InferenceResult {
        let payload = serde_json::json!({
            "text": text,
            "tool_calls": tool_calls,
            "trace_id": "t",
            "model_used": "scripted",
            "latency_ms": 0,
        });
        serde_json::from_value(payload).expect("scripted InferenceResult shape")
    }

    #[async_trait]
    impl TurnGenerator for Script {
        /// Hand out the next scripted turn, ignoring the request. Once the
        /// script runs dry every call fails with "script exhausted".
        async fn generate(&self, _req: GenerateRequest) -> Result<InferenceResult, String> {
            let index = self.cursor.fetch_add(1, Ordering::SeqCst);
            match self.turns.get(index) {
                Some(scripted) => Ok(scripted.clone()),
                None => Err("script exhausted".to_string()),
            }
        }
    }

    /// Happy path: the scripted model writes a file, declares done, and the
    /// contract check passes on the first iteration. Also pins the order of
    /// events the loop emits along the way.
    #[tokio::test]
    async fn scripted_loop_edits_verifies_and_passes() {
        let dir = tempfile::tempdir().unwrap();
        let executor = WorktreeExecutor::new(dir.path());
        let (sink, collected) = EventSink::collecting("coder-native");
        let cancel: CancelFlag = Arc::new(std::sync::atomic::AtomicBool::new(false));

        // Turn 1: write the file. Turn 2: declare done (no tool calls).
        let script = Script {
            turns: vec![
                turn(
                    "creating the file",
                    serde_json::json!([{
                        "id": "c1",
                        "name": "write_file",
                        "arguments": {"path": "hello.txt", "content": "hello coder"}
                    }]),
                ),
                turn("done — file created", serde_json::json!([])),
            ],
            cursor: AtomicUsize::new(0),
        };
        // A single check that greps the written file for the expected content.
        let contract = OutcomeContract {
            description: "hello.txt exists with content".into(),
            checks: vec![ContractCheck {
                name: "exists".into(),
                command: "grep -q 'hello coder' hello.txt".into(),
                expect_exit_zero: true,
                output_contains: None,
                timeout_secs: 10,
            }],
        };

        // No repair memory and no ask-user handler for this run.
        let outcome = run_native_loop(
            &script,
            &executor,
            "create hello.txt containing 'hello coder'",
            &contract,
            &sink,
            &cancel,
            &NativeLoopConfig::default(),
            &RepairMemory::disabled(),
            None,
        )
        .await;

        assert!(outcome.passed, "outcome: {outcome:?}");
        assert_eq!(outcome.iterations, 1);
        assert!(dir.path().join("hello.txt").exists());

        // Event stream shape: iteration → tool call/result → plan → check.
        let events = collected.lock().unwrap();
        let types: Vec<&str> = events
            .iter()
            .map(|e| match &e.kind {
                CoderEventKind::IterationStarted { .. } => "iteration",
                CoderEventKind::ToolCall { .. } => "tool_call",
                CoderEventKind::ToolResult { .. } => "tool_result",
                CoderEventKind::PlanText { .. } => "plan",
                CoderEventKind::CheckStarted { .. } => "check_started",
                CoderEventKind::CheckCompleted { .. } => "check_completed",
                _ => "other",
            })
            .collect();
        assert_eq!(
            types,
            vec!["iteration", "tool_call", "tool_result", "plan", "check_started", "check_completed"]
        );
    }

    /// Repair path: the first iteration leaves the contract red, the second
    /// one fixes it, and the loop reports a pass after exactly two iterations.
    #[tokio::test]
    async fn scripted_loop_repairs_after_red_checks() {
        let dir = tempfile::tempdir().unwrap();
        let executor = WorktreeExecutor::new(dir.path());
        let sink = EventSink::test_sink();
        let cancel: CancelFlag = Arc::new(std::sync::atomic::AtomicBool::new(false));

        // Iter 1: writes the WRONG content, says done → check fails.
        // Iter 2: fixes it → check passes.
        let script = Script {
            turns: vec![
                turn(
                    "",
                    serde_json::json!([{
                        "id": "c1", "name": "write_file",
                        "arguments": {"path": "x.txt", "content": "wrong"}
                    }]),
                ),
                turn("done", serde_json::json!([])),
                turn(
                    "",
                    serde_json::json!([{
                        "id": "c2", "name": "write_file",
                        "arguments": {"path": "x.txt", "content": "right"}
                    }]),
                ),
                turn("fixed", serde_json::json!([])),
            ],
            cursor: AtomicUsize::new(0),
        };
        // The check only passes once x.txt contains "right".
        let contract = OutcomeContract {
            description: "x.txt says right".into(),
            checks: vec![ContractCheck {
                name: "content".into(),
                command: "grep -q right x.txt".into(),
                expect_exit_zero: true,
                output_contains: None,
                timeout_secs: 10,
            }],
        };

        let outcome = run_native_loop(
            &script, &executor, "write right into x.txt", &contract, &sink, &cancel,
            &NativeLoopConfig::default(), &RepairMemory::disabled(), None,
        )
        .await;
        assert!(outcome.passed);
        assert_eq!(outcome.iterations, 2, "one repair round expected");
    }

    /// End-to-end learning: a repair (red → green) records a durable skill
    /// keyed on the failure signature, and a *fresh* session that hits the same
    /// signature recalls that approach into its repair prompt.
    ///
    /// Both sessions share one `RepairMemory`; each gets its own temporary
    /// worktree and its own scripted generator.
    #[tokio::test]
    async fn repair_round_learns_and_recalls_across_sessions() {
        use crate::coder::skill_memory::FailureSignature;
        use car_memgine::MemgineEngine;
        use tokio::sync::Mutex as AsyncMutex;

        // A shared memgine survives both sessions (the "gets better" store).
        let memory = RepairMemory::new(Some(Arc::new(AsyncMutex::new(MemgineEngine::new(None)))));

        // A contract whose check fails LOUDLY with a recognizable error class
        // so the signature is stable across sessions.
        let contract = OutcomeContract {
            description: "x.txt says right".into(),
            checks: vec![ContractCheck {
                name: "content".into(),
                command: "grep -q right x.txt || { echo 'assertion failed'; exit 1; }".into(),
                expect_exit_zero: true,
                output_contains: None,
                timeout_secs: 10,
            }],
        };
        // The signature the failing check above is expected to produce; used
        // below to query the memory directly.
        let sig = FailureSignature {
            check: "content".into(),
            error_class: "test_failure".into(),
        };

        // --- Session 1: red then green. The green-after-red ingests the skill.
        let dir1 = tempfile::tempdir().unwrap();
        let exec1 = WorktreeExecutor::new(dir1.path());
        let sink = EventSink::test_sink();
        let cancel: CancelFlag = Arc::new(std::sync::atomic::AtomicBool::new(false));
        let script1 = Script {
            turns: vec![
                turn(
                    "",
                    serde_json::json!([{
                        "id": "c1", "name": "write_file",
                        "arguments": {"path": "x.txt", "content": "wrong"}
                    }]),
                ),
                turn("nothing useful yet", serde_json::json!([])),
                turn(
                    "",
                    serde_json::json!([{
                        "id": "c2", "name": "write_file",
                        "arguments": {"path": "x.txt", "content": "right"}
                    }]),
                ),
                turn("wrote 'right' into x.txt to satisfy the grep", serde_json::json!([])),
            ],
            cursor: AtomicUsize::new(0),
        };
        let outcome1 = run_native_loop(
            &script1, &exec1, "write right into x.txt", &contract, &sink, &cancel,
            &NativeLoopConfig::default(), &memory, None,
        )
        .await;
        assert!(outcome1.passed);
        // The winning approach is now durably recallable for this signature.
        let recalled = memory.recall(&sig).await.expect("session 1 should have learned");
        assert!(recalled.contains("right"), "approach captured: {recalled}");

        // --- Session 2: the SAME signature recurs. The loop must inject the
        // recalled hint into the repair prompt on the second iteration.
        let dir2 = tempfile::tempdir().unwrap();
        let exec2 = WorktreeExecutor::new(dir2.path());
        let (sink2, collected) = EventSink::collecting("coder-learn");
        let seen_hint = Arc::new(std::sync::atomic::AtomicBool::new(false));

        // A generator that asserts on the prompt it receives: once a recall
        // hint shows up in the user message, it writes the fix.
        struct HintWatcher {
            /// Set to `true` once any user message containing "HINT" is seen.
            seen: Arc<std::sync::atomic::AtomicBool>,
            /// Number of `generate` calls so far; selects the scripted reply.
            cursor: AtomicUsize,
        }
        #[async_trait]
        impl TurnGenerator for HintWatcher {
            async fn generate(&self, req: GenerateRequest) -> Result<InferenceResult, String> {
                let i = self.cursor.fetch_add(1, Ordering::SeqCst);
                // Look for the recall banner in any user message of the request.
                let saw_hint = req
                    .messages
                    .as_ref()
                    .map(|ms| {
                        ms.iter().any(|m| {
                            matches!(m, Message::User { content } if content.contains("HINT"))
                        })
                    })
                    .unwrap_or(false);
                if saw_hint {
                    self.seen.store(true, Ordering::SeqCst);
                }
                Ok(match i {
                    // Iter 1: do nothing → contract red → signature recorded.
                    0 => turn("did nothing", serde_json::json!([])),
                    // Iter 2 (hint present): write the fix.
                    1 => turn(
                        "",
                        serde_json::json!([{
                            "id": "c1", "name": "write_file",
                            "arguments": {"path": "x.txt", "content": "right"}
                        }]),
                    ),
                    // Any later call: declare done with no tool calls.
                    _ => turn("applied the recalled fix", serde_json::json!([])),
                })
            }
        }

        let script2 = HintWatcher {
            seen: seen_hint.clone(),
            cursor: AtomicUsize::new(0),
        };
        let outcome2 = run_native_loop(
            &script2, &exec2, "write right into x.txt", &contract, &sink2, &cancel,
            &NativeLoopConfig::default(), &memory, None,
        )
        .await;
        assert!(outcome2.passed, "session 2 should pass: {outcome2:?}");
        assert!(
            seen_hint.load(Ordering::SeqCst),
            "the recalled hint must have been injected into the repair prompt"
        );
        // The collected events are not inspected in this test.
        drop(collected);
    }

    /// End-to-end check of the `ask_user` path when an [`AskUser`] handler is
    /// wired in: the model's question reaches the handler (not the worktree
    /// executor), the handler's reply is appended as a tool result the model
    /// can read, and the model acts on it by writing it to a file that the
    /// contract then inspects. The call must also show up in the event stream
    /// as a `ToolCall`.
    #[tokio::test]
    async fn ask_user_tool_routes_to_handler_and_answer_reaches_model() {
        use std::sync::Mutex as StdMutex;

        // The fixed reply the stand-in handler gives; also the exact file
        // content expected at the end.
        const CANNED_ANSWER: &str = "use port 8080";

        // Stand-in for the gate + coder.respond round-trip: remembers the
        // prompt it was asked and replies with a preset answer.
        struct CannedAsker {
            seen_prompt: Arc<StdMutex<Option<String>>>,
            answer: String,
        }
        #[async_trait]
        impl AskUser for CannedAsker {
            async fn ask(&self, prompt: &str) -> Result<String, String> {
                {
                    let mut slot = self.seen_prompt.lock().unwrap();
                    *slot = Some(prompt.to_owned());
                }
                Ok(self.answer.clone())
            }
        }

        // Scripted model: turn 0 asks the question, turn 1 copies the most
        // recent tool result into answer.txt, every later turn declares done.
        // Echoing the tool result into the write is what proves the answer
        // made it back to the generator.
        struct AskThenWrite {
            cursor: AtomicUsize,
        }
        #[async_trait]
        impl TurnGenerator for AskThenWrite {
            async fn generate(&self, req: GenerateRequest) -> Result<InferenceResult, String> {
                let reply = match self.cursor.fetch_add(1, Ordering::SeqCst) {
                    0 => turn(
                        "",
                        serde_json::json!([{
                            "id": "a1", "name": "ask_user",
                            "arguments": {"prompt": "which port?"}
                        }]),
                    ),
                    1 => {
                        // Newest ToolResult in the history is the ask_user answer.
                        let latest_result = req.messages.as_ref().and_then(|history| {
                            history.iter().rev().find_map(|msg| {
                                if let Message::ToolResult { content, .. } = msg {
                                    Some(content.clone())
                                } else {
                                    None
                                }
                            })
                        });
                        let answer = latest_result.unwrap_or_default();
                        turn(
                            "",
                            serde_json::json!([{
                                "id": "w1", "name": "write_file",
                                "arguments": {"path": "answer.txt", "content": answer}
                            }]),
                        )
                    }
                    _ => turn("done", serde_json::json!([])),
                };
                Ok(reply)
            }
        }

        let workdir = tempfile::tempdir().unwrap();
        let executor = WorktreeExecutor::new(workdir.path());
        let (sink, collected) = EventSink::collecting("coder-ask");
        let cancel: CancelFlag = Arc::new(std::sync::atomic::AtomicBool::new(false));

        let seen_prompt = Arc::new(StdMutex::new(None));
        let asker = CannedAsker {
            seen_prompt: seen_prompt.clone(),
            answer: CANNED_ANSWER.to_string(),
        };

        // The contract is green only once the port from the answer is on disk.
        let port_check = ContractCheck {
            name: "has_port".into(),
            command: "grep -q 8080 answer.txt".into(),
            expect_exit_zero: true,
            output_contains: None,
            timeout_secs: 10,
        };
        let contract = OutcomeContract {
            description: "answer.txt records the chosen port".into(),
            checks: vec![port_check],
        };

        let generator = AskThenWrite { cursor: AtomicUsize::new(0) };
        let config = NativeLoopConfig::default();
        let memory = RepairMemory::disabled();
        let outcome = run_native_loop(
            &generator,
            &executor,
            "pick a port and record it",
            &contract,
            &sink,
            &cancel,
            &config,
            &memory,
            Some(&asker),
        )
        .await;

        assert!(outcome.passed, "outcome: {outcome:?}");

        // The handler saw the model's question.
        let asked = seen_prompt.lock().unwrap().clone();
        assert_eq!(asked.as_deref(), Some("which port?"));

        // The answer reached the model and was written through.
        let written = std::fs::read_to_string(workdir.path().join("answer.txt")).unwrap();
        assert_eq!(written, CANNED_ANSWER);

        // The ask_user call surfaced in the event stream as a tool call (the
        // semantic UserInputRequested event is the GateAsker's job, covered by
        // the rpc round-trip test).
        let events = collected.lock().unwrap();
        let saw_ask_call = events.iter().any(|event| {
            matches!(
                &event.kind,
                CoderEventKind::ToolCall { tool, .. } if tool == ASK_USER_TOOL
            )
        });
        assert!(saw_ask_call);
    }

    /// Without an asker the `ask_user` tool must not be offered to the model
    /// on any turn.
    ///
    /// Only the "not offered" half is exercised here: the scripted generator
    /// never calls `ask_user`, so the recoverable-tool-error path for a model
    /// that calls it anyway is not covered by this test.
    #[tokio::test]
    async fn ask_user_without_handler_is_a_recoverable_error() {
        let dir = tempfile::tempdir().unwrap();
        let executor = WorktreeExecutor::new(dir.path());
        let (sink, _collected) = EventSink::collecting("coder-noask");
        let cancel: CancelFlag = Arc::new(std::sync::atomic::AtomicBool::new(false));

        // Generator that inspects the tool list of every request and latches
        // `offered` to true if `ask_user` ever appears in it.
        struct ToolPeek {
            offered: Arc<std::sync::atomic::AtomicBool>,
        }
        #[async_trait]
        impl TurnGenerator for ToolPeek {
            async fn generate(&self, req: GenerateRequest) -> Result<InferenceResult, String> {
                let has_ask = req
                    .tools
                    .as_ref()
                    .map(|ts| ts.iter().any(|t| t["name"] == ASK_USER_TOOL))
                    .unwrap_or(false);
                // Sticky OR rather than a plain store: a later request without
                // the tool must not erase an earlier request that offered it.
                self.offered.fetch_or(has_ask, Ordering::SeqCst);
                Ok(turn("done", serde_json::json!([])))
            }
        }
        let offered_flag = Arc::new(std::sync::atomic::AtomicBool::new(false));
        // Trivially green contract: this test is only about the offered tools.
        let contract = OutcomeContract {
            description: "noop".into(),
            checks: vec![ContractCheck {
                name: "ok".into(),
                command: "true".into(),
                expect_exit_zero: true,
                output_contains: None,
                timeout_secs: 10,
            }],
        };
        // The outcome itself is irrelevant here; only the flag is inspected.
        let _ = run_native_loop(
            &ToolPeek { offered: offered_flag.clone() },
            &executor,
            "x",
            &contract,
            &sink,
            &cancel,
            &NativeLoopConfig::default(),
            &RepairMemory::disabled(),
            None,
        )
        .await;
        assert!(
            !offered_flag.load(Ordering::SeqCst),
            "ask_user must not be offered when no handler is wired"
        );
    }

    /// A cancel flag that is already set when the loop is entered yields a
    /// "cancelled" error with zero iterations recorded.
    #[tokio::test]
    async fn cancellation_stops_the_loop() {
        let workdir = tempfile::tempdir().unwrap();
        let executor = WorktreeExecutor::new(workdir.path());
        let sink = EventSink::test_sink();

        // Cancellation is requested before the loop even starts.
        let cancel: CancelFlag = Arc::new(std::sync::atomic::AtomicBool::new(true));

        // An empty script: no turns are provided for the generator.
        let script = Script {
            cursor: AtomicUsize::new(0),
            turns: Vec::new(),
        };

        let lone_check = ContractCheck {
            name: "never".into(),
            command: "true".into(),
            expect_exit_zero: true,
            output_contains: None,
            timeout_secs: 10,
        };
        let contract = OutcomeContract {
            description: "d".into(),
            checks: vec![lone_check],
        };

        let config = NativeLoopConfig::default();
        let memory = RepairMemory::disabled();
        let outcome = run_native_loop(
            &script,
            &executor,
            "x",
            &contract,
            &sink,
            &cancel,
            &config,
            &memory,
            None,
        )
        .await;

        assert_eq!(outcome.error.as_deref(), Some("cancelled"));
        assert_eq!(outcome.iterations, 0);
    }

    /// `failure_feedback` mentions the failing check (its name and output
    /// tail) and does not report the passing check as failed.
    #[test]
    fn failure_feedback_lists_only_failures() {
        let passing = CheckResult {
            name: "good".into(),
            passed: true,
            exit_code: Some(0),
            output_tail: "ok".into(),
            duration_ms: 1,
        };
        let failing = CheckResult {
            name: "bad".into(),
            passed: false,
            exit_code: Some(1),
            output_tail: "assertion failed".into(),
            duration_ms: 1,
        };
        let results = vec![passing, failing];

        let feedback = failure_feedback(&results);

        // Both the failing check's label and its output must be present.
        for needle in ["FAILED bad", "assertion failed"] {
            assert!(feedback.contains(needle));
        }
        // The passing check must not be listed as a failure.
        assert!(!feedback.contains("FAILED good"));
    }

    /// The system prompt built from a contract contains the contract's check
    /// command and the "STOP calling tools" instruction.
    #[test]
    fn system_prompt_carries_the_contract() {
        let test_check = ContractCheck {
            name: "tests".into(),
            command: "cargo test -p demo".into(),
            expect_exit_zero: true,
            output_contains: None,
            timeout_secs: 300,
        };
        let contract = OutcomeContract {
            description: "make the tests pass".into(),
            checks: vec![test_check],
        };

        let prompt = system_prompt(&contract);

        for expected in ["cargo test -p demo", "STOP calling tools"] {
            assert!(prompt.contains(expected));
        }
    }

    /// `preview` returns input within the limit unchanged, and shortens longer
    /// input without panicking on a multi-byte character.
    #[test]
    fn preview_truncates_on_char_boundary() {
        // Input shorter than the limit comes back untouched.
        assert_eq!(preview("short", 10), "short");

        // "é" is two bytes in UTF-8, so an odd limit would fall in the middle
        // of a character if `preview` cut on a raw byte index (assumes the
        // limit is byte-based — TODO confirm against `preview`).
        let long = "é".repeat(300);
        let p = preview(&long, 5);

        // NOTE(review): the char literal here was empty (`''`), which is not
        // valid Rust. '…' is assumed to be the truncation marker `preview`
        // appends — confirm against its implementation.
        assert!(p.ends_with('…'), "truncated preview should end with the marker: {p:?}");
        assert!(p.chars().count() <= 4, "preview too long: {p:?}");
    }
}