car-server-core 0.24.1

Transport-neutral library for the CAR daemon JSON-RPC dispatcher (used by car-server and tokhn-daemon)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
//! Outcome contracts — the verifiable definition of "done" for a coder session.
//!
//! A contract is a set of shell commands that must pass inside the worktree.
//! It is derived from the user's intent by a model (with a bounded repair loop
//! mirroring `car-builder`: generation is an injected closure, so tests run
//! without inference) and then becomes the trust boundary for the whole
//! session: whatever engine did the work — the native loop or an external CLI
//! — the runtime re-runs the checks itself before asking for merge approval.

use serde::{Deserialize, Serialize};
use serde_json::Value;
use std::future::Future;

use super::session::{CoderEventKind, EventSink};
use super::shell_tool::WorktreeExecutor;

/// Serde default for [`ContractCheck::expect_exit_zero`]: a check that omits
/// the field requires exit code 0.
fn default_true() -> bool {
    true
}

/// Serde default for [`ContractCheck::timeout_secs`], expressed in seconds.
fn default_check_timeout() -> u64 {
    const DEFAULT_CHECK_TIMEOUT_SECS: u64 = 120;
    DEFAULT_CHECK_TIMEOUT_SECS
}

/// The verifiable definition of done for a coding session.
///
/// Serialized with serde as an object carrying `description` and `checks`.
/// [`OutcomeContract::validate`] reports structural problems and
/// [`OutcomeContract::render`] produces the human-readable form.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct OutcomeContract {
    /// Human summary of what success means.
    pub description: String,
    /// Checks that must all pass. Evaluated through the policy-gated shell
    /// tool, so a malicious "check" cannot do what the agent itself couldn't.
    /// An empty list is reported as an issue by `validate`.
    pub checks: Vec<ContractCheck>,
}

/// One acceptance check: a shell command run at the worktree root.
///
/// Only `name` and `command` are required when deserializing; the remaining
/// fields fall back to the serde defaults noted on each field.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ContractCheck {
    /// Short, unique label ("tests_pass", "file_created").
    /// `OutcomeContract::validate` rejects empty and duplicate names.
    pub name: String,
    /// Command run via the worktree shell tool.
    /// `OutcomeContract::validate` rejects an empty command.
    pub command: String,
    /// Require exit code 0 (default true).
    /// Setting this to false without `output_contains` leaves the check
    /// asserting nothing, which `OutcomeContract::validate` reports.
    #[serde(default = "default_true")]
    pub expect_exit_zero: bool,
    /// Additionally require this substring in the combined output.
    /// Defaults to `None` and is omitted from serialized output when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub output_contains: Option<String>,
    /// Per-check timeout (default 120s; the shell tool clamps further).
    #[serde(default = "default_check_timeout")]
    pub timeout_secs: u64,
}

/// Result of evaluating one [`ContractCheck`].
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct CheckResult {
    /// Name of the check this result belongs to (copied from the check).
    pub name: String,
    /// True only when the command ran, did not time out, and every assertion
    /// configured on the check (exit code, output substring) held.
    pub passed: bool,
    /// None when the command could not run at all (spawn/policy failure), or
    /// when the shell tool's result carried no integer `exit_code`.
    pub exit_code: Option<i64>,
    /// Tail of combined stdout+stderr — enough for repair prompts and the UI.
    /// When the command could not run, this holds the failure message instead.
    pub output_tail: String,
    /// Wall-clock time spent running the command, in milliseconds.
    pub duration_ms: u64,
}

impl OutcomeContract {
    /// Collect every problem that makes this contract unusable; an empty
    /// vector means the contract is valid.
    ///
    /// Besides purely structural faults (no checks, empty or duplicate names,
    /// empty commands, checks that assert nothing) two model failure modes
    /// are rejected as well: the prompt's literal placeholder name surviving
    /// verbatim, and "toolchain-only" probes such as `cargo --version` that
    /// say nothing about the change. Reporting them as issues lets the repair
    /// loop fix them instead of letting a contract that gates nothing become
    /// the trust boundary.
    ///
    /// Issues are reported per check, in check order, so the resulting list
    /// is stable for a given contract.
    pub fn validate(&self) -> Vec<String> {
        const PLACEHOLDER_NAME: &str = "unique_snake_case_label";

        let mut problems: Vec<String> = Vec::new();
        if self.checks.is_empty() {
            problems.push("contract has no checks — at least one is required".to_string());
        }

        // Names are compared after trimming, so " a" and "a" collide.
        let mut names_seen = std::collections::HashSet::new();
        for (index, check) in self.checks.iter().enumerate() {
            let label = check.name.trim();
            let command = check.command.trim();

            if label.is_empty() {
                problems.push(format!("check #{index} has an empty name"));
            }
            if label == PLACEHOLDER_NAME {
                problems.push(format!(
                    "check #{index} kept the literal placeholder name \
                     'unique_snake_case_label' — give it a real descriptive label"
                ));
            }

            if command.is_empty() {
                problems.push(format!("check '{}' has an empty command", check.name));
            } else if is_toolchain_only(command) {
                problems.push(format!(
                    "check '{}' runs a toolchain-only no-op (`{}`) that verifies the \
                     tool is installed, not the task — replace it with a command that \
                     exercises the actual change",
                    check.name, command
                ));
            }

            let first_use = names_seen.insert(label);
            if !first_use {
                problems.push(format!("duplicate check name '{}'", check.name));
            }

            let asserts_something = check.expect_exit_zero || check.output_contains.is_some();
            if !asserts_something {
                problems.push(format!(
                    "check '{}' asserts nothing (expect_exit_zero=false and no output_contains)",
                    check.name
                ));
            }
        }
        problems
    }

    /// Produce the plain-text form used in prompts and CLI output: the
    /// trimmed description, a `Checks:` header, then one bullet per check
    /// with its command and whatever expectations it carries.
    pub fn render(&self) -> String {
        let mut rendered = format!("{}\nChecks:\n", self.description.trim());
        for check in &self.checks {
            // Exit-code expectation first, then the substring expectation.
            let expectations: Vec<String> = check
                .expect_exit_zero
                .then(|| "exit 0".to_string())
                .into_iter()
                .chain(
                    check
                        .output_contains
                        .as_ref()
                        .map(|needle| format!("output contains {needle:?}")),
                )
                .collect();

            rendered.push_str(&format!("- {}: `{}`", check.name, check.command));
            if !expectations.is_empty() {
                rendered.push_str(&format!(" (expects {})", expectations.join(", ")));
            }
            rendered.push('\n');
        }
        rendered
    }
}

/// Detects commands that merely probe whether a build tool is installed
/// (`cargo --version`, `node -v`, `go version`, `make --help`, ...) and so
/// prove nothing about the task.
///
/// Deliberately conservative: the command must be exactly two
/// whitespace-separated words, a known tool followed by one of `--version`,
/// `-V`, `-v`, `--help`, `-h` or the bare `version` subcommand, with no shell
/// composition (`&&`, `|`, `;`, newline). Real checks such as
/// `cargo run -- --version`, `cargo build` or `./target/debug/app --version`
/// are therefore never flagged.
fn is_toolchain_only(command: &str) -> bool {
    const TOOLS: &[&str] = &[
        "cargo", "rustc", "rustup", "node", "npm", "npx", "yarn", "pnpm", "python", "python3",
        "pip", "pip3", "go", "java", "javac", "ruby", "gem", "dotnet", "deno", "bun", "tsc", "gcc",
        "clang", "make", "cmake",
    ];
    const FLAGS: &[&str] = &["--version", "-V", "-v", "--help", "-h", "version"];

    // Composed commands do more than a bare probe. A lone `|` also covers `||`.
    let composed = command.contains("&&")
        || command.contains(|ch: char| matches!(ch, '|' | ';' | '\n'));
    if composed {
        return false;
    }

    // Exactly `<tool> <flag>`; a third word means a subcommand or argument.
    let mut words = command.split_whitespace();
    match (words.next(), words.next(), words.next()) {
        (Some(tool), Some(flag), None) => TOOLS.contains(&tool) && FLAGS.contains(&flag),
        _ => false,
    }
}

/// Assemble the prompt that asks a model to derive an outcome contract.
///
/// The fixed part embeds `intent` and `repo_summary`, the expected JSON
/// shape, and the rules a contract must follow. When `issues` is non-empty
/// (repair feedback from a previous failed attempt, car-builder pattern) a
/// trailing section lists each issue as a `- ` bullet.
fn build_contract_prompt(intent: &str, repo_summary: &str, issues: &[String]) -> String {
    let mut prompt = format!(
        "You are deriving an OUTCOME CONTRACT for a coding task: a small set of shell \
         commands that objectively verify the task is done. The commands run at the root of a \
         fresh git checkout of the repository, non-interactively, with no TTY.\n\n\
         Task intent:\n{intent}\n\n\
         Repository summary:\n{repo_summary}\n\n\
         Respond with ONLY a JSON object, no prose, no markdown fences, in this shape:\n\
         {{\n  \"description\": \"one-sentence definition of done\",\n  \"checks\": [\n    \
         {{\"name\": \"unique_snake_case_label\", \"command\": \"shell command\", \
         \"expect_exit_zero\": true, \"output_contains\": null, \"timeout_secs\": 120}}\n  ]\n}}\n\n\
         Rules:\n\
         - 1 to 5 checks. Each must verify THE TASK ITSELF, not just that the toolchain works \
           (e.g. `rustc --version` or `cargo --version` prove nothing about the change).\n\
         - At least one check should exercise the actual new behaviour the intent describes \
           (run the program/test that the change affects).\n\
         - `name` must be a real, descriptive snake_case label unique within the contract — \
           never the literal placeholder `unique_snake_case_label`.\n\
         - Every command must run non-interactively and deterministically (no prompts, no \
           watchers, no servers that don't exit). Use the repo's own build/test commands when \
           the summary reveals them — a build that must compile the change is a strong check.\n\
         - `expect_exit_zero: true` (the default) is usually enough. Only set `output_contains` \
           to a substring you are CERTAIN will appear verbatim in stdout/stderr; if unsure, \
           leave it null. Do NOT invent example output or placeholder values.\n\
         - Never use git push, network access, sudo, or anything destructive outside the \
           checkout. Timeouts are in seconds; keep them realistic for a build.\n"
    );

    // First attempt: nothing to repair, the base prompt is the whole prompt.
    if issues.is_empty() {
        return prompt;
    }

    prompt.push_str("\nYour previous attempt FAILED validation with these issues — fix them:\n");
    for issue in issues {
        prompt.push_str("- ");
        prompt.push_str(issue);
        prompt.push('\n');
    }
    prompt
}

/// Extract the first JSON object from model output, tolerating code fences and
/// surrounding prose.
///
/// Strategy:
/// 1. Fast path: parse the span from the first `{` to the last `}`. This
///    covers clean output, fenced output and prose that contains no braces.
/// 2. Fallback: if that span is not valid JSON (for example trailing prose or
///    a second snippet contains `}`, or leading prose contains `{`), try each
///    `{` in order and return the first position from which one complete JSON
///    value can be parsed, ignoring whatever follows it.
///
/// # Errors
///
/// Returns a message when the text has no `{`, no `}`, the last `}` precedes
/// the first `{`, or no `{` starts a parseable JSON object. In the last case
/// the message carries the parse error from the fast path.
pub(crate) fn extract_json_object(text: &str) -> Result<Value, String> {
    let start = text.find('{').ok_or("no JSON object found in output")?;
    let end = text.rfind('}').ok_or("no closing brace found in output")?;
    if end < start {
        return Err("malformed JSON object in output".to_string());
    }

    let span_err = match serde_json::from_str::<Value>(&text[start..=end]) {
        Ok(value) => return Ok(value),
        Err(e) => e,
    };

    // The stream deserializer stops after the first complete value, so text
    // following the object (prose, a closing fence, another snippet) no
    // longer poisons the parse. Candidates past the last `}` cannot close.
    for (offset, _) in text[..=end].match_indices('{') {
        let mut values = serde_json::Deserializer::from_str(&text[offset..]).into_iter::<Value>();
        if let Some(Ok(value)) = values.next() {
            return Ok(value);
        }
    }

    Err(format!("invalid JSON: {span_err}"))
}

/// Derive an [`OutcomeContract`] for `intent` by repeatedly calling the
/// injected `generate` closure inside a bounded validate→repair loop.
///
/// Each attempt builds a prompt (carrying the previous attempt's issues, if
/// any), asks `generate` for text, extracts and deserializes the JSON, and
/// validates the contract. The first contract with no validation issues is
/// returned. `max_attempts` is clamped to at least 1.
///
/// # Errors
///
/// After the final attempt, returns a message naming the attempt count and
/// the most recent failure (generation error, parse error, schema mismatch
/// or validation issues).
pub async fn derive_contract<F, Fut>(
    generate: F,
    intent: &str,
    repo_summary: &str,
    max_attempts: u32,
) -> Result<OutcomeContract, String>
where
    F: Fn(String) -> Fut + Send + Sync,
    Fut: Future<Output = Result<String, String>> + Send,
{
    let attempts = max_attempts.max(1);
    let mut feedback: Vec<String> = Vec::new();
    let mut latest_failure = String::new();

    for _ in 0..attempts {
        let raw = match generate(build_contract_prompt(intent, repo_summary, &feedback)).await {
            Ok(text) => text,
            Err(e) => {
                // Transient model/transport failure: the feedback is left
                // untouched so the next attempt reuses the same prompt.
                latest_failure = format!("generation failed: {e}");
                continue;
            }
        };

        // Parse and schema failures each become the sole repair issue.
        let parsed = extract_json_object(&raw)
            .map_err(|e| format!("output did not parse: {e}. Return ONLY the JSON object."))
            .and_then(|value| {
                serde_json::from_value::<OutcomeContract>(value)
                    .map_err(|e| format!("JSON did not match the contract schema: {e}"))
            });
        let contract = match parsed {
            Ok(contract) => contract,
            Err(issue) => {
                latest_failure = issue.clone();
                feedback = vec![issue];
                continue;
            }
        };

        let problems = contract.validate();
        if problems.is_empty() {
            return Ok(contract);
        }
        latest_failure = problems.join("; ");
        feedback = problems;
    }

    Err(format!(
        "could not derive a valid outcome contract after {attempts} attempts: {latest_failure}"
    ))
}

/// Execute each check of `contract` through the worktree shell tool and
/// return one [`CheckResult`] per check, in contract order.
///
/// A failing check never stops the run: repair prompts and the UI want the
/// full picture, and checks are independent by construction. A
/// `CheckStarted` event is emitted before each command and a
/// `CheckCompleted` event (carrying the result) after it.
pub async fn evaluate_contract(
    contract: &OutcomeContract,
    executor: &WorktreeExecutor,
    sink: &EventSink,
) -> Vec<CheckResult> {
    let mut report = Vec::with_capacity(contract.checks.len());

    for check in contract.checks.iter() {
        sink.emit(CoderEventKind::CheckStarted {
            name: check.name.clone(),
        });

        let clock = std::time::Instant::now();
        let run = executor
            .run_shell(&check.command, Some(check.timeout_secs))
            .await;
        let duration_ms = clock.elapsed().as_millis() as u64;

        let result = match run {
            // The command never produced a result object at all.
            Err(e) => CheckResult {
                name: check.name.clone(),
                passed: false,
                exit_code: None,
                output_tail: format!("check failed to run: {e}"),
                duration_ms,
            },
            Ok(payload) => {
                let exit_code = payload.get("exit_code").and_then(Value::as_i64);
                let output = payload
                    .get("output")
                    .and_then(Value::as_str)
                    .unwrap_or_default();
                // A missing or non-boolean flag counts as "did not time out".
                let timed_out = payload.get("timed_out").and_then(Value::as_bool) == Some(true);

                let exit_satisfied = exit_code == Some(0) || !check.expect_exit_zero;
                let output_satisfied = match check.output_contains.as_deref() {
                    Some(needle) => output.contains(needle),
                    None => true,
                };

                CheckResult {
                    name: check.name.clone(),
                    passed: !timed_out && exit_satisfied && output_satisfied,
                    exit_code,
                    output_tail: super::shell_tool::tail(output, 4 * 1024),
                    duration_ms,
                }
            }
        };

        sink.emit(CoderEventKind::CheckCompleted {
            result: result.clone(),
        });
        report.push(result);
    }

    report
}

// Unit tests for contract derivation, validation and evaluation. Model
// generation is replaced by closures, so no inference runs here; the
// evaluation tests run real shell commands inside a temporary directory.
#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::atomic::{AtomicUsize, Ordering};

    // Minimal valid contract JSON. It omits `expect_exit_zero` and
    // `timeout_secs`, so the serde defaults apply when it is deserialized.
    const VALID: &str = r#"{
        "description": "file exists",
        "checks": [{"name": "exists", "command": "test -f x.txt"}]
    }"#;

    #[test]
    fn prompt_steers_toward_verifying_the_task_and_real_labels() {
        let p = build_contract_prompt(
            "add a --version flag",
            "Top-level entries: Cargo.toml, src\nBuild systems detected: Rust (cargo)",
            &[],
        );
        // Carries the task and repo orientation.
        assert!(p.contains("add a --version flag"));
        assert!(p.contains("Rust (cargo)"));
        // Steers away from toolchain-only checks and placeholder labels.
        assert!(p.contains("verify THE TASK ITSELF"));
        assert!(p.contains("rustc --version"), "names the toolchain-only anti-pattern");
        assert!(p.contains("never the literal placeholder"));
        // Guards the common small-model failure modes.
        assert!(p.contains("non-interactively"));
        assert!(p.contains("CERTAIN will appear"), "output_contains caution present");
        assert!(p.contains("no markdown fences"));
    }

    #[test]
    fn repair_prompt_appends_prior_issues() {
        let p = build_contract_prompt("t", "r", &["check 'a' has an empty command".into()]);
        assert!(p.contains("FAILED validation"));
        assert!(p.contains("empty command"));
    }

    #[tokio::test]
    async fn derives_on_first_valid_attempt() {
        let c = derive_contract(|_p| async { Ok::<_, String>(VALID.into()) }, "make x", "repo", 3)
            .await
            .unwrap();
        assert_eq!(c.checks.len(), 1);
        assert!(c.checks[0].expect_exit_zero, "default applies");
        assert_eq!(c.checks[0].timeout_secs, 120);
    }

    // The JSON is wrapped in prose and a markdown fence; extraction must
    // still find the object on the first attempt.
    #[tokio::test]
    async fn repairs_fenced_and_chatty_output() {
        let fenced = format!("Sure! Here is the contract:\n```json\n{VALID}\n```");
        let c = derive_contract(
            |_p| {
                let text = fenced.clone();
                async move { Ok::<_, String>(text) }
            },
            "x",
            "r",
            3,
        )
        .await
        .unwrap();
        assert_eq!(c.checks[0].name, "exists");
    }

    // First attempt returns a contract with no checks; the second attempt
    // must receive a repair prompt and then succeeds.
    #[tokio::test]
    async fn invalid_then_repaired() {
        let calls = AtomicUsize::new(0);
        let c = derive_contract(
            |prompt: String| {
                let n = calls.fetch_add(1, Ordering::SeqCst) + 1;
                async move {
                    if n == 1 {
                        Ok::<_, String>(r#"{"description": "no checks", "checks": []}"#.into())
                    } else {
                        assert!(prompt.contains("FAILED validation"), "repair prompt carries issues");
                        Ok(VALID.into())
                    }
                }
            },
            "x",
            "r",
            3,
        )
        .await
        .unwrap();
        assert_eq!(c.checks.len(), 1);
    }

    #[tokio::test]
    async fn gives_up_with_error_after_max() {
        let err = derive_contract(
            |_p| async { Ok::<_, String>("not json at all".into()) },
            "x",
            "r",
            2,
        )
        .await
        .unwrap_err();
        assert!(err.contains("after 2 attempts"), "{err}");
    }

    #[test]
    fn is_toolchain_only_flags_bare_version_probes_only() {
        // Bare version/help probes of build tools — these gate nothing.
        for c in [
            "cargo --version",
            "cargo -V",
            "rustc --version",
            "node -v",
            "npm --version",
            "python3 --version",
            "go version",
            "make --help",
        ] {
            assert!(is_toolchain_only(c), "should flag `{c}`");
        }
        // Real checks that exercise the change must NOT be flagged.
        for c in [
            "cargo build",
            "cargo test",
            "cargo run -- --version",
            "cargo run --release -- --version",
            "./target/debug/greeter --version",
            "cargo --version && cargo build",
            "test -f src/main.rs",
            "grep -q version Cargo.toml",
            "rustc src/main.rs -o /tmp/x",
        ] {
            assert!(!is_toolchain_only(c), "should NOT flag `{c}`");
        }
    }

    #[test]
    fn validate_rejects_toolchain_only_and_placeholder_name() {
        let c = OutcomeContract {
            description: "d".into(),
            checks: vec![ContractCheck {
                // The literal placeholder leaking through, paired with a
                // toolchain-only command — both seen live from a 1.7B model.
                name: "unique_snake_case_label".into(),
                command: "cargo --version".into(),
                expect_exit_zero: true,
                output_contains: None,
                timeout_secs: 120,
            }],
        };
        let issues = c.validate();
        assert!(
            issues.iter().any(|i| i.contains("placeholder name")),
            "{issues:?}"
        );
        assert!(
            issues.iter().any(|i| i.contains("toolchain-only no-op")),
            "{issues:?}"
        );
    }

    #[tokio::test]
    async fn derive_repairs_a_toolchain_only_first_attempt() {
        let calls = AtomicUsize::new(0);
        let toolchain_only = r#"{"description":"v","checks":[
            {"name":"unique_snake_case_label","command":"cargo --version"}]}"#;
        let real = r#"{"description":"v","checks":[
            {"name":"version_flag_prints","command":"cargo run -- --version"}]}"#;
        let c = derive_contract(
            |prompt: String| {
                let n = calls.fetch_add(1, Ordering::SeqCst) + 1;
                async move {
                    if n == 1 {
                        Ok::<_, String>(toolchain_only.into())
                    } else {
                        // Repair prompt must carry both rejections.
                        assert!(prompt.contains("toolchain-only no-op"), "{prompt}");
                        assert!(prompt.contains("placeholder name"), "{prompt}");
                        Ok(real.into())
                    }
                }
            },
            "add a --version flag",
            "Rust (cargo)",
            3,
        )
        .await
        .unwrap();
        assert_eq!(c.checks[0].command, "cargo run -- --version");
        assert_eq!(calls.load(Ordering::SeqCst), 2, "took exactly one repair");
    }

    // Two checks share the name "a": the first asserts nothing, the second
    // has an empty command, so all three structural issues appear at once.
    #[test]
    fn validate_catches_empty_and_duplicate_and_assertless() {
        let c = OutcomeContract {
            description: "d".into(),
            checks: vec![
                ContractCheck {
                    name: "a".into(),
                    command: "true".into(),
                    expect_exit_zero: false,
                    output_contains: None,
                    timeout_secs: 5,
                },
                ContractCheck {
                    name: "a".into(),
                    command: "".into(),
                    expect_exit_zero: true,
                    output_contains: None,
                    timeout_secs: 5,
                },
            ],
        };
        let issues = c.validate();
        assert!(issues.iter().any(|i| i.contains("asserts nothing")));
        assert!(issues.iter().any(|i| i.contains("empty command")));
        assert!(issues.iter().any(|i| i.contains("duplicate")));
    }

    #[tokio::test]
    async fn evaluate_passes_and_fails_checks_in_a_real_dir() {
        let dir = tempfile::tempdir().unwrap();
        std::fs::write(dir.path().join("present.txt"), "hello needle").unwrap();
        let exec = WorktreeExecutor::new(dir.path());
        let sink = EventSink::test_sink();
        let contract = OutcomeContract {
            description: "d".into(),
            checks: vec![
                ContractCheck {
                    name: "exists".into(),
                    command: "test -f present.txt".into(),
                    expect_exit_zero: true,
                    output_contains: None,
                    timeout_secs: 10,
                },
                ContractCheck {
                    name: "content".into(),
                    command: "cat present.txt".into(),
                    expect_exit_zero: true,
                    output_contains: Some("needle".into()),
                    timeout_secs: 10,
                },
                ContractCheck {
                    name: "missing".into(),
                    command: "test -f absent.txt".into(),
                    expect_exit_zero: true,
                    output_contains: None,
                    timeout_secs: 10,
                },
            ],
        };
        let results = evaluate_contract(&contract, &exec, &sink).await;
        assert_eq!(results.len(), 3, "all checks run even after a failure");
        assert!(results[0].passed);
        assert!(results[1].passed);
        assert!(!results[2].passed);
        assert_eq!(results[2].exit_code, Some(1));
    }

    #[tokio::test]
    async fn evaluate_fails_on_missing_substring() {
        let dir = tempfile::tempdir().unwrap();
        let exec = WorktreeExecutor::new(dir.path());
        let sink = EventSink::test_sink();
        let contract = OutcomeContract {
            description: "d".into(),
            checks: vec![ContractCheck {
                name: "needle".into(),
                command: "echo haystack".into(),
                expect_exit_zero: true,
                output_contains: Some("needle".into()),
                timeout_secs: 10,
            }],
        };
        let results = evaluate_contract(&contract, &exec, &sink).await;
        assert!(!results[0].passed, "exit 0 but substring missing must fail");
        assert_eq!(results[0].exit_code, Some(0));
    }
}