agentwerk 0.1.8

A minimal Rust crate that gives any application agentic capabilities.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
//! How structured output flows from declaration to validated `AgentOutput`.
//!
//! `output_schema` turns the agent loop into a tiny state machine on top of
//! the regular run loop:
//!
//! ```text
//!   S0 (instruction) ──turn 1──▶ S1 (model text) ──validate─┐
//!//!                                 ┌─── valid ───────────────┘
//!//!                       AgentOutput { response: Some(value) }
//!
//!                                 ┌─── invalid ─────────────┐
//!                                 ▼                         │
//!                  S2 (text + validator correction) ──turn 2──▶ S1' ─▶ ...
//! ```
//!
//! The validator only fires at the natural end of turn — no tool calls, not
//! truncated, no pending peer messages. Tools, truncation, peer messages,
//! and idle wait all short-circuit *before* validation, so a schema agent
//! can interleave tool work freely; only its terminal text reply is parsed.
//!
//! Run with `cargo test --test structured_output -- --nocapture` to inspect
//! the per-turn snapshots.
//!
//! Mirrors the structure of `context_window_events.rs`: a file-level
//! state-machine thesis, one snapshot test that walks every node, and
//! focused tests grouped by concern.
//!
//! - **Centerpiece** — the snapshot state machine.
//! - **A. Happy paths** — single-turn valid output, lenient code-fence handling.
//! - **B. Retry / failure** — corrective messages, schema violations, exhaustion.
//! - **C. Tools and end-conditions** — tools run first, truncation defers, guards skip.
//! - **D. Sub-agent boundary** — the propagation bug: registered, ad-hoc, and background.

use agentwerk::testutil::{
    text_response, tool_response, truncated_response, MockProvider, MockTool, TestHarness,
};
use agentwerk::{Agent, AgenticError, CompletionRequest, ContentBlock, Message, SpawnAgentTool};

fn answer_schema() -> serde_json::Value {
    serde_json::json!({
        "type": "object",
        "properties": { "answer": { "type": "integer" } },
        "required": ["answer"]
    })
}

const VALID_JSON: &str = r#"{"answer":42}"#;

fn report_schema() -> serde_json::Value {
    serde_json::json!({
        "type": "object",
        "properties": {
            "category": { "type": "string" },
            "priority": { "type": "string" },
            "summary": { "type": "string" },
            "findings": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "file":     { "type": "string" },
                        "line":     { "type": "integer" },
                        "severity": { "type": "string" }
                    },
                    "required": ["file", "line", "severity"]
                }
            }
        },
        "required": ["category", "priority", "summary", "findings"]
    })
}

const VALID_REPORT_JSON: &str = r#"{"category":"security","priority":"high","summary":"Two SQL-injection paths in the auth flow.","findings":[{"file":"src/auth.rs","line":47,"severity":"critical"},{"file":"src/db.rs","line":112,"severity":"high"}]}"#;

fn report_agent() -> Agent {
    Agent::new()
        .name("reviewer")
        .model_name("mock")
        .identity_prompt("You are a code reviewer. Reply with a structured report.")
        .behavior_prompt("")
        .output_schema(report_schema())
}

const INVALID_REPORT_JSON: &str = r#"{"category":"security","priority":"high","findings":[]}"#;

const S0_INITIAL: &str = "\
=== system ===
You are a code reviewer. Reply with a structured report.

IMPORTANT: You MUST return your final response as a single JSON value that \
conforms to the declared output schema. After using any tools needed to complete \
the task, your last message MUST be the JSON value, exactly once. Do not wrap it \
in markdown code fences. Do not include any text before or after the JSON.

=== messages[0] user ===
<environment>
Working directory: <WORKING_DIRECTORY>
Platform: <PLATFORM>
OS version: <OS_VERSION>
Date: <DATE>
</environment>

=== messages[1] user ===
Review the auth module.
";

const S1_AFTER_INVALID_REPLY: &str = "\
=== system ===
You are a code reviewer. Reply with a structured report.

IMPORTANT: You MUST return your final response as a single JSON value that \
conforms to the declared output schema. After using any tools needed to complete \
the task, your last message MUST be the JSON value, exactly once. Do not wrap it \
in markdown code fences. Do not include any text before or after the JSON.

=== messages[0] user ===
<environment>
Working directory: <WORKING_DIRECTORY>
Platform: <PLATFORM>
OS version: <OS_VERSION>
Date: <DATE>
</environment>

=== messages[1] user ===
Review the auth module.

=== messages[2] assistant ===
{\"category\":\"security\",\"priority\":\"high\",\"findings\":[]}

=== messages[3] user ===
Your last reply did not match the required output schema. You MUST reply with a \
single JSON value conforming to the schema, with no surrounding text and no code \
fences.

Validator said: Schema validation error at summary: missing required field
";

const S2_AFTER_VALID_REPLY: &str = "\
=== system ===
You are a code reviewer. Reply with a structured report.

IMPORTANT: You MUST return your final response as a single JSON value that \
conforms to the declared output schema. After using any tools needed to complete \
the task, your last message MUST be the JSON value, exactly once. Do not wrap it \
in markdown code fences. Do not include any text before or after the JSON.

=== messages[0] user ===
<environment>
Working directory: <WORKING_DIRECTORY>
Platform: <PLATFORM>
OS version: <OS_VERSION>
Date: <DATE>
</environment>

=== messages[1] user ===
Review the auth module.

=== messages[2] assistant ===
{\"category\":\"security\",\"priority\":\"high\",\"findings\":[]}

=== messages[3] user ===
Your last reply did not match the required output schema. You MUST reply with a \
single JSON value conforming to the schema, with no surrounding text and no code \
fences.

Validator said: Schema validation error at summary: missing required field

=== messages[4] assistant ===
{\"category\":\"security\",\"priority\":\"high\",\"summary\":\"Two SQL-injection paths in the auth flow.\",\"findings\":[{\"file\":\"src/auth.rs\",\"line\":47,\"severity\":\"critical\"},{\"file\":\"src/db.rs\",\"line\":112,\"severity\":\"high\"}]}
";

#[tokio::test]
async fn state_machine_advances_invalid_then_valid() {
    // Two turns:
    //   turn 1 → INVALID_REPORT_JSON  → schema mismatch → push corrective msg
    //   turn 2 → VALID_REPORT_JSON    → validates       → terminate
    let provider = MockProvider::new(vec![
        text_response(INVALID_REPORT_JSON),
        text_response(VALID_REPORT_JSON),
    ]);

    let harness = TestHarness::new(provider);
    let output = harness
        .run_agent(&report_agent(), "Review the auth module.")
        .await
        .unwrap();

    assert_eq!(
        harness.provider().request_count(),
        2,
        "two turns: invalid + valid"
    );
    assert_eq!(output.response_raw, VALID_REPORT_JSON);
    assert_eq!(
        output.response,
        Some(serde_json::from_str::<serde_json::Value>(VALID_REPORT_JSON).unwrap()),
    );

    let reqs = harness.provider().requests.lock().unwrap();
    let state = |i: usize| canonicalize(&render(&reqs[i]));
    assert_eq!(state(0), S0_INITIAL);
    assert_eq!(state(1), S1_AFTER_INVALID_REPLY);

    // Loop terminated on turn 2 without a third request. Reconstruct what req[2] would have been
    // (req[1]'s context plus the final assistant reply) to snapshot the end-to-end state machine.
    let mut terminal_messages = reqs[1].messages.clone();
    terminal_messages.push(Message::Assistant {
        content: vec![ContentBlock::Text {
            text: output.response_raw.clone(),
        }],
    });
    let terminal = render_conversation(&reqs[1].system_prompt, &terminal_messages);
    assert_eq!(canonicalize(&terminal), S2_AFTER_VALID_REPLY);
}

#[tokio::test]
async fn schema_agent_does_not_inject_synthetic_tool() {
    // The rewrite removed `StructuredOutputTool` from the registry and the
    // forced `tool_choice = Specific("…")` that came with it; validation
    // moved into `try_finish` against the model's natural text reply. Every
    // test in this file goes green even if a future commit silently puts a
    // synthetic tool back — the model would just call it and the validated
    // value would still flow. Pin the contract here so a regression to the
    // old "fake tool" approach fails immediately and loudly.
    let provider = MockProvider::new(vec![text_response(VALID_JSON)]);
    let harness = TestHarness::new(provider);
    harness.run_agent(&schema_agent(), "go").await.unwrap();

    let req = harness.provider().last_request().unwrap();
    let advertised: Vec<&String> = req.tools.iter().map(|t| &t.name).collect();
    assert!(
        req.tools.is_empty(),
        "schema agent must advertise no tool to the LLM, got: {advertised:?}"
    );
    assert!(
        req.tool_choice.is_none(),
        "schema agent must not force a tool_choice (the model speaks JSON freely)"
    );
    assert!(
        req.system_prompt
            .contains("You MUST return your final response as a single JSON value"),
        "schema contract must be carried by the system prompt — got:\n{}",
        req.system_prompt,
    );
}

#[tokio::test]
async fn schema_agent_with_user_tools_still_advertises_no_synthetic_tool() {
    // Same invariant holds when the user added their own tools — the schema
    // contract lives in the prompt, not the registry, regardless of what
    // else is in the registry. We don't actually use the tool here; we just
    // need to prove the user tool is the *only* one advertised.
    let provider = MockProvider::new(vec![text_response(VALID_JSON)]);
    let agent = schema_agent().tool(MockTool::new("lookup", true, "ok"));

    let harness = TestHarness::new(provider);
    harness.run_agent(&agent, "go").await.unwrap();

    let req = harness.provider().last_request().unwrap();
    let names: Vec<&String> = req.tools.iter().map(|t| &t.name).collect();
    assert_eq!(
        names,
        vec![&"lookup".to_string()],
        "only the user-registered tool should be advertised, got: {names:?}"
    );
    assert!(req.tool_choice.is_none());
}

#[tokio::test]
async fn valid_json_terminates_in_one_turn() {
    // The whole point of going text-based: a valid reply terminates the loop
    // immediately. No follow-up round-trip.
    let provider = MockProvider::new(vec![text_response(VALID_JSON)]);
    let harness = TestHarness::new(provider);
    let output = harness
        .run_agent(&schema_agent(), "What is the answer?")
        .await
        .unwrap();

    assert_eq!(harness.provider().request_count(), 1);
    assert_eq!(output.response, Some(serde_json::json!({"answer": 42})));
    assert_eq!(output.response_raw, VALID_JSON);
}

#[tokio::test]
async fn code_fenced_json_accepted_leniently() {
    // The most common model mistake: wrapping JSON in a ```json fence.
    // strip_code_fences peels the wrapper so we don't burn a retry.
    let fenced = "```json\n{\"answer\":42}\n```";
    let provider = MockProvider::new(vec![text_response(fenced)]);
    let harness = TestHarness::new(provider);
    let output = harness
        .run_agent(&schema_agent(), "What is the answer?")
        .await
        .unwrap();

    assert_eq!(harness.provider().request_count(), 1);
    assert_eq!(output.response, Some(serde_json::json!({"answer": 42})));
}

#[tokio::test]
async fn valid_complex_report_terminates_in_one_turn() {
    // Realistic case: a structured report with nested objects in an array and
    // mixed primitive types. Proves the loop handles non-trivial JSON the same
    // way it handles the toy `{answer:42}` — a single round-trip, with the
    // parsed value reachable via `output.response` and the raw text preserved
    // verbatim in `output.response_raw`.
    let provider = MockProvider::new(vec![text_response(VALID_REPORT_JSON)]);
    let harness = TestHarness::new(provider);
    let output = harness
        .run_agent(&report_agent(), "Review the auth module.")
        .await
        .unwrap();

    assert_eq!(harness.provider().request_count(), 1);
    assert_eq!(output.response_raw, VALID_REPORT_JSON);

    // Walk into the parsed value to prove every layer survived intact —
    // not just that the top-level keys are present.
    let value = output.response.expect("response must be Some");
    assert_eq!(value["category"], "security");
    assert_eq!(value["priority"], "high");

    let findings = value["findings"].as_array().expect("findings is an array");
    assert_eq!(findings.len(), 2);
    assert_eq!(findings[0]["file"], "src/auth.rs");
    assert_eq!(findings[0]["line"], 47);
    assert_eq!(findings[0]["severity"], "critical");
    assert_eq!(findings[1]["file"], "src/db.rs");
    assert_eq!(findings[1]["line"], 112);
}

#[tokio::test]
async fn non_json_reply_triggers_retry_with_validator_detail() {
    // Loop must surface the parse error to the model so it can self-correct.
    // The next user message bundles both the static retry instruction and a
    // dynamic "Validator said: …" line naming the failure.
    let provider = MockProvider::new(vec![
        text_response("the answer is 42"),
        text_response(VALID_JSON),
    ]);
    let harness = TestHarness::new(provider);
    let output = harness.run_agent(&schema_agent(), "go").await.unwrap();

    let req2 = &harness.provider().requests.lock().unwrap()[1];
    let last_user = last_user_text(req2).expect("expected a user message in turn 2 input");
    assert!(
        last_user.contains("did not match the required output schema"),
        "expected the static retry copy in: {last_user}"
    );
    assert!(
        last_user.contains("Validator said:"),
        "expected validator detail in: {last_user}"
    );
    assert_eq!(output.response, Some(serde_json::json!({"answer": 42})));
}

#[tokio::test]
async fn schema_violation_triggers_retry() {
    // Same retry mechanism, but the failure surfaces from validate_value
    // (well-formed JSON, wrong shape) rather than serde's parser.
    let provider = MockProvider::new(vec![
        text_response(r#"{"answer":"not a number"}"#),
        text_response(VALID_JSON),
    ]);
    let harness = TestHarness::new(provider);
    let output = harness.run_agent(&schema_agent(), "go").await.unwrap();

    let req2 = &harness.provider().requests.lock().unwrap()[1];
    let last_user = last_user_text(req2).unwrap();
    assert!(
        last_user.contains("Validator said:") && last_user.contains("answer"),
        "expected validator detail naming the field: {last_user}"
    );
    assert_eq!(output.response, Some(serde_json::json!({"answer": 42})));
}

#[tokio::test]
async fn validator_path_reaches_into_array_items() {
    // The validator walks recursively into nested objects and array items,
    // accumulating a dotted path so the retry message points the model at the
    // exact field that broke. Here `findings[0].line` is a string instead of
    // an integer — the corrective message must surface that path so the model
    // can fix the right field on its next attempt.
    let bad = r#"{"category":"perf","priority":"low","summary":"slow query","findings":[{"file":"src/db.rs","line":"forty-two","severity":"low"}]}"#;
    let provider = MockProvider::new(vec![text_response(bad), text_response(VALID_REPORT_JSON)]);
    let harness = TestHarness::new(provider);
    let output = harness.run_agent(&report_agent(), "review").await.unwrap();

    let req2 = &harness.provider().requests.lock().unwrap()[1];
    let last_user = last_user_text(req2).unwrap();
    assert!(
        last_user.contains("findings.[0].line"),
        "validator detail must name the deep path, got: {last_user}"
    );
    assert!(
        last_user.contains("expected integer"),
        "validator detail must name the type mismatch, got: {last_user}"
    );
    assert!(output.response.is_some());
}

#[tokio::test]
async fn missing_required_field_at_depth_triggers_retry() {
    // Required-field check applies inside nested array items too. Drop
    // `severity` from the first finding — the validator must report the
    // missing field with its full path, not just say "something is off".
    let bad = r#"{"category":"bug","priority":"med","summary":"flaky test","findings":[{"file":"src/x.rs","line":10}]}"#;
    let provider = MockProvider::new(vec![text_response(bad), text_response(VALID_REPORT_JSON)]);
    let harness = TestHarness::new(provider);
    let output = harness.run_agent(&report_agent(), "review").await.unwrap();

    let req2 = &harness.provider().requests.lock().unwrap()[1];
    let last_user = last_user_text(req2).unwrap();
    assert!(
        last_user.contains("findings.[0].severity"),
        "expected nested path to the missing field, got: {last_user}"
    );
    assert!(
        last_user.contains("missing required field"),
        "expected the required-field message, got: {last_user}"
    );
    assert!(output.response.is_some());
}

#[tokio::test]
async fn retry_exhaustion_returns_schema_retry_exhausted() {
    // Cap retries at 2 → after 3 failures, the loop bails with the original
    // limit attached to the error.
    let provider = MockProvider::new(vec![
        text_response("nope"),
        text_response("still nope"),
        text_response("never going to be json"),
    ]);
    let agent = schema_agent().max_schema_retries(2);
    let harness = TestHarness::new(provider);
    let err = harness.run_agent(&agent, "go").await.unwrap_err();

    assert!(matches!(
        err,
        AgenticError::SchemaRetryExhausted { retries: 2 }
    ));
}

#[tokio::test]
async fn tools_run_first_then_validation_at_natural_end() {
    // Schema validation must NOT run while the loop is mid-tool-use. The
    // model's terminal text reply is the only thing parsed.
    let provider = MockProvider::new(vec![
        tool_response("lookup", "c1", serde_json::json!({"q": "answer"})),
        text_response(VALID_JSON),
    ]);
    let agent = schema_agent().tool(MockTool::new("lookup", true, "answer=42"));

    let harness = TestHarness::new(provider);
    let output = harness.run_agent(&agent, "go").await.unwrap();

    // Tool ran on turn 1; validation on turn 2; total 2 requests.
    assert_eq!(harness.provider().request_count(), 2);
    assert_eq!(output.response, Some(serde_json::json!({"answer": 42})));
}

#[tokio::test]
async fn validation_deferred_through_truncation() {
    // Truncated turns push MAX_TOKENS_CONTINUATION instead of try_finish'ing,
    // so the schema retry counter doesn't move.
    let provider = MockProvider::new(vec![
        truncated_response("partial JSON…"),
        text_response(VALID_JSON),
    ]);
    let harness = TestHarness::new(provider);
    let output = harness.run_agent(&schema_agent(), "go").await.unwrap();

    assert_eq!(harness.provider().request_count(), 2);
    assert_eq!(output.response, Some(serde_json::json!({"answer": 42})));

    // Turn 2's input must contain the continuation prompt, not the schema
    // retry copy — proving the deferral happened.
    let req2 = &harness.provider().requests.lock().unwrap()[1];
    let last_user = last_user_text(req2).unwrap();
    assert!(
        last_user.contains("cut off"),
        "expected MAX_TOKENS_CONTINUATION, got: {last_user}"
    );
    assert!(
        !last_user.contains("required output schema"),
        "schema retry should not fire on truncation, got: {last_user}"
    );
}

#[tokio::test]
async fn cancel_before_any_reply_skips_validation() {
    // Pre-cancelled run: check_guards fires on the first iteration; finish_early
    // returns AgentStatus::Cancelled with response=None and never calls validate_value.
    let provider = MockProvider::new(vec![text_response("would-be JSON if we got here")]);
    let harness = TestHarness::new(provider);
    harness.cancel();
    let output = harness.run_agent(&schema_agent(), "go").await.unwrap();

    use agentwerk::AgentStatus;
    assert_eq!(output.status, AgentStatus::Cancelled);
    assert_eq!(output.response, None);
    // No request was sent at all (guard fires before the first turn body).
    assert_eq!(harness.provider().request_count(), 0);
}

#[tokio::test]
async fn turn_limit_skips_validation() {
    // Same shape as cancel: guard short-circuits on turn 2 before try_finish
    // can validate (max_turns(1) means turn=1 is the last iteration; the guard
    // catches turn=2 at the top of the loop).
    let provider = MockProvider::new(vec![
        text_response("not json"),
        text_response("still not json"),
    ]);
    let agent = schema_agent().max_turns(1);
    let harness = TestHarness::new(provider);
    let output = harness.run_agent(&agent, "go").await.unwrap();

    use agentwerk::AgentStatus;
    assert_eq!(output.status, AgentStatus::TurnLimitReached { limit: 1 });
    assert_eq!(output.response, None);
}

#[tokio::test]
async fn sub_agent_with_schema_returns_json_in_tool_result() {
    // Registered sub-agent declares a schema. Its validated JSON must reach
    // the parent's tool_result content verbatim — including the nested
    // findings array, so we know the boundary doesn't accidentally re-pretty
    // or strip whitespace.
    let child = Agent::new()
        .name("reviewer")
        .model_name("mock")
        .identity_prompt("You are a code reviewer. Reply with a structured report.")
        .behavior_prompt("")
        .output_schema(report_schema());

    let parent = Agent::new()
        .name("orchestrator")
        .model_name("mock")
        .identity_prompt("Coordinate.")
        .behavior_prompt("")
        .sub_agents([child]);

    let provider = MockProvider::new(vec![
        // parent turn 1: spawn the registered reviewer
        tool_response(
            "spawn_agent",
            "sa1",
            serde_json::json!({
                "description": "ask reviewer",
                "instruction": "Review the auth module.",
                "agent": "reviewer"
            }),
        ),
        // child turn 1: structured report → validates → terminates immediately
        text_response(VALID_REPORT_JSON),
        // parent turn 2: wraps up
        text_response("done"),
    ]);

    let harness = TestHarness::new(provider);
    let output = harness.run_agent(&parent, "go").await.unwrap();
    assert_eq!(output.response_raw, "done");

    // The parent's input on turn 2 must contain a tool_result whose content
    // is the child's JSON answer byte-for-byte — proving the boundary
    // forwarded the full nested structure without re-formatting.
    let req2 = &harness.provider().requests.lock().unwrap()[2];
    let tool_result = last_tool_result_content(req2).expect("expected a tool_result");
    assert_eq!(tool_result, VALID_REPORT_JSON);

    // And the parsed view round-trips: parent reads what child wrote.
    let parsed: serde_json::Value = serde_json::from_str(&tool_result).unwrap();
    assert_eq!(parsed["findings"][1]["file"], "src/db.rs");
}

#[tokio::test]
async fn ad_hoc_spawned_agent_declares_schema_via_overrides() {
    // No registered sub-agent. The parent passes output_schema in the spawn
    // tool's JSON args; apply_overrides wires it onto the ad-hoc child.
    let parent = Agent::new()
        .name("orchestrator")
        .model_name("mock")
        .identity_prompt("")
        .behavior_prompt("")
        .tool(SpawnAgentTool);

    let provider = MockProvider::new(vec![
        // parent turn 1: ad-hoc spawn with output_schema in args
        tool_response(
            "spawn_agent",
            "sa1",
            serde_json::json!({
                "description": "ad-hoc classifier",
                "instruction": "Reply with the answer.",
                "identity": "You answer with JSON.",
                "model": "mock",
                "output_schema": answer_schema(),
            }),
        ),
        // child turn 1: invalid → triggers schema retry inside the child
        text_response("just kidding"),
        // child turn 2: valid → terminates the child
        text_response(VALID_JSON),
        // parent turn 2: wraps up
        text_response("done"),
    ]);

    let harness = TestHarness::new(provider);
    let output = harness.run_agent(&parent, "go").await.unwrap();
    assert_eq!(output.response_raw, "done");

    // Total 4 requests: parent(2) + child(2). Confirms the child enforced
    // its schema and retried before terminating.
    assert_eq!(harness.provider().request_count(), 4);

    let req4 = &harness.provider().requests.lock().unwrap()[3];
    let tool_result = last_tool_result_content(req4).expect("expected a tool_result");
    assert_eq!(tool_result, VALID_JSON);
}

// (Background sub-agent + schema is covered by an inline test in
// `spawn_agent.rs` — that test has crate-private access to the command queue
// and can assert directly on the notification content. From here the queue
// is unreachable, so an integration test would only duplicate coverage
// while introducing tokio-spawn races between parent and child mock pops.)

fn schema_agent() -> Agent {
    Agent::new()
        .name("classifier")
        .model_name("mock")
        .identity_prompt("You answer with JSON.")
        .behavior_prompt("")
        .output_schema(answer_schema())
}

/// Render a CompletionRequest as plain text. Mirrors `context_window_events.rs`'s
/// helper of the same name — same shape so snapshots are visually comparable.
fn render(req: &CompletionRequest) -> String {
    render_conversation(&req.system_prompt, &req.messages)
}

/// Render a `(system_prompt, messages)` pair. Used by `render` for actual
/// requests and by the centerpiece test to synthesize the post-terminal
/// state — the loop doesn't send a request after the final valid reply, so
/// there's no `req[2]` to inspect; we reconstruct what it would have looked
/// like by appending the final assistant message to `req[1]`'s context.
fn render_conversation(system_prompt: &str, messages: &[Message]) -> String {
    let mut out = String::new();
    out.push_str("=== system ===\n");
    out.push_str(system_prompt);
    out.push('\n');
    for (i, msg) in messages.iter().enumerate() {
        let (role, body) = match msg {
            Message::System { content } => ("system", content.clone()),
            Message::User { content } => ("user", render_blocks(content)),
            Message::Assistant { content } => ("assistant", render_blocks(content)),
        };
        out.push_str(&format!("\n=== messages[{i}] {role} ===\n{body}\n"));
    }
    out
}

fn render_blocks(blocks: &[ContentBlock]) -> String {
    blocks
        .iter()
        .map(|b| match b {
            ContentBlock::Text { text } => text.clone(),
            ContentBlock::ToolUse { id, name, input } => {
                format!("[tool_use {id}] {name}({input})")
            }
            ContentBlock::ToolResult {
                tool_use_id,
                content,
                is_error,
            } => {
                let tag = if *is_error { "ERR" } else { "ok" };
                format!("[tool_result {tool_use_id} {tag}] {content}")
            }
        })
        .collect::<Vec<_>>()
        .join("\n")
}

/// Replace the four dynamic values inside the `<environment>` block (cwd,
/// platform, OS version, date) with stable placeholders so snapshots are
/// reproducible on any host.
fn canonicalize(rendered: &str) -> String {
    let mut out: Vec<String> = Vec::with_capacity(rendered.lines().count());
    let mut in_env = false;
    for line in rendered.lines() {
        match line {
            "<environment>" => {
                in_env = true;
                out.push(line.to_string());
            }
            "</environment>" => {
                in_env = false;
                out.push(line.to_string());
            }
            _ if in_env => out.push(replace_value_with_placeholder(line)),
            _ => out.push(line.to_string()),
        }
    }
    let mut joined = out.join("\n");
    if rendered.ends_with('\n') {
        joined.push('\n');
    }
    joined
}

fn replace_value_with_placeholder(line: &str) -> String {
    let Some(colon) = line.find(':') else {
        return line.to_string();
    };
    let key = &line[..colon];
    let placeholder = key.to_uppercase().replace(' ', "_");
    format!("{key}: <{placeholder}>")
}

/// Pull the text of the most recent `Message::User` entry — useful for tests
/// that want to check what the loop just told the model.
fn last_user_text(req: &CompletionRequest) -> Option<String> {
    req.messages.iter().rev().find_map(|m| match m {
        Message::User { content } => {
            let text: String = content
                .iter()
                .filter_map(|b| match b {
                    ContentBlock::Text { text } => Some(text.as_str()),
                    _ => None,
                })
                .collect::<Vec<_>>()
                .join("\n");
            (!text.is_empty()).then_some(text)
        }
        _ => None,
    })
}

/// Pull the content of the most recent `tool_result` block in the conversation.
/// Used by the sub-agent boundary tests to assert the child's JSON reached the
/// parent intact.
fn last_tool_result_content(req: &CompletionRequest) -> Option<String> {
    req.messages.iter().rev().find_map(|m| {
        if let Message::User { content } = m {
            content.iter().rev().find_map(|b| match b {
                ContentBlock::ToolResult { content, .. } => Some(content.clone()),
                _ => None,
            })
        } else {
            None
        }
    })
}