zeph-llm 0.19.0

LLM provider abstraction with Ollama, Claude, OpenAI, and Candle backends
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
// SPDX-License-Identifier: MIT OR Apache-2.0

//! Public API types and internal API wire types for the Claude provider.

use serde::{Deserialize, Serialize};

// ── Public types ──────────────────────────────────────────────────────────────

/// Extended or adaptive thinking mode for Claude.
///
/// Internally tagged by `mode` (variant names rendered in snake case), so it
/// serializes as one of:
/// - `{ "mode": "extended", "budget_tokens": 10000 }`
/// - `{ "mode": "adaptive" }`
/// - `{ "mode": "adaptive", "effort": "high" }` when an effort level is set.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[serde(tag = "mode", rename_all = "snake_case")]
pub enum ThinkingConfig {
    /// Thinking with an explicit token budget.
    Extended {
        /// Token budget carried in the `budget_tokens` key.
        budget_tokens: u32,
    },
    /// Adaptive thinking, optionally guided by an effort level.
    Adaptive {
        /// Optional effort level. Left out of the serialized form when `None`,
        /// and read back as `None` when the key is absent.
        #[serde(default, skip_serializing_if = "Option::is_none")]
        effort: Option<ThinkingEffort>,
    },
}

/// Effort level for adaptive thinking.
///
/// Serialized as a lowercase string (`"low"`, `"medium"`, `"high"`).
/// `Default::default()` yields [`ThinkingEffort::Medium`].
#[derive(Debug, Clone, Copy, Serialize, Deserialize, Default, PartialEq, Eq)]
#[serde(rename_all = "lowercase")]
pub enum ThinkingEffort {
    /// Lowest effort level.
    Low,
    /// Middle effort level; the `Default` value.
    #[default]
    Medium,
    /// Highest effort level.
    High,
}

/// Model-specific thinking flags, as produced by `thinking_capability`.
pub(super) struct ThinkingCapability {
    /// Requires `interleaved-thinking-2025-05-14` beta header when `tool_use` is present.
    pub needs_interleaved_beta: bool,
    /// Opus 4.6 uses `effort` instead of `budget_tokens`; `Extended` config is auto-converted.
    pub prefers_effort: bool,
}

/// Derives the thinking-related capability flags for `model`.
///
/// Detection is a plain substring match on the model identifier, so dated or
/// otherwise suffixed names that contain the family string are recognised too.
pub(super) fn thinking_capability(model: &str) -> ThinkingCapability {
    ThinkingCapability {
        // Sonnet 4.6 with tools needs the `interleaved-thinking-2025-05-14` beta header.
        needs_interleaved_beta: model.contains("claude-sonnet-4-6"),
        // Opus 4.6 is driven by an effort level rather than a token budget.
        prefers_effort: model.contains("claude-opus-4-6"),
    }
}

/// Translates a thinking token budget into the coarse [`ThinkingEffort`] scale
/// used by models that prefer effort-based thinking.
///
/// Budgets below 5 000 map to `Low`, budgets from 5 000 up to (but excluding)
/// 15 000 map to `Medium`, and anything larger maps to `High`.
pub(super) fn budget_to_effort(budget_tokens: u32) -> ThinkingEffort {
    match budget_tokens {
        0..=4_999 => ThinkingEffort::Low,
        5_000..=14_999 => ThinkingEffort::Medium,
        _ => ThinkingEffort::High,
    }
}

// ── Cache markers ─────────────────────────────────────────────────────────────

// HTML-comment markers used to split a system prompt into blocks. The roles
// below follow the "Prompt Caching Strategy" text embedded in
// `AGENT_IDENTITY_PREAMBLE`; the splitting code itself is not in this section.

/// Marker described as the breakpoint after the base agent identity block.
pub(super) const CACHE_MARKER_STABLE: &str = "<!-- cache:stable -->";
/// Marker described as the breakpoint after the skills and tool catalog.
pub(super) const CACHE_MARKER_TOOLS: &str = "<!-- cache:tools -->";
/// Marker described as the separator before the volatile context block.
pub(super) const CACHE_MARKER_VOLATILE: &str = "<!-- cache:volatile -->";

/// Stable agent identity section injected into Block 1 when the base prompt
/// is below the Claude cache minimum (2048 tokens for Sonnet, 4096 for Opus/Haiku).
///
/// This text is purely descriptive and never changes between requests,
/// making it ideal for padding the cacheable block.
///
/// The string is assembled at compile time with `concat!`: a fixed heading,
/// the crate version taken from `CARGO_PKG_VERSION`, and a raw-string body.
/// Its content therefore only changes when the crate is rebuilt with a
/// different package version or the literal below is edited.
pub(super) const AGENT_IDENTITY_PREAMBLE: &str = concat!(
    "\n## Agent Identity\n\nYou are Zeph, a lightweight AI agent built on a hybrid inference architecture.\nZeph version: ",
    // Resolved by Cargo at compile time; not read from the environment at runtime.
    env!("CARGO_PKG_VERSION"),
    // Raw string: everything up to the closing `"#` is emitted verbatim.
    r#"
Crate: zeph (binary) backed by zeph-core, zeph-llm, zeph-skills, zeph-memory,
       zeph-channels, zeph-tools, and optional feature-gated crates.

## Core Architecture

Zeph is structured as a Rust workspace with a skills-first design:

- **zeph-core**: Agent loop, configuration, channel trait, context builder,
  metrics collection, vault integration, and output redaction.
- **zeph-llm**: `LlmProvider` trait with Ollama, Claude, OpenAI, and Candle
  (GGUF/safetensors) backends, plus an EMA-based router and orchestrator.
- **zeph-skills**: SKILL.md parser (agentskills.io spec), embedding-based
  semantic matcher, hot-reload via `notify`, and self-learning evolution.
- **zeph-memory**: Dual-store with SQLite (conversation history) and Qdrant
  (semantic vector search), plus a SemanticMemory orchestrator.
- **zeph-channels**: Telegram adapter (teloxide + streaming), CLI channel.
- **zeph-tools**: `ToolExecutor` trait, `ShellExecutor`, `WebScrapeExecutor`,
  `CompositeExecutor`, and an audit trail layer.
- **zeph-mcp**: MCP client via `rmcp`, multi-server lifecycle management,
  Qdrant-backed tool registry. Feature-gated.
- **zeph-a2a**: A2A protocol client and server, agent discovery, JSON-RPC 2.0.
  Feature-gated.
- **zeph-tui**: ratatui-based TUI dashboard with real-time metrics.
  Feature-gated.
- **zeph-scheduler**: Cron-based periodic task scheduler with SQLite persistence.
  Feature-gated.
- **zeph-gateway**: HTTP gateway for webhook ingestion with bearer auth.
  Feature-gated.
- **zeph-index**: AST-based code indexing and semantic retrieval.
  Feature-gated.

## Operational Principles

Zeph operates under the following core principles:

1. **Hybrid inference**: Requests are routed to the most appropriate provider
   based on capability requirements, latency, and historical success rates.
   The EMA router tracks per-provider rolling averages and prefers high-
   performing providers. A Thompson Sampling option enables exploration.

2. **Skills-first dispatch**: Before invoking generic tool use, Zeph matches
   the user request against its skill registry. Skills are Markdown documents
   following the SKILL.md specification. The semantic matcher uses embedding
   cosine similarity to rank candidates. Self-learning updates skill weights
   based on outcome feedback.

3. **Semantic memory**: All conversations are persisted to SQLite. Semantically
   relevant past exchanges are retrieved from Qdrant and injected into the
   context window. This enables cross-session continuity without bloating
   the prompt with raw history.

4. **Tool safety**: The `ToolExecutor` trait wraps all tool invocations with
   an audit layer. Shell commands are executed in a sandboxed environment.
   Web scraping respects robots.txt. All tool outputs are redacted for
   secrets before returning to the model.

5. **Multi-channel I/O**: Zeph supports CLI, Telegram, and TUI channels.
   The `AnyChannel` enum in `main.rs` dispatches to the appropriate adapter.
   All channels implement the `Channel` trait defined in `zeph-core`.

6. **MCP integration**: The MCP client connects to external tool servers
   using the Model Context Protocol (2025-11-25 spec). Tools are discovered
   at connection time and registered in the Qdrant tool registry for semantic
   lookup. Per-server declarative policies control which tools may be called.

7. **A2A protocol**: Zeph can act as both A2A client and server, enabling
   agent-to-agent task delegation. The protocol uses JSON-RPC 2.0 transport.

8. **Vault integration**: API keys and secrets are stored in an encrypted
   vault (age encryption). The `VaultProvider` resolves secret references
   in configuration at startup.

9. **Output filtering**: A multi-stage `FilterPipeline` sanitizes LLM output
   before delivery. `CommandMatcher` variants (Exact/Prefix/Regex/Custom)
   match filter triggers. `FilterConfidence` levels (Full/Partial/Fallback)
   indicate match certainty.

10. **Observability**: All operations are instrumented with `tracing` spans
    and events. Optional OpenTelemetry export is available via the `otel`
    feature flag. The TUI dashboard surfaces live metrics.

## Configuration Model

Zeph is configured via TOML (`config.toml`) with `ZEPH_*` environment
variable overrides. The configuration tree mirrors the workspace structure:

- `[agent]`: Core agent settings, instruction files, context window limits.
- `[llm]`: Provider selection, model names, thinking mode, token budgets.
- `[llm.router]`: Routing strategy (ema/thompson), provider chain, weights.
- `[memory]`: SQLite path, Qdrant URL, embedding model, eviction policy.
- `[tools]`: Enabled tool executors, shell sandbox settings, scrape limits.
- `[mcp]`: MCP server list, each with id, url, and per-server policy.
- `[a2a]`: A2A server bind address, discovery endpoints.
- `[scheduler]`: Cron jobs, SQLite state path, sweep interval.
- `[gateway]`: HTTP bind address, bearer token, allowed webhook sources.
- `[tui]`: Refresh rate, color theme, key bindings.

## Instruction File Loading

Zeph loads instruction files at startup to augment the system prompt:

- `zeph.md` and `.zeph/zeph.md` are always loaded (provider-agnostic).
- Provider-specific files: `CLAUDE.md` for Claude, `AGENTS.md` for others.
- Additional files can be specified via `[agent.instructions] extra_files`.
- All files are subject to a 256 KiB size cap and symlink boundary checks.
- Instruction content is injected into the volatile prompt block (Block 2),
  after the environment context and before the skills catalog.

## Prompt Caching Strategy

System prompts are split into cacheable and volatile blocks using HTML
comment markers:

- `<!-- cache:stable -->`: Breakpoint after the base agent identity block.
- `<!-- cache:tools -->`: Breakpoint after the skills and tool catalog.
- `<!-- cache:volatile -->`: Separator before the volatile context block.

Blocks before the volatile marker receive `cache_control: ephemeral`.
The volatile block (environment context, instruction files, retrieved memory)
is never cached as it changes with each request.

Claude's cache hierarchy processes `tools` → `system` → `messages` in order.
The minimum cacheable size is 2048 tokens for Sonnet and 4096 for Opus/Haiku.

## Tool Execution Model

Zeph's tool execution follows a layered model designed for safety and auditability:

### Layer 1: Tool Registry
All available tools are registered in the `CompositeExecutor`. Each tool
implements the `ToolExecutor` trait, which provides:
- `name() -> &str`: Unique tool identifier used in model requests.
- `description() -> &str`: Human-readable description for the model.
- `parameters_schema() -> Value`: JSON Schema defining input parameters.
- `execute(params: Value) -> Result<ToolOutput>`: Async execution entry point.

### Layer 2: Policy Enforcement
Before any tool executes, the `FilteredToolExecutor` wrapper checks:
1. The `ToolPolicy` (AllowList, DenyList, or Unrestricted) from agent config.
2. The subagent-specific `disallowed_tools` denylist from frontmatter.
Both checks must pass for execution to proceed.

### Layer 3: Audit Trail
Every tool invocation is recorded in the audit log with:
- Timestamp (UTC, ISO 8601).
- Tool name and sanitized parameters (secrets redacted).
- Duration in milliseconds.
- Success/failure status and error message if applicable.

### Layer 4: Output Redaction
Tool output passes through the redaction pipeline before returning to the model.
The `SecurityPatterns` registry (17 compiled regexes across 6 categories) scans
for API keys, tokens, passwords, private keys, connection strings, and PII.
Matches are replaced with `[REDACTED:<category>]` placeholders.

## Memory Architecture

### SQLite Store
The `SqliteStore` provides conversation-scoped persistent memory:
- Messages are stored with role, content, timestamp, and metadata.
- Conversations are identified by UUID session identifiers.
- Full-text search is available via SQLite FTS5 extension.
- Soft-delete semantics: entries are marked with `deleted_at` rather than
  physically removed, enabling crash-safe eviction with Qdrant consistency.

### Qdrant Vector Store
The `EmbeddingStore` provides cross-session semantic memory:
- Each message is embedded using the configured embedding model.
- Vectors are stored in Qdrant with payload containing message metadata.
- Retrieval uses approximate nearest-neighbor search (HNSW index).
- Results are ranked by cosine similarity and filtered by relevance threshold.
- The `SemanticMemory` orchestrator coordinates between both stores.

### Memory Eviction
The Ebbinghaus forgetting curve policy scores entries by:
  `score = exp(-t / (S * ln(1 + n)))`
where `t` = time since last access (seconds), `S` = retention strength,
`n` = access count. Low-scoring entries are soft-deleted from SQLite first,
then removed from Qdrant in a subsequent sweep phase. This two-phase approach
ensures consistency even if the agent crashes between phases.

## Self-Learning Skill Evolution

Zeph improves skill selection over time using three complementary mechanisms:

### Feedback Detection
The `FeedbackDetector` analyzes user messages for outcome signals:
- Explicit positive signals: "great", "perfect", "that worked", etc.
- Explicit negative signals: "wrong", "that didn't work", "try again", etc.
- Implicit signals via Jaccard similarity between expected and actual output.
- Correction patterns: "I meant...", "actually...", "you should have...".

### Wilson Score Re-ranking
Skill candidates are re-ranked using Wilson score lower bound:
  `score = (p_hat + z²/2n - z*sqrt(p_hat*(1-p_hat)/n + z²/4n²)) / (1 + z²/n)`
where `p_hat` = success rate, `n` = invocation count, `z` = 1.96 (95% CI).
This penalizes skills with few invocations (high uncertainty) relative to
well-tested skills, preventing premature exploitation.

### Provider EMA Routing
The EMA router tracks per-provider rolling averages of:
- Latency (milliseconds per token).
- Success rate (non-error responses / total requests).
- Cost efficiency (output tokens / input tokens ratio).
The router selects the provider with the best composite EMA score,
with configurable decay factor alpha (default 0.1).

## Security Model

### Vault Integration
All secrets in configuration are referenced by vault identifiers:
  `${vault:key_name}` — resolved at startup from the age-encrypted vault file.
The vault file is stored at `~/.zeph/vault.age` by default.
Resolution happens once during `VaultProvider::resolve()` before any
network connections are established.

### Symlink Boundary Checks
File loading operations (instruction files, skill definitions, subagent
definitions) enforce symlink boundary checks:
- The canonical path of each file is computed via `std::fs::canonicalize()`.
- Files must reside within the project root or designated user config directories.
- Symlinks pointing outside these boundaries are rejected with an error.
- This prevents path traversal attacks via crafted symlink chains.

### Input Sanitization
All skill names, tool names, and user-provided identifiers are sanitized
before inclusion in XML-structured prompts:
- `<`, `>`, `&`, `"`, `'` are HTML-escaped.
- Null bytes and control characters are stripped.
- Names exceeding 256 bytes are truncated with a warning.

## MCP Protocol Integration

The MCP client follows the Model Context Protocol specification (2025-11-25):

### Connection Lifecycle
1. `McpManager::connect()` initiates connections to all configured servers.
2. Each server gets its own `McpClient` with an `rmcp` transport session.
3. On connect, the client calls `list_tools()` to discover available tools.
4. Discovered tools are registered in the Qdrant tool registry for semantic lookup.
5. On disconnect, tools are deregistered and the session is cleaned up.

### Tool Invocation
`McpClient::call_tool(server_id, tool_name, params)` flow:
1. `PolicyEnforcer::check()` validates against per-server policy.
2. Rate limit sliding window is updated.
3. The rmcp transport forwards the JSON-RPC `tools/call` request.
4. The response is deserialized and returned as `ToolOutput`.
5. Violations are logged via `tracing::warn!` with structured fields.

## A2A Protocol Integration

The A2A (Agent-to-Agent) protocol enables Zeph to delegate tasks to peer agents:

### As A2A Client
`A2aClient::send_task(task)` serializes a task request as JSON-RPC 2.0,
sends it to the peer agent's endpoint, and awaits the result. Tasks include
a unique UUID, description, inputs, and expected output schema.

### As A2A Server
`A2aServer::listen()` binds to the configured address and handles incoming
task requests. Each request is dispatched to the agent loop for processing.
Results are returned as JSON-RPC 2.0 responses with the original request ID.

### Agent Discovery
Agents advertise their capabilities via a discovery endpoint (`GET /.well-known/agent.json`).
The discovery document lists supported task types, input/output schemas, and
the agent's public endpoint URL.
"#
);

// ── Context management ────────────────────────────────────────────────────────

/// Request field for Claude server-side context management (compact-2026-01-12 beta).
///
/// Note: the API does not accept a top-level `"type"` discriminator on this object — only
/// `trigger` and `pause_after_compaction` are allowed.
///
/// Serialization-only; both fields are always emitted.
#[derive(Serialize, Clone, Debug)]
pub(super) struct ContextManagement {
    /// Condition under which compaction is triggered.
    pub trigger: ContextManagementTrigger,
    /// Sent verbatim as the `pause_after_compaction` flag.
    pub pause_after_compaction: bool,
}

/// Trigger condition nested inside [`ContextManagement`].
///
/// Serializes as `{ "type": <kind>, "value": <value> }`.
#[derive(Serialize, Clone, Debug)]
pub(super) struct ContextManagementTrigger {
    /// Trigger discriminator; emitted under the JSON key `type`.
    #[serde(rename = "type")]
    pub kind: &'static str,
    /// Threshold for the trigger.
    // NOTE(review): presumably a token count — confirm against the caller.
    pub value: u32,
}

// ── Wire types ────────────────────────────────────────────────────────────────

/// One element of the `system` array carried by the request body types in this module.
#[derive(Serialize, Clone, Debug)]
pub(super) struct SystemContentBlock {
    /// Block discriminator; emitted under the JSON key `type`.
    #[serde(rename = "type")]
    pub block_type: &'static str,
    /// Prompt text of this block.
    pub text: String,
    /// Optional cache directive; the key is omitted entirely when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub cache_control: Option<CacheControl>,
}

/// Cache kind used inside [`CacheControl`].
///
/// Variant names are rendered in snake case, so `Ephemeral` is `"ephemeral"` on the wire.
#[derive(Serialize, Deserialize, Clone, Debug)]
#[serde(rename_all = "snake_case")]
pub(super) enum CacheType {
    /// The only cache kind modelled here.
    Ephemeral,
}

/// `cache_control` object attached to content blocks.
///
/// Serializes as `{ "type": "ephemeral" }`.
#[derive(Serialize, Deserialize, Clone, Debug)]
pub(super) struct CacheControl {
    /// Cache kind; emitted under the JSON key `type`.
    #[serde(rename = "type")]
    pub cache_type: CacheType,
}

/// Serialization-only parameter for Claude's `thinking` request field.
#[derive(Serialize)]
pub(super) struct ThinkingParam {
    /// Thinking mode discriminator; emitted under the JSON key `type`.
    #[serde(rename = "type")]
    pub thinking_type: &'static str,
    /// Optional token budget; the key is omitted when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub budget_tokens: Option<u32>,
}

/// Serialization-only parameter for Claude's `output_config` request field.
/// Used to convey the effort level for adaptive thinking.
#[derive(Serialize)]
pub(super) struct OutputConfig {
    /// Effort level, serialized as a lowercase string (see [`ThinkingEffort`]).
    pub effort: ThinkingEffort,
}

/// `source` object of an [`AnthropicContentBlock::Image`] block.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub(super) struct ImageSource {
    /// Source kind; read from and written to the JSON key `type`.
    // NOTE(review): presumably "base64" for inline images — confirm against the caller.
    #[serde(rename = "type")]
    pub source_type: String,
    /// Media type string for the image payload.
    pub media_type: String,
    /// Image payload as text.
    // NOTE(review): encoding is not visible here (likely base64) — confirm.
    pub data: String,
}

/// A single content block in a structured message, used for both requests and responses.
///
/// Internally tagged by `type` with snake-case variant names, i.e. `text`, `tool_use`,
/// `tool_result`, `image`, `thinking`, `redacted_thinking` and `compaction`.
#[derive(Serialize, Deserialize, Debug, Clone)]
#[serde(tag = "type", rename_all = "snake_case")]
pub(super) enum AnthropicContentBlock {
    /// Plain text block.
    Text {
        text: String,
        /// Optional cache directive; omitted when `None`, `None` when absent on input.
        #[serde(default, skip_serializing_if = "Option::is_none")]
        cache_control: Option<CacheControl>,
    },
    /// A tool invocation, identified by `id`, with its JSON `input`.
    ToolUse {
        id: String,
        name: String,
        input: serde_json::Value,
    },
    /// Result for a previous tool invocation, linked by `tool_use_id`.
    ToolResult {
        tool_use_id: String,
        content: String,
        /// Error flag; only serialized when `true`, and `false` when absent on input.
        #[serde(default, skip_serializing_if = "std::ops::Not::not")]
        is_error: bool,
        /// Optional cache directive; omitted when `None`, `None` when absent on input.
        #[serde(default, skip_serializing_if = "Option::is_none")]
        cache_control: Option<CacheControl>,
    },
    /// Image block carrying its [`ImageSource`].
    Image {
        source: ImageSource,
    },
    /// Thinking block with its text and accompanying signature.
    Thinking {
        thinking: String,
        signature: String,
    },
    /// Redacted thinking block; only an opaque `data` string is carried.
    RedactedThinking {
        data: String,
    },
    /// Server-side compaction block returned by the Claude API (compact-2026-01-12 beta).
    /// Must be preserved verbatim and sent back in subsequent turns.
    Compaction {
        summary: String,
    },
}

/// Owned, serialization-only message whose content may be plain text or a list of blocks.
#[derive(Serialize, Debug)]
pub(super) struct StructuredApiMessage {
    /// Message role string.
    pub role: String,
    /// Message body; see [`StructuredContent`] for the two wire shapes.
    pub content: StructuredContent,
}

/// Message content in one of two wire shapes.
///
/// Untagged: `Text` serializes as a bare JSON string and `Blocks` as a JSON array,
/// with no wrapper object or discriminator.
#[derive(Serialize, Debug)]
#[serde(untagged)]
pub(super) enum StructuredContent {
    /// Content given as a single string.
    Text(String),
    /// Content given as a sequence of typed blocks.
    Blocks(Vec<AnthropicContentBlock>),
}

/// Borrowed, serialization-only plain-text message (`role` and `content` are string slices).
#[derive(Serialize)]
pub(super) struct ApiMessage<'a> {
    /// Message role string.
    pub role: &'a str,
    /// Message text.
    pub content: &'a str,
}

/// Deserialization-only response shape whose content blocks are read as text-only
/// [`ContentBlock`]s.
#[derive(Deserialize)]
pub(super) struct ApiResponse {
    /// Content blocks of the response; `content` is a required key.
    pub content: Vec<ContentBlock>,
    /// Token usage, `None` when the `usage` key is absent.
    #[serde(default)]
    pub usage: Option<ApiUsage>,
}

/// Minimal response content block that keeps only the `text` field.
///
/// Other keys on the block (such as `type`) are ignored during deserialization.
// NOTE(review): `text` is required, so a block without it (e.g. a thinking block)
// would make the whole response fail to deserialize — confirm that responses parsed
// through `ApiResponse` can only contain text blocks, or add `#[serde(default)]`.
#[derive(Deserialize)]
pub(super) struct ContentBlock {
    /// Text of the block.
    pub text: String,
}

/// Deserialization-only response shape that keeps the full typed content blocks
/// ([`AnthropicContentBlock`]) and the stop reason.
#[derive(Deserialize)]
pub(super) struct ToolApiResponse {
    /// Typed content blocks of the response; `content` is a required key.
    pub content: Vec<AnthropicContentBlock>,
    /// Stop reason string, `None` when absent.
    #[serde(default)]
    pub stop_reason: Option<String>,
    /// Token usage, `None` when the `usage` key is absent.
    #[serde(default)]
    pub usage: Option<ApiUsage>,
}

/// Token usage counters from a response.
///
/// Every counter falls back to `0` when its key is missing from the payload.
#[derive(Deserialize, Debug)]
// Field names mirror the wire keys, which all share the `_tokens` suffix.
#[allow(clippy::struct_field_names)]
pub(super) struct ApiUsage {
    /// Value of the `input_tokens` key.
    #[serde(default)]
    pub input_tokens: u64,
    /// Value of the `output_tokens` key.
    #[serde(default)]
    pub output_tokens: u64,
    /// Value of the `cache_creation_input_tokens` key.
    #[serde(default)]
    pub cache_creation_input_tokens: u64,
    /// Value of the `cache_read_input_tokens` key.
    #[serde(default)]
    pub cache_read_input_tokens: u64,
}

// ── Request body types ────────────────────────────────────────────────────────

/// Serialization-only request payload for plain-text conversations.
///
/// `model`, `max_tokens` and `messages` are always emitted. Every `Option` field is
/// omitted when `None`, and `stream` is omitted when `false`.
#[derive(Serialize)]
pub(super) struct RequestBody<'a> {
    /// Model identifier.
    pub model: &'a str,
    /// Value of the `max_tokens` key.
    pub max_tokens: u32,
    /// Optional system prompt blocks.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub system: Option<Vec<SystemContentBlock>>,
    /// Borrowed plain-text messages.
    pub messages: &'a [ApiMessage<'a>],
    /// Streaming flag; only serialized when `true`.
    #[serde(skip_serializing_if = "std::ops::Not::not")]
    pub stream: bool,
    /// Optional `thinking` parameter.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub thinking: Option<ThinkingParam>,
    /// Optional `output_config` carrying the effort level.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub output_config: Option<OutputConfig>,
    /// Optional sampling temperature.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub temperature: Option<f64>,
    /// Optional server-side context management settings.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub context_management: Option<ContextManagement>,
}

/// Serialization-only request payload whose messages are structured
/// ([`StructuredApiMessage`]), so content can be a list of blocks such as images.
///
/// Same field set as `RequestBody` apart from the message type. Every `Option` field is
/// omitted when `None`, and `stream` is omitted when `false`.
#[derive(Serialize)]
pub(super) struct VisionRequestBody<'a> {
    /// Model identifier.
    pub model: &'a str,
    /// Value of the `max_tokens` key.
    pub max_tokens: u32,
    /// Optional system prompt blocks.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub system: Option<Vec<SystemContentBlock>>,
    /// Borrowed structured messages.
    pub messages: &'a [StructuredApiMessage],
    /// Streaming flag; only serialized when `true`.
    #[serde(skip_serializing_if = "std::ops::Not::not")]
    pub stream: bool,
    /// Optional `thinking` parameter.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub thinking: Option<ThinkingParam>,
    /// Optional `output_config` carrying the effort level.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub output_config: Option<OutputConfig>,
    /// Optional sampling temperature.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub temperature: Option<f64>,
    /// Optional server-side context management settings.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub context_management: Option<ContextManagement>,
}

/// Serialization-only request payload for tool-enabled conversations where the tool
/// definitions are supplied as pre-built JSON values.
///
/// `tools` is always emitted, there is no `stream` field on this type, and every
/// `Option` field is omitted when `None`.
#[derive(Serialize)]
pub(super) struct ToolRequestBody<'a> {
    /// Model identifier.
    pub model: &'a str,
    /// Value of the `max_tokens` key.
    pub max_tokens: u32,
    /// Optional system prompt blocks.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub system: Option<Vec<SystemContentBlock>>,
    /// Borrowed structured messages.
    pub messages: &'a [StructuredApiMessage],
    /// Tool definitions as raw JSON values.
    pub tools: &'a [serde_json::Value],
    /// Optional `thinking` parameter.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub thinking: Option<ThinkingParam>,
    /// Optional `output_config` carrying the effort level.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub output_config: Option<OutputConfig>,
    /// Optional sampling temperature.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub temperature: Option<f64>,
    /// Optional server-side context management settings.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub context_management: Option<ContextManagement>,
}

/// Borrowed, serialization-only tool definition: a name, a description and a JSON
/// value emitted under `input_schema`.
#[derive(Serialize)]
pub(super) struct AnthropicTool<'a> {
    /// Tool name.
    pub name: &'a str,
    /// Tool description.
    pub description: &'a str,
    /// JSON value describing the tool's input.
    pub input_schema: &'a serde_json::Value,
}

/// Serialization-only request payload for tool calls with typed tool definitions
/// ([`AnthropicTool`]) and a mandatory `tool_choice`.
///
/// `tools` and `tool_choice` are always emitted, there is no `stream` field on this
/// type, and every `Option` field is omitted when `None`.
#[derive(Serialize)]
pub(super) struct TypedToolRequestBody<'a> {
    /// Model identifier.
    pub model: &'a str,
    /// Value of the `max_tokens` key.
    pub max_tokens: u32,
    /// Optional system prompt blocks.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub system: Option<Vec<SystemContentBlock>>,
    /// Borrowed structured messages.
    pub messages: &'a [StructuredApiMessage],
    /// Typed tool definitions.
    pub tools: &'a [AnthropicTool<'a>],
    /// Tool selection directive.
    pub tool_choice: ToolChoice<'a>,
    /// Optional `thinking` parameter.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub thinking: Option<ThinkingParam>,
    /// Optional `output_config` carrying the effort level.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub output_config: Option<OutputConfig>,
    /// Optional sampling temperature.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub temperature: Option<f64>,
    /// Optional server-side context management settings.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub context_management: Option<ContextManagement>,
}

/// Serialization-only `tool_choice` object; serializes as `{ "type": ..., "name": ... }`.
// NOTE(review): presumably used with type "tool" to force the named tool — confirm
// against the call site.
#[derive(Serialize)]
pub(super) struct ToolChoice<'a> {
    /// Choice kind; the raw identifier `r#type` is emitted under the JSON key `type`.
    pub r#type: &'a str,
    /// Name of the tool the choice refers to.
    pub name: &'a str,
}

// ── Validation ────────────────────────────────────────────────────────────────

/// Lower bound of 16 000 applied to `max_tokens` in thinking-enabled requests.
// NOTE(review): the validation code that uses this constant is outside this section;
// the description above is inferred from the name — confirm against its use site.
pub(super) const MIN_MAX_TOKENS_WITH_THINKING: u32 = 16_000;