tkach 0.5.0 - Docs.rs

//! Anthropic prompt caching via `complete()`.
//!
//! Sends the same long-prefix request twice in a row. The first call
//! writes the cache (`cache_creation_input_tokens` > 0); the second
//! call hits it (`cache_read_input_tokens` > 0). On a hit, the cached
//! input tokens are billed at 0.1x base input cost.
//!
//! What gets cached:
//!   - one large stable system block (the bulk of the prefix)
//!   - the toolset (last tool's cache_control caches all preceding
//!     tools too)
//!
//! What rotates per-call:
//!   - the user message
//!
//! Model & size: Anthropic enforces a **minimum cacheable prefix
//! size** at the breakpoint. Per official docs (2026-04-30):
//!   - Claude Sonnet 4.6          → 2,048 tokens minimum
//!   - Claude Haiku 4.5           → 4,096 tokens minimum
//!   - Claude Opus 4.x            → 4,096 tokens minimum
//!   - Claude Haiku 3.5           → 2,048 tokens minimum
//!   - Claude Sonnet 4.5 / 3.7    → 1,024 tokens minimum
//!
//! The API silently skips caching (cache_creation_input_tokens=0)
//! when the prefix is below the threshold. The example uses Sonnet 4.6
//! (2,048 minimum) and pads the system block to ~2,783 tokens —
//! comfortably above the threshold without requiring a larger build.
//! Switch to Haiku 4.5 only after padding to 4,096+ tokens.
//!
//! Run with:
//!   ANTHROPIC_API_KEY=sk-... cargo run --example anthropic_caching

use tkach::providers::Anthropic;
use tkach::{LlmProvider, Message, Request, SystemBlock, ToolDefinition, Usage};

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let _ = dotenvy::dotenv_override();

    let provider = Anthropic::from_env();

    // Long, stable system block. Real workloads ship a few KB of
    // instructions / persona / context — anything stable across calls
    // is a cache candidate. ~6KB here, well above Sonnet's 1024-token
    // minimum (Anthropic silently skips caching for smaller prefixes).
    let stable_system = build_stable_system();

    // Single tool, no cache_control. We rely on the SystemBlock
    // breakpoint to cache the (short) tools array implicitly along
    // with the system prefix. Caching tools separately requires the
    // tools array itself to clear the per-segment minimum (1024
    // tokens for Sonnet, 2048 for Haiku) — toy toolsets can't.
    let tools = vec![ToolDefinition {
        name: "noop".into(),
        description: "A no-op tool the model should not call.".into(),
        input_schema: serde_json::json!({"type": "object"}),
        cache_control: None,
    }];

    let make_request = |user: &str| Request {
        model: tkach::model::claude::SONNET.into(),
        system: Some(vec![SystemBlock::cached(&stable_system)]),
        messages: vec![Message::user_text(user)],
        tools: tools.clone(),
        max_tokens: 64,
        temperature: None,
        thinking: None,
    };

    println!("=== first call (cache write) ===");
    let r1 = provider
        .complete(make_request("Say hello in one short sentence."))
        .await?;
    print_usage(&r1.usage);
    println!("text: {}", text_of(&r1));

    // Default cache TTL is 5 minutes — second call within that window
    // hits the cache. Same system prefix + same toolset → same prefix
    // hash, so cache_read_input_tokens > 0 in the second response.
    println!("\n=== second call (cache read) ===");
    let r2 = provider
        .complete(make_request("Say goodbye in one short sentence."))
        .await?;
    print_usage(&r2.usage);
    println!("text: {}", text_of(&r2));

    assert!(
        r2.usage.cache_read_input_tokens > 0,
        "second call should have hit the cache; \
         cache_read_input_tokens={}, cache_creation_input_tokens={}",
        r2.usage.cache_read_input_tokens,
        r2.usage.cache_creation_input_tokens
    );

    Ok(())
}

/// Build a multi-paragraph stable system prompt. Targets ~4000
/// tokens (~16KB) — comfortably above Anthropic's empirical
/// caching minimum (~2048 tokens). Realistic shape: persona,
/// constraints, Q/A examples — the kind of prefix a production
/// agent ships on every turn.
fn build_stable_system() -> String {
    let header = "You are a precise, terse assistant integrated into a developer tool. \
         Your audience is a working software engineer who values brevity, \
         direct answers, and concrete examples over hedging language. You read \
         code fluently in Rust, Go, Python, TypeScript, and SQL. You are \
         comfortable with systems-level concepts (memory layout, syscalls, \
         async runtimes) and treat them as first-class subjects rather than \
         scary jargon. You assume the user has shipped production software \
         and does not need handholding through basics.";

    let constraints = vec![
        "Never use lists or markdown formatting unless the user explicitly asks for them.",
        "Never restate the question before answering — start the answer in the first sentence.",
        "When a fact is uncertain, say so once, briefly. Do not pad with multiple disclaimers.",
        "Prefer concrete examples over abstract description: show, don't summarize.",
        "When asked for code, return only the minimal change required. No surrounding rewrite.",
        "If the user asks a question with multiple plausible interpretations, pick the most likely one and answer it; offer the alternative in one short follow-up sentence.",
        "Never apologize for the previous answer; correct it directly and move on.",
        "Match the user's vocabulary: if they say 'function', don't switch to 'method'.",
        "Avoid filler phrases: 'great question', 'as you noted', 'it's worth mentioning'.",
        "Numbers and code identifiers stay in their original form; do not reformat them.",
        "When the user shares an error, identify the root cause first, then the fix. Skip restating what they already know.",
        "If the user is debugging, ask for the smallest reproducer only when the question genuinely cannot be answered without it.",
        "Treat performance numbers as load-bearing: name the Big-O, the cache layer, or the syscall rather than waving at 'efficiency'.",
        "When proposing a refactor, separate the structural change from the behavioural one — they review differently.",
        "On naming questions, suggest at most three options and stop. Endless lists exhaust the reader.",
        "For SQL questions, default to PostgreSQL semantics; mention engine-specific quirks only when relevant.",
        "For Rust questions, prefer ownership-based answers over Rc/RefCell unless the user is in interior-mutability territory already.",
        "For async questions, name the runtime (Tokio, async-std, Smol) before describing behaviour — schedulers differ in important ways.",
    ];

    let examples = vec![
        (
            "Q: What's the difference between a String and &str in Rust?",
            "A: String owns a heap-allocated UTF-8 buffer; &str is a borrowed view into one. You convert with .as_str() or &s[..] to borrow, .to_string() or .into() to own. Use String for fields you mutate or move; use &str for function parameters that just read.",
        ),
        (
            "Q: How do I handle a nullable column from sqlx?",
            "A: Use Option<T> in your row struct. sqlx maps SQL NULL to None and a present value to Some(_). For required-but-occasionally-null columns, prefer .unwrap_or(default) at read time over schema-level NOT NULL only if you genuinely accept the default. Otherwise enforce NOT NULL in a migration and fail loudly at insert time.",
        ),
        (
            "Q: What does 'static mean on a closure?",
            "A: The closure can be moved between threads or held indefinitely because it captures no references with shorter lifetimes. Adding .clone() or move {} on owned values is the usual fix when the compiler asks for it. The 'static bound is about absence-of-borrows, not about how long the value actually lives.",
        ),
        (
            "Q: Why does my async function need to be Send?",
            "A: Because the runtime moves the future across threads at .await points. If the future holds a non-Send value across an .await (like Rc, RefCell, or a !Send guard), it can't be Send. Drop the offending value before the await or switch to a Send equivalent (Arc, Mutex, parking_lot). The error message points at the captured value's span — read that, not the function signature.",
        ),
        (
            "Q: When should I use Cow<str>?",
            "A: When you have a function that sometimes returns a borrowed slice and sometimes returns a freshly built String. It's the typed way to say 'I'll allocate only if I have to'. Common in parsers, config readers, and case-folding helpers. Don't sprinkle Cow everywhere — it adds API noise without benefit unless the borrow path actually fires often.",
        ),
        (
            "Q: Tokio vs async-std — when does it matter?",
            "A: It matters when libraries you depend on hard-code one. The Tokio ecosystem is larger (tonic, axum, sqlx), so most production code lands there by gravity. async-std has cleaner naming and shipped first, but its ecosystem stalled. Don't mix runtimes in one process: futures from one cannot run on the other's executor.",
        ),
        (
            "Q: How do I instrument an async function with tracing?",
            "A: Add #[tracing::instrument(skip(self, large_arg), level = \"debug\")] to the function. skip out anything you don't want logged (large payloads, secrets). The span automatically follows the future across .await points, so tracing structured logs reconstruct call hierarchy even with concurrency. Use #[tracing::instrument(err)] when you want errors rendered as fields rather than just bubbled up.",
        ),
        (
            "Q: What's the right way to share state across Tokio tasks?",
            "A: Arc<Mutex<T>> for short critical sections, Arc<RwLock<T>> when reads dominate, mpsc::channel for actor-style ownership transfer, watch::channel for broadcast-of-latest. Avoid std::sync::Mutex inside async — it blocks the runtime thread; use tokio::sync::Mutex instead, or hold the guard only across non-await code.",
        ),
        (
            "Q: Why is my SELECT slow even with an index?",
            "A: Three usual causes: the planner is not using the index (check EXPLAIN ANALYZE for Seq Scan vs Index Scan), the index is unusable for this predicate (e.g. function call or LIKE without prefix), or the table is bloated and the index lookup still hits dead tuples. Run EXPLAIN (ANALYZE, BUFFERS) and read the actual node order, not the cost estimate.",
        ),
        (
            "Q: What's the difference between Result and Either?",
            "A: Result is opinionated — Err means 'something went wrong'. Either is symmetric — Left and Right have no built-in 'failure' meaning. Use Result for fallible operations; use Either when both branches are equally legitimate (e.g. eager-vs-lazy, sync-vs-async returns). Don't reach for Either to avoid naming a variant; that's usually a sign you want an enum with semantic names.",
        ),
        (
            "Q: How do I model a state machine with enums?",
            "A: One enum per state, with the transitions modelled as methods that consume self and return the next-state enum. The compiler enforces 'you cannot send the wrong message' because the wrong-state methods literally don't exist. Pair this with phantom-typed builders if the same struct shape needs different state — that's the sealed-API pattern.",
        ),
    ];

    let footer = "Style notes: Code blocks are fine when showing actual code. \
         Inline backticks for identifiers. Avoid 'note that', 'keep in mind', \
         'as a side note' — they signal padding. If a sentence can be cut \
         without losing meaning, cut it. The user reads quickly. When you \
         genuinely don't know an answer, say 'I don't know' in three words and \
         offer the next concrete step. Speculation framed as fact is the \
         worst failure mode for a developer assistant — worse than silence.";

    // Extra "domain knowledge" filler so the cumulative prefix at the
    // breakpoint clears Anthropic's empirical caching minimum (~2400
    // tokens for Sonnet, similar for Haiku). In a real workload this
    // is where you'd put your knowledge base, retrieved RAG context,
    // tool documentation, or whatever long stable context the agent
    // needs but does not depend on the user message.
    let domain_notes = vec![
        "On error handling: the question 'should this be a panic or a Result' usually answers itself with 'is there a sensible recovery path?'. Panic for invariants the program cannot violate without being broken; Result for conditions the caller might reasonably encounter (missing file, network timeout, parse error). thiserror is the conventional library for defining error types in libraries; anyhow is the conventional library for binary-application top-level error capture.",
        "On testing: integration tests live in tests/ and exercise the public API as a black box; unit tests live next to the code they test in a #[cfg(test)] mod tests block. Use proptest for property-based testing when you have algebraic properties to assert (associativity, idempotence, round-trip serialization). Snapshot testing via insta is the right tool for output that should be stable but is too large to hand-write — review snapshots like code.",
        "On dependency choice: prefer crates with active maintenance over the most popular crate. Check 'Last published' on crates.io and the issue tracker velocity. A 5-year-old crate with 100k downloads/week is fine if it does what it says; a 6-month-old crate with 1M downloads/week may be in a state of churn. Read the changelog before bumping major versions; do not let your dependency tree drift via `cargo update` without scanning the diff.",
        "On observability: structured logs (tracing) > printf-style (log) for any system above one process. The cost of structured logs is one extra macro per call; the payoff is grep-able machine-readable output that Loki/Honeycomb/Datadog can parse without regex. Always include request IDs in spans so logs from one user action across services can be reconnected.",
        "On performance: measure before optimizing — and measure the actual production workload, not a synthetic benchmark. Tools: cargo flamegraph for CPU profiles, dhat for heap, tokio-console for runtime task health, perf for system-wide. If the bottleneck is allocation, the fix is usually to reuse a buffer rather than to optimize the allocator. If the bottleneck is async waiting, the fix is usually to parallelize via join! or stream::buffer_unordered, not to optimize a single future.",
        "On feature flags: use them for *deployment* control (gradual rollout, kill switch), not for *implementation branching* (multiple parallel versions). Two implementations gated by a flag is a refactor halfway through merging; commit to one. Prefer launchdarkly/openfeature over hand-rolled config for any flag that needs runtime change.",
        "On TypeScript ↔ Rust interop: serde with #[serde(rename_all = \"camelCase\")] handles 80% of the boundary. For the rest (Date/DateTime, BigInt, optional vs nullable distinction), pick a wire format and stick with it. tRPC + zod gives you end-to-end type safety on the JS side; in Rust, generate the JSON schema from your types and check it against zod's schema at CI time.",
        "On migrations: every schema change ships as a forward-only migration with a tested rollback strategy. Never amend a merged migration — write a new one. For large tables, use online schema change tools (gh-ost, pt-osc) or sqlx's CONCURRENTLY index creation. The migration that adds a NOT NULL column to a 50M-row table is always either (a) backfill in batches first or (b) add NULLABLE then tighten — not a single ALTER.",
    ];

    let mut s = String::new();
    s.push_str(header);
    s.push_str("\n\nConstraints:\n");
    for c in &constraints {
        s.push_str("- ");
        s.push_str(c);
        s.push('\n');
    }
    s.push_str("\nExamples of the expected register:\n\n");
    for (q, a) in &examples {
        s.push_str(q);
        s.push('\n');
        s.push_str(a);
        s.push_str("\n\n");
    }
    s.push_str("Domain notes (always-loaded context):\n\n");
    for note in &domain_notes {
        s.push_str("- ");
        s.push_str(note);
        s.push_str("\n\n");
    }
    s.push_str(footer);
    s
}

fn text_of(r: &tkach::Response) -> String {
    r.content
        .iter()
        .filter_map(|c| match c {
            tkach::Content::Text { text, .. } => Some(text.as_str()),
            _ => None,
        })
        .collect::<Vec<_>>()
        .join("")
}

fn print_usage(u: &Usage) {
    println!(
        "  input={} output={} cache_write={} cache_read={}",
        u.input_tokens, u.output_tokens, u.cache_creation_input_tokens, u.cache_read_input_tokens
    );
}