pf-cache 1.0.1

ProcessFork cache layer: paged KV-cache serialization with batch-invariant kernels
Documentation
// SPDX-License-Identifier: MIT
//! Phase-4 acceptance: real bit-exact replay against vLLM ≥ 0.10 in
//! batch-invariant mode, on a CUDA host.
//!
//! Gated behind `$PF_HAS_GPU=1`. The test boots vLLM as a subprocess with
//! `--enforce-deterministic`, runs Llama-3-8B for ~50 tokens, snapshots,
//! kills the worker, restores into a fresh worker, runs another 100 tokens
//! on each branch, and asserts logit-bit-identical output.
//!
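//! On hosts without `PF_HAS_GPU=1` (the build-host default) the test
//! prints a skip notice via `eprintln!` and returns, so the workspace gate
//! stays green. With `PF_HAS_GPU=1` set the test currently panics, because
//! the vLLM subprocess flow described above is not wired up yet; once it
//! is, operators on a Hopper-class box can run this without code changes.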

/// Phase-4 spec gate: bit-exact replay against a real vLLM worker.
///
/// Behaviour today:
/// * `PF_HAS_GPU` is anything other than exactly `"1"` (including unset):
///   print a skip notice to stderr and return, so the test passes.
/// * `PF_HAS_GPU` is `"1"`: panic, because the subprocess flow below has
///   not been implemented yet.
///
/// Planned flow, to replace the panic once adapters/pf-vllm boots in
/// Phase 10:
///   1. spawn `vllm serve meta-llama/Llama-3-8B --enforce-deterministic
///      --port 18001` in a subprocess.
///   2. wait for /health.
///   3. POST /v1/completions with a fixed prompt for 50 tokens; record
///      the logits of token #49.
///   4. POST /v1/processfork/snapshot  → cid.
///   5. SIGKILL the worker.
///   6. spawn a fresh worker, POST /v1/processfork/checkout cid.
///   7. POST /v1/completions for 100 more tokens; record logits.
///   8. assert that the resumed branch's logits at token #49+1 are
///      bit-equal to the original branch's hypothetical token #50.
#[test]
fn vllm_bit_exact_replay() {
    // Only the exact value "1" opts in; unset, non-UTF-8, or any other
    // value takes the skip path.
    let gpu_opted_in = matches!(std::env::var("PF_HAS_GPU").as_deref(), Ok("1"));

    if gpu_opted_in {
        // The skeleton is kept so that on a CUDA box the operator only has
        // to set PF_HAS_GPU=1 and add the vllm subprocess wiring (the
        // adapter, in adapters/pf-vllm) for this test to do real work.
        panic!(
            "PF_HAS_GPU=1 set but pf-vllm adapter not yet wired \
             (Phase 10 deliverable). Once that lands, replace this panic \
             with the real subprocess flow."
        );
    }

    // No GPU opt-in: report the skip and let the test pass.
    eprintln!(
        "skipping: needs PF_HAS_GPU=1 + vllm ≥ 0.10 + a CUDA device. \
         This test is the Phase-4 spec gate; it runs in the GPU CI \
         matrix nightly. The build-host proxy lives in \
         tests/cache_round_trip.rs and runs everywhere."
    );
}