# aprender-serve 0.33.0
# Pure Rust ML inference engine built from scratch - model serving for GGUF and safetensors
---
# Contract provenance: version, authorship, and the external documents this
# specification is derived from. The GH-219 reference is the issue that
# motivated the PTX parity checks defined below.
metadata:
  version: "1.0.0"
  created: "2026-03-21"
  author: "PAIML Engineering"
  description: "GPU kernel dispatch and PTX parity validation contracts"
  references:
    - "NVIDIA CUDA Programming Guide (2024) — Kernel Launch Configuration"
    - "GH-219: PTX Parity Validation specification"

# Formal properties the implementation must satisfy. Each entry states a
# formula, the domain over which it holds, and the invariants checked by the
# proof obligations and falsification tests further down in this file.
equations:
  # Batched and single-row variants of each kernel must emit structurally
  # equivalent PTX; applies to the six kernel pairs enumerated in `domain`.
  ptx_parity:
    formula: "∀ kernel_pair (batched, single): structural_equivalence(batched.ptx, single.ptx)"
    domain: "6 kernel pairs: RmsNorm, Rope, Swiglu, ResidualAdd, Q4KGemv, Q6KGemv"
    invariants:
      - "Batched kernel has batch dispatch mechanism (ctaid.y or m_dim)"
      - "No u64 shared memory addressing (must use u32)"
      - "Shared memory size matches between batched and reference"
  # CPU/GPU routing is a pure function of input size vs. a fixed threshold,
  # and either path must yield numerically equivalent results.
  dispatch_strategy:
    formula: "dispatch = if data_size > gpu_threshold then GPU else CPU"
    domain: "data_size ∈ ℕ, gpu_threshold > 0"
    invariants:
      - "CPU and GPU produce numerically equivalent results"
      - "Dispatch decision is deterministic for given input size"
  # Launch geometry: grid covers the full problem (ceil-division per axis),
  # with the batch count mapped onto the third grid dimension.
  kernel_dimensions:
    formula: "grid = (ceil(n/block_x), ceil(m/block_y), batch_size)"
    domain: "block_x, block_y > 0, n, m > 0"
    invariants:
      - "Thread count covers all elements: grid * block ≥ problem_size"
      - "Shared memory does not exceed device limit"
  # Observability counters: gpu_ratio is a proper fraction of total
  # dispatches, so the counters must always sum consistently.
  dispatch_metrics:
    formula: "gpu_ratio = gpu_dispatches / (cpu_dispatches + gpu_dispatches)"
    domain: "dispatches ∈ ℕ"
    invariants:
      - "0 ≤ gpu_ratio ≤ 1"
      - "total_dispatches = cpu_dispatches + gpu_dispatches"

proof_obligations:
  - type: invariant
    property: "PTX parity for all 6 kernel pairs"
    formal: "validate_all_kernel_pairs(dims).all_passed() = true"
    applies_to: all
  - type: invariant
    property: "Dispatch metrics consistency"
    formal: "total_dispatches = cpu_dispatches + gpu_dispatches at all times"
    applies_to: all
  - type: invariant
    property: "CPU-GPU numerical equivalence"
    formal: "|cpu_result - gpu_result| < eps for identical inputs"
    tolerance: 1.0e-3
    applies_to: all
  - type: bound
    property: "GPU ratio bounded"
    formal: "0.0 ≤ gpu_ratio ≤ 1.0"
    applies_to: all

falsification_tests:
  - id: FALSIFY-KERN-001
    rule: "Dispatch metric consistency"
    prediction: "total = cpu + gpu after N dispatches"
    test: test_dispatch_metrics_record_dispatches
    if_fails: "Atomic counter race condition or off-by-one"
  - id: FALSIFY-KERN-002
    rule: "GPU ratio bounds"
    prediction: "gpu_ratio in [0.0, 1.0] including edge cases"
    test: test_dispatch_metrics_gpu_ratio
    if_fails: "Division by zero when no dispatches recorded"
  - id: FALSIFY-KERN-003
    rule: "PTX parity report completeness"
    prediction: "Report covers all 6 kernel pairs"
    test: test_validate_all_kernel_pairs_no_cuda_gh219
    if_fails: "Missing kernel pair registration in validator"
  - id: FALSIFY-KERN-004
    rule: "Latency histogram bounds"
    prediction: "Sum of bucket counts = total latency count"
    test: test_dispatch_metrics_cpu_latency
    if_fails: "Bucket boundary assignment drops or double-counts"

kani_harnesses:
  - id: kern-kani-001
    obligation: "GPU ratio bounded"
    bound: 4
    strategy: bounded_int
  - id: kern-kani-002
    obligation: "Dispatch metrics consistency"
    bound: 8
    strategy: bounded_int