aprender-serve 0.34.0

Pure Rust ML inference engine built from scratch - model serving for GGUF and safetensors
metadata:
  version: "1.0.0"
  created: "2026-03-21"
  author: "PAIML Engineering"
  description: "Model dispatch contracts for CPU/GPU backend selection"
  references:
    - "realizar architecture: unified CPU/GPU dispatch with automatic backend selection"

equations:
  dispatch_decision:
    formula: "backend = if gpu_available ∧ data_size > threshold then GPU else CPU"
    domain: "data_size ∈ ℕ, threshold > 0"
    invariants:
      - "Decision is deterministic for given (gpu_available, data_size, threshold)"
      - "CPU fallback always available"
  dispatch_counter:
    formula: "total = cpu_count + gpu_count (atomic counters)"
    domain: "cpu_count, gpu_count ∈ ℕ"
    invariants:
      - "Counters are monotonically non-decreasing"
      - "Atomic operations ensure thread-safety"
  latency_tracking:
    formula: "avg_latency = sum_latency / count, variance = sum_sq/count - avg^2"
    domain: "count > 0, sum_latency ≥ 0"
    invariants:
      - "min_latency ≤ avg_latency ≤ max_latency"
      - "Histogram bucket counts sum to total count"
  gpu_buffer_limit:
    formula: "exceeds_limit = (size_bytes > max_gpu_buffer_bytes)"
    domain: "size_bytes ∈ ℕ, max_gpu_buffer_bytes > 0"
    invariants:
      - "Operations exceeding GPU buffer limit dispatch to CPU"
      - "Limit check happens before GPU memory allocation"

proof_obligations:
  - type: invariant
    property: "Dispatch counter consistency"
    formal: "total_dispatches = cpu_dispatches + gpu_dispatches at all times"
    applies_to: all
  - type: invariant
    property: "GPU ratio in [0, 1]"
    formal: "0.0 ≤ gpu_ratio() ≤ 1.0, returns 0.0 when total = 0"
    applies_to: all
  - type: invariant
    property: "Latency min/max ordering"
    formal: "min_latency ≤ max_latency after any recording"
    applies_to: all
  - type: invariant
    property: "CPU fallback correctness"
    formal: "CPU path produces valid results when GPU is unavailable"
    applies_to: all

falsification_tests:
  - id: FALSIFY-DISP-001
    rule: "Counter consistency"
    prediction: "total = cpu + gpu after concurrent dispatches"
    test: test_dispatch_metrics_record_dispatches
    if_fails: "Atomic ordering insufficient for consistency"
  - id: FALSIFY-DISP-002
    rule: "GPU ratio zero-total"
    prediction: "gpu_ratio() = 0.0 when no dispatches recorded"
    test: test_dispatch_metrics_gpu_ratio
    if_fails: "Division by zero on empty counters"
  - id: FALSIFY-DISP-003
    rule: "Latency histogram exhaustive"
    prediction: "Sum of all histogram buckets = latency_count"
    test: test_dispatch_metrics_cpu_latency
    if_fails: "Bucket boundary logic drops or double-counts samples"
  - id: FALSIFY-DISP-004
    rule: "Backend auto-detection"
    prediction: "GpuCompute::auto() succeeds on all platforms (CPU fallback)"
    test: test_gpu_compute_auto_creation
    if_fails: "Auto mode does not fall back to CPU when GPU unavailable"
  - id: FALSIFY-DISP-005
    rule: "Latency min ≤ max"
    prediction: "After recording any latency, min ≤ max"
    test: test_dispatch_metrics_gpu_latency
    if_fails: "AtomicU64 fetch_min/fetch_max initialization wrong (MAX/0 init)"

kani_harnesses:
  - id: disp-kani-001
    obligation: "GPU ratio in [0, 1]"
    bound: 4
    strategy: bounded_int
  - id: disp-kani-002
    obligation: "Dispatch counter consistency"
    bound: 8
    strategy: bounded_int