---
# Contract metadata: provenance and scope of this specification document.
metadata:
  version: "1.0.0"
  created: "2026-03-19"
  author: "PAIML Engineering"
  description: "Training loop convergence and gradient correctness"
  references:
    - "Kingma & Ba (2014) Adam: A Method for Stochastic Optimization"
# Formal equations: each entry states a formula, its domain, and the
# pre/postconditions and invariants the implementation must uphold.
# Condition strings use Rust expression syntax (e.g. `!params.is_empty()`).
equations:
  loss_decrease:
    # NOTE(review): the theorem name says CrossEntropy.NonNegativity, but the
    # formula below states expected-loss decrease — confirm the Lean theorem
    # actually proves this property (or link the correct theorem).
    lean_theorem: "ProvableContracts.Theorems.CrossEntropy.NonNegativity"
    formula: "E[L(θ_{t+1})] ≤ E[L(θ_t)] for sufficiently small lr"
    domain: "θ ∈ ℝ^d, lr ∈ (0, lr_max)"
    preconditions:
      - "lr > 0.0"
      - "lr < lr_max"
      - "!params.is_empty()"
    postconditions:
      - "loss.is_finite()"
      - "!loss.is_nan()"
    invariants:
      - "Loss monotonically non-increasing (in expectation)"
  gradient_norm:
    # NOTE(review): formula uses strict `<` but the postcondition below uses
    # `<=` — confirm which bound the clipping implementation guarantees.
    formula: "|∇L(θ)| < clip_value after gradient clipping"
    domain: "θ ∈ ℝ^d, clip_value > 0"
    preconditions:
      - "clip_value > 0.0"
      - "!gradients.is_empty()"
      - "gradients.iter().all(|g| g.is_finite())"
    postconditions:
      - "grad_norm <= clip_value"
      - "gradients.iter().all(|g| g.is_finite())"
    invariants:
      - "Gradient norm bounded by clip_value"
# Proof obligations: machine-checkable claims derived from the equations.
# `applies_to: all` means the obligation holds for every training configuration.
proof_obligations:
  - type: bound
    property: "Gradient norm bounded after clipping"
    formal: "|grad| ≤ clip_value after clip_grad_norm"
    applies_to: all
  - type: invariant
    property: "Loss finite"
    formal: "loss is not NaN or Inf at any training step"
    applies_to: all
  - type: equivalence
    property: "CUDA forward matches CPU"
    formal: "|forward_cuda(x) - forward_cpu(x)| < eps per element"
    # Per-element absolute tolerance for the CUDA/CPU parity check.
    tolerance: 1.0e-4
    applies_to: all
# Falsification tests: each gives a prediction, the experiment that would
# falsify it, and the diagnosis if the test fails.
falsification_tests:
  - id: FALSIFY-TRAIN-001
    rule: "Gradient clipping"
    prediction: "Gradient norm ≤ clip_value after clipping"
    test: "Inject large gradients, verify clipped"
    if_fails: "clip_grad_norm not applied"
  - id: FALSIFY-TRAIN-002
    rule: "Loss finite"
    prediction: "No NaN/Inf in loss for 100 steps"
    test: "Train on random data, check each step"
    if_fails: "Numeric overflow in loss computation"
  - id: FALSIFY-TRAIN-003
    rule: "CUDA/CPU parity"
    prediction: "Forward pass produces same output within tolerance"
    test: "Same input, compare CUDA and CPU outputs"
    if_fails: "CUDA kernel dispatch error"
# Kani model-checking harnesses, keyed by obligation text.
kani_harnesses:
  - id: train-kani-001
    obligation: "Gradient norm bounded after clipping"
    # NOTE(review): presumably the loop-unwinding bound passed to Kani — confirm.
    bound: 4
    strategy: stub_float