aprender-train 0.41.0

Training & Optimization library with autograd, LoRA, quantization, and model merging
Documentation
metadata:
  version: "1.0.0"
  created: "2026-03-21"
  author: "PAIML Engineering"
  description: "Numerically stable softmax activation contracts"
  references:
    - "Goodfellow et al. (2016) Deep Learning, Section 4.1 Numerical Stability"

equations:
  softmax:
    lean_theorem: "ProvableContracts.Theorems.Softmax.PartitionOfUnity"
    formula: "softmax(x)_i = exp(x_i - max(x)) / sum_j(exp(x_j - max(x)))"
    domain: "x in R^n, n >= 1"
    preconditions:
      - "!a.data().is_empty()"
      - "a.data().iter().all(|v| v.is_finite())"
    postconditions:
      - "ret.len() == a.len()"
      - "ret.data().iter().all(|v| *v >= 0.0)"
      - "(ret.data().iter().copied().sum::<f32>() - 1.0).abs() < 1e-6"
    invariants:
      - "Output sums to 1: sum(softmax(x)) = 1"
      - "All outputs non-negative: softmax(x)_i >= 0"
      - "Monotonic: x_i > x_j implies softmax(x)_i > softmax(x)_j"
  log_sum_exp:
    lean_theorem: "ProvableContracts.Theorems.Softmax.ShiftInvariance"
    formula: "log(sum(exp(x))) = max(x) + log(sum(exp(x - max(x))))"
    domain: "x in R^n"
    preconditions:
      - "!a.data().is_empty()"
      - "a.data().iter().all(|v| v.is_finite())"
    postconditions:
      - "ret.is_finite()"
    invariants:
      - "Subtraction of max prevents overflow in exp()"
      - "Result finite for all finite inputs"

proof_obligations:
  - type: invariant
    property: "Probability distribution"
    formal: "sum(softmax(x)) = 1 and softmax(x)_i >= 0 for all i"
    applies_to: all
  - type: bound
    property: "No overflow"
    formal: "softmax(x) finite for all finite x (via max subtraction)"
    applies_to: all
  - type: equivalence
    property: "Backward gradient correct"
    formal: "|analytical_grad - numerical_grad| < 1e-3"
    tolerance: 1.0e-3
    applies_to: all

falsification_tests:
  - id: FALSIFY-SOFTMAX-001
    rule: "Softmax sums to one"
    prediction: "|sum(softmax(x)) - 1.0| < 1e-6 for arbitrary input"
    test: "autograd::tests::unit_ops::test_softmax_forward"
    if_fails: "Normalization denominator incorrect"
  - id: FALSIFY-SOFTMAX-002
    rule: "Softmax backward gradient check"
    prediction: "Analytical gradient matches finite-difference gradient"
    test: "autograd::tests::unit_ops::test_softmax_backward_gradient_check"
    if_fails: "Jacobian-vector product formula wrong"
  - id: FALSIFY-SOFTMAX-003
    rule: "Numerical stability with large inputs"
    prediction: "softmax([1000, 1001, 1002]) produces valid probabilities, no NaN/Inf"
    test: "train::loss::cross_entropy::tests::test_softmax_numerical_stability"
    if_fails: "Max-subtraction trick not applied before exp()"
  - id: FALSIFY-SOFTMAX-004
    rule: "Distillation softmax sums to one"
    prediction: "Softmax over logits in distillation pipeline produces valid distribution"
    test: "hf_pipeline::distillation::tests::test_softmax_sums_to_one"
    if_fails: "Distillation-specific softmax diverges from reference implementation"
  - id: FALSIFY-SOFTMAX-005
    rule: "Softmax all positive"
    prediction: "All softmax outputs strictly non-negative"
    test: "hf_pipeline::distillation::tests::test_softmax_all_positive"
    if_fails: "Negative values produced by exp() underflow or sign error"

kani_harnesses:
  - id: softmax-kani-001
    obligation: "Output sums to 1"
    bound: 4
    strategy: stub_float