# apr-cli 0.29.2
# CLI tool for APR model inspection, debugging, and operations
# Documentation
---
# Transpose Kernel Contract v1.0.0
# THE SOURCE OF TRUTH for matrix transpose correctness and performance
#
# STATUS: Authoritative — all transpose operations MUST follow this contract
# CONSUMERS:
#   - src/autograd/ops/activation.rs (Tensor::transpose)
#   - src/primitives/matrix.rs (Matrix::transpose)
#   - src/nn/transformer/positional_encoding.rs (transpose_last_two)
#   - crates/aprender-bench-compute/benches/transpose.rs
#
# ENFORCEMENT: benchmark gate + tiled pattern mandate
#
# Mathematical Foundation:
#   A^T[j][i] = A[i][j] for 0 ≤ i < rows, 0 ≤ j < cols
#   Cache analysis: naive transpose has O(cols) stride on writes,
#   causing one cache miss per element at large dimensions.
#   Tiled transpose keeps both strides within TILE×sizeof(f32) ≤ L1.
#
# Citation: Lam, Rothberg & Wolf (1991). "The Cache Performance and
#   Optimizations of Blocked Algorithms." ASPLOS IV.

metadata:
  version: "1.0.0"
  created: "2026-03-02"
  author: "PAIML Engineering"
  description: "Correctness and performance specification for matrix transpose"
  references:
    - "GH-388: Transpose 242x slower than ndarray at attention shapes"
    - "Lam, Rothberg & Wolf (1991). Cache Performance of Blocked Algorithms. ASPLOS"
    - "contracts/tensor-layout-v1.yaml (LAYOUT-002: row-major mandate)"
  issues:
    - "https://github.com/paiml/aprender/issues/388"

# =============================================================================
# KERNEL EQUATION
# =============================================================================

equations:
  transpose_2d:
    formula: "B[j * rows + i] = A[i * cols + j]"
    domain: "∀i ∈ [0, rows), ∀j ∈ [0, cols)"
    properties:
      - involution: "transpose(transpose(A)) == A"
      - shape: "shape(transpose(A)) == (cols, rows) where shape(A) == (rows, cols)"
      - trace_invariant: "trace(A) == trace(transpose(A)) for square A"
      - determinant_invariant: "det(A) == det(transpose(A))"

  transpose_last_two:
    formula: "B[b][h][j][i] = A[b][h][i][j]"
    domain: "∀b ∈ [0, batch), ∀h ∈ [0, heads), ∀i ∈ [0, seq), ∀j ∈ [0, dim)"
    description: "Batched transpose of last two dimensions (attention K^T)"

# =============================================================================
# IMPLEMENTATION MANDATES
# =============================================================================

implementation:
  simd_transpose_delegation:
    description: "Tensor::transpose delegates to trueno's AVX2 8×8 micro-kernel"
    contract: "provable-contracts/contracts/transpose-kernel-v1.yaml"
    assertion: "Calls trueno::blis::transpose::transpose(rows, cols, src, &mut data)"
    rationale: |
      AVX2 8×8 in-register transpose via 3-phase shuffle/permute.
      8 contiguous 32-byte stores per block (vs 64 scattered 4-byte stores).
      4.0x improvement at 2048×128 (459M → 1.85G).
    location: "src/autograd/ops/activation.rs:Tensor::transpose"

  tiled_transpose:
    description: "Matrix::transpose and transpose_last_two use TILE=32 blocking"
    tile_size: 32
    rationale: "Fallback for non-Tensor paths; auto-vectorizable by LLVM"
    locations:
      - "src/primitives/matrix.rs:Matrix::transpose"
      - "src/nn/transformer/positional_encoding.rs:transpose_last_two"

  zero_copy_output:
    description: "MUST use Tensor::from_vec / Matrix::from_vec for output"

  row_major:
    description: "Both input and output are row-major (LAYOUT-002)"

# =============================================================================
# PERFORMANCE BOUNDS
# =============================================================================

performance:
  benchmark_crate: "aprender-bench-compute"
  benchmark_file: "benches/transpose.rs"
  reference: "ndarray .t().to_owned() (cache-oblivious algorithm)"

  bounds:
    transpose_2048x128:
      min_ratio_vs_ndarray: 0.20
      target_ratio: 0.50
      measured_ratio: 0.35
      measured_date: "2026-03-02"
      history:
        - { date: "2026-03-01", ratio: 0.004, note: "Naive double loop (242x slower)" }
        - { date: "2026-03-02", ratio: 0.07, note: "Tiled 16×16" }
        - { date: "2026-03-02", ratio: 0.15, note: "Tiled 16×16 + from_vec" }
        - { date: "2026-03-02", ratio: 0.068, note: "Tiled 32×32 + src_base hoist (459 M vs 6.79 G)" }
        - { date: "2026-03-02", ratio: 0.28, note: "AVX2 8×8 micro-kernel (1.85 G vs 6.61 G)" }
        - { date: "2026-03-02", ratio: 0.35, note: "Two-level tiling 64×64/8×8 (2.34 G vs 6.61 G)" }

    transpose_4096x4096:
      min_ratio_vs_ndarray: 0.50
      target_ratio: 0.90
      measured_ratio: 0.90
      measured_date: "2026-03-02"
      history:
        - { date: "2026-03-02", ratio: 0.35, note: "Tiled 16×16 + from_vec" }
        - { date: "2026-03-02", ratio: 0.48, note: "Tiled 32×32 + src_base hoist (241 M vs 500 M)" }
        - { date: "2026-03-02", ratio: 0.88, note: "AVX2 8×8 micro-kernel (358 M vs 406 M)" }
        - { date: "2026-03-02", ratio: 0.90, note: "Two-level tiling 64×64/8×8 (438 M vs 488 M)" }

  remaining_gap_analysis: |
    Two-level tiling (64×64 outer for L1, 8×8 inner AVX2) reached near-parity:
    - 2048×128: 0.004x → 0.35x (88x improvement from naive baseline)
    - 4096×4096: 0.48x → 0.90x (near parity)

    IMPORTANT: ndarray .t().to_owned() does NOT rearrange data. It flips strides
    (row-major → F-order) then does memcpy via slc.to_vec(). The 6.61G throughput
    at 2048×128 is memcpy bandwidth, not transpose computation. True data-rearranging
    transpose at 2.34G is a legitimate result — the "gap" is an apples-to-oranges
    comparison. At 4096×4096 where both approaches are memory-bound, we reach 0.90x.

# =============================================================================
# FALSIFICATION TESTS
# =============================================================================

falsification:
  tests_file: "src/primitives/matrix_tests.rs"

  FALSIFY-TR-001:
    name: "Involution"
    assertion: "transpose(transpose(A)) == A (bitwise exact)"
    status: "PASS"

  FALSIFY-TR-002:
    name: "Shape correctness"
    assertion: "shape(transpose([3,5])) == (5, 3)"
    status: "PASS"

  FALSIFY-TR-003:
    name: "Element correctness"
    assertion: "transpose(A)[j][i] == A[i][j] for random A"
    status: "PASS"

  FALSIFY-TR-004:
    name: "Non-square correctness"
    assertion: "transpose works for rectangular matrices (2048×128, 128×2048)"
    status: "PASS"

  FALSIFY-TR-005:
    name: "Identity transpose"
    assertion: "transpose(I) == I for identity matrix"
    status: "PASS"

# =============================================================================
# QA GATE
# =============================================================================

qa_gate:
  id: "F-TRANSPOSE-001"
  name: "Transpose Kernel Contract"
  checks:
    - "Tensor::transpose delegates to trueno AVX2 8×8 micro-kernel"
    - "Benchmark ratio >= min_ratio for all measured bounds"
    - "All FALSIFY tests pass"
    - "No naive (non-tiled) transpose in production code"
  pass_criteria: "All checks pass"