metadata:
version: "1.0.0"
created: "2026-03-02"
author: "PAIML Engineering"
description: "Correctness and performance specification for normalization kernels"
references:
- "GH-381: RMSNorm 4.2x slower than ndarray"
- "GH-382: LayerNorm 4.2x slower than ndarray"
- "Ba, Kiros & Hinton (2016). Layer Normalization. arXiv:1607.06450"
- "Zhang & Sennrich (2019). Root Mean Square Layer Normalization. NeurIPS"
issues:
- "https://github.com/paiml/aprender/issues/381"
- "https://github.com/paiml/aprender/issues/382"
equations:
rmsnorm:
formula: "y_i = x_i / sqrt(mean(x²) + ε) * γ_i"
expanded: "y_i = x_i * (1 / sqrt(Σ_j x_j² / n + ε)) * γ_i"
domain: "x ∈ R^n, γ ∈ R^n, ε > 0"
properties:
      - scale_equivariant: "rmsnorm(α·x, γ) = sign(α)·rmsnorm(x, γ) for α ≠ 0 (exact as ε → 0)"
- unit_scale: "mean(rmsnorm(x, ones)²) ≈ 1.0"
- no_centering: "RMSNorm does NOT subtract mean (unlike LayerNorm)"
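    example:
      description: "Hedged standalone Rust sketch of the formula above; rms_norm_ref is illustrative, not aprender's API. Demonstrates the unit_scale property."
      code: |
        fn rms_norm_ref(x: &[f32], gamma: &[f32], eps: f32) -> Vec<f32> {
            // mean(x²), then the shared 1/rms factor
            let ms = x.iter().map(|v| v * v).sum::<f32>() / x.len() as f32;
            let inv_rms = 1.0 / (ms + eps).sqrt();
            x.iter().zip(gamma).map(|(xi, gi)| xi * inv_rms * gi).collect()
        }

        fn main() {
            let x = [3.0f32, -4.0, 0.0, 5.0];
            let y = rms_norm_ref(&x, &[1.0; 4], 1e-5);
            // unit_scale: mean(rmsnorm(x, ones)²) ≈ 1.0
            let mean_sq = y.iter().map(|v| v * v).sum::<f32>() / y.len() as f32;
            assert!((mean_sq - 1.0).abs() < 1e-4);
        }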
layernorm:
formula: "y_i = (x_i - μ) / sqrt(σ² + ε) * γ_i + β_i"
expanded: |
μ = Σ_j x_j / n
σ² = Σ_j (x_j - μ)² / n
y_i = (x_i - μ) / sqrt(σ² + ε) * γ_i + β_i
domain: "x ∈ R^n, γ ∈ R^n, β ∈ R^n, ε > 0"
properties:
- zero_mean: "mean(layernorm(x, ones, zeros)) ≈ 0.0"
- unit_variance: "var(layernorm(x, ones, zeros)) ≈ 1.0"
      - shift_invariant: "layernorm(x + c, γ, β) ≈ layernorm(x, γ, β) (exact in real arithmetic; approximate under f32)"
      - scale_invariant: "layernorm(α·x, γ, β) ≈ layernorm(x, γ, β) for α > 0 (exact as ε → 0)"
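    example:
      description: "Hedged standalone Rust sketch of the formula above; layer_norm_ref is illustrative, not aprender's API. Demonstrates the zero_mean and shift_invariant properties."
      code: |
        fn layer_norm_ref(x: &[f32], gamma: &[f32], beta: &[f32], eps: f32) -> Vec<f32> {
            let n = x.len() as f32;
            let mean = x.iter().sum::<f32>() / n;
            let var = x.iter().map(|v| (v - mean) * (v - mean)).sum::<f32>() / n;
            let inv_std = 1.0 / (var + eps).sqrt();
            x.iter()
                .zip(gamma.iter().zip(beta))
                .map(|(xi, (gi, bi))| (xi - mean) * inv_std * gi + bi)
                .collect()
        }

        fn main() {
            let x = [1.0f32, 2.0, 3.0, 4.0];
            let (ones, zeros) = ([1.0f32; 4], [0.0f32; 4]);
            let y = layer_norm_ref(&x, &ones, &zeros, 1e-5);
            // zero_mean: mean(layernorm(x, ones, zeros)) ≈ 0.0
            assert!((y.iter().sum::<f32>() / 4.0).abs() < 1e-5);
            // shift_invariant: layernorm(x + 100, γ, β) ≈ layernorm(x, γ, β)
            let shifted: Vec<f32> = x.iter().map(|v| v + 100.0).collect();
            let y2 = layer_norm_ref(&shifted, &ones, &zeros, 1e-5);
            for (a, b) in y.iter().zip(&y2) {
                assert!((a - b).abs() < 1e-4);
            }
        }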
implementation:
multi_pass_auto_vectorizable:
description: "Norms MUST use multi-pass auto-vectorizable pattern"
rmsnorm_pattern: |
// Pass 1: sum of squares — auto-vectorizable
let mut sum_sq = 0.0f32;
for &val in slice { sum_sq += val * val; }
let inv_rms = 1.0 / (sum_sq / n as f32 + eps).sqrt();
// Pass 2: fused normalize + scale — auto-vectorizable
for i in 0..n { out[i] = slice[i] * inv_rms * weight[i]; }
layernorm_pattern: |
// Pass 1: mean — auto-vectorizable
let mut sum = 0.0f32;
for &val in slice { sum += val; }
let mean = sum / n as f32;
// Pass 2: variance — auto-vectorizable
let mut var_sum = 0.0f32;
for &val in slice { let d = val - mean; var_sum += d * d; }
let inv_std = 1.0 / (var_sum / n as f32 + eps).sqrt();
// Pass 3: fused normalize + scale + shift — auto-vectorizable
for i in 0..n { out[i] = (slice[i] - mean) * inv_std * weight[i] + bias[i]; }
rationale: "Explicit indexed loops with pre-allocated output enable LLVM SIMD auto-vectorization"
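    complete_example:
      description: "Hedged sketch wrapping the RMSNorm pattern into one compilable function on plain slices; the signature is illustrative, not src/nn/functional.rs::rms_norm itself"
      code: |
        // The two passes from rmsnorm_pattern plus the pre-allocated output
        // required below. Plain slices keep the sketch self-contained.
        fn rms_norm_sketch(slice: &[f32], weight: &[f32], eps: f32) -> Vec<f32> {
            let n = slice.len();
            // Pass 1: sum of squares — auto-vectorizable
            let mut sum_sq = 0.0f32;
            for &val in slice { sum_sq += val * val; }
            let inv_rms = 1.0 / (sum_sq / n as f32 + eps).sqrt();
            // Pass 2: fused normalize + scale — auto-vectorizable
            let mut out = vec![0.0f32; n];
            for i in 0..n { out[i] = slice[i] * inv_rms * weight[i]; }
            out
        }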
pre_allocated_output:
description: "MUST pre-allocate output buffer and use Tensor::from_vec"
assertion: "No Tensor::new(&data, ...) in norm implementations"
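    example:
      description: "Hedged fragment (same convention as the patterns above) showing the required construction; Tensor::from_vec and Tensor::new are named by this spec, but the exact signatures shown are assumptions"
      code: |
        // Fill a pre-allocated Vec, then move it into the tensor (no copy).
        let mut out = vec![0.0f32; n];
        for i in 0..n { out[i] = slice[i] * inv_rms * weight[i]; }
        let y = Tensor::from_vec(out, shape); // assumed signature: moves the buffer
        // Forbidden here: Tensor::new(&out, shape) — copies from a borrowed slice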
one_path_rule:
description: "Module norms delegate to functional norms (UCBD §4)"
canonical_sources:
- "src/nn/functional.rs::rms_norm"
- "src/nn/functional.rs::layer_norm"
performance:
benchmark_crate: "aprender-bench-compute"
benchmark_file: "benches/norms.rs"
  reference: "ndarray scalar implementation (same 2/3-pass algorithm)"
  ratio_definition: "ratio = ndarray_time / aprender_time, i.e. aprender speed as a fraction of ndarray's (1.0 = parity; 0.24 ≈ 4.2x slower, matching GH-382)"
bounds:
rmsnorm_4096:
min_ratio_vs_ndarray: 0.40
target_ratio: 0.65
measured_ratio: 0.51
measured_date: "2026-03-02"
history:
- { date: "2026-03-01", ratio: 0.22, note: "Before auto-vectorizable loops" }
- { date: "2026-03-02", ratio: 0.51, note: "After multi-pass + from_vec" }
layernorm_4096:
min_ratio_vs_ndarray: 0.35
target_ratio: 0.55
measured_ratio: 0.42
measured_date: "2026-03-02"
history:
- { date: "2026-03-01", ratio: 0.24, note: "Before auto-vectorizable loops" }
- { date: "2026-03-02", ratio: 0.42, note: "After 3-pass + from_vec" }
remaining_gap_analysis: |
    The remaining ~2x gap comes from ndarray's tighter memory model:
- ndarray operates directly on contiguous buffer with zero metadata
- Tensor::from_vec still has shape Vec allocation + autograd field setup
- ndarray's dot/sum use BLAS-like tight loops with prefetching
- Further optimization: fused norm+GEMV at inference time (see kernel-fusion-v1.yaml)
falsification:
  test_files:
- "tests/contracts/rmsnorm_contract.rs"
- "tests/contracts/layernorm_contract.rs"
FALSIFY-NORM-001:
name: "RMSNorm unit scale"
assertion: "mean(rmsnorm(x, ones)²) ≈ 1.0 (within 1e-4)"
status: "PASS"
FALSIFY-NORM-002:
name: "LayerNorm zero mean"
assertion: "mean(layernorm(x, ones, zeros)) ≈ 0.0 (within 1e-5)"
status: "PASS"
FALSIFY-NORM-003:
name: "LayerNorm unit variance"
assertion: "var(layernorm(x, ones, zeros)) ≈ 1.0 (within 1e-4)"
status: "PASS"
FALSIFY-NORM-004:
name: "LayerNorm shift invariance"
assertion: "layernorm(x + 100, γ, β) ≈ layernorm(x, γ, β) (within 1e-4)"
status: "PASS"
FALSIFY-NORM-005:
name: "RMSNorm reduces to LayerNorm for zero-mean inputs"
assertion: "If mean(x) = 0, rmsnorm ≈ layernorm (up to bias term)"
status: "PASS"
FALSIFY-NORM-006:
name: "Numerical stability with large inputs"
assertion: "rmsnorm([1e6, 1e6, ...]) produces finite values (no overflow)"
status: "PASS"
qa_gate:
id: "F-NORM-001"
name: "Normalization Kernel Contract"
checks:
- "Multi-pass auto-vectorizable pattern used in all norm implementations"
- "No Tensor::new(&data, ...) in norm functions"
- "Benchmark ratio >= min_ratio for all measured bounds"
- "All FALSIFY tests pass"
pass_criteria: "All checks pass"