# entrenar 0.2.0

# Training & Optimization library with autograd, LoRA, quantization, and model merging
# Documentation
# Entrenar Roadmap
# Total: 824 hours (103 days @ 8h/day)
# Tickets: ENT-001 through ENT-040
# Vision Sync: docs/specifications/paiml-sai-vision-sync.md

# Stack Integration Points:
#   - trueno 0.7.3: SIMD/GPU compute (ENT-015)
#   - aprender 0.9.1: .apr format output (ENT-019, ENT-024)
#   - realizar: GGUF export (ENT-024)
#   - alimentar: .ald dataset input (ENT-033)

# Project-wide identity and totals: 40 tickets budgeted at 824 estimated hours.
project:
  name: 'entrenar'
  description: 'Training & Optimization Library'
  total_hours: 824
  total_tickets: 40

# Track actual progress vs estimates.
# Every phase and all 40 tickets in this file are marked `status: complete`,
# so these counters describe the finished roadmap (they were previously left
# at their initial values of 0 / 0 / 1).
# NOTE(review): `completed_hours` is assumed to be measured in estimated hours,
# the same unit as `project.total_hours` — confirm. Real effort is recorded per
# ticket in `actual_hours`.
progress:
  completed_hours: 824
  completed_tickets: 40
  # Last phase of the roadmap (Phase 7: Distillation).
  current_phase: 7

# Ordered list of roadmap phases. Each entry carries the phase's hour budget
# (`hours`), its overall `status`, and the tickets that make up the phase.
phases:
  # Autograd: tape context, backward ops, and gradient-validation tooling.
  # NOTE(review): the tickets below sum to 192 estimated hours while `hours`
  # is 200 — confirm which figure is intended.
  - name: 'Phase 1: Autograd Engine'
    hours: 200
    status: complete
    tickets:
      - id: ENT-001
        name: 'Tape-based context + lifetime tracking'
        estimated_hours: 32
        actual_hours: 4
        status: complete
        notes: 'Implemented Context and BackwardOp trait'

      - id: ENT-002
        name: 'Matmul backward (gradient check: 200K iters)'
        estimated_hours: 16
        actual_hours: 2
        status: complete
        notes: >-
          CRITICAL: Required for neural network layers. Implemented with
          property tests (1000+ cases), gradient validation via finite
          difference.

      - id: ENT-003
        name: 'Softmax backward + property tests'
        estimated_hours: 24
        actual_hours: 3
        status: complete
        notes: 'Implemented with gradient validation'

      - id: ENT-004
        name: 'Layer norm backward (mean/var gradients)'
        estimated_hours: 32
        actual_hours: 3
        status: complete
        notes: >-
          CRITICAL: Normalization for transformer architectures. Implemented
          with proper gradient computation through mean/variance dependencies.
          Property tests (1000+ cases) for x, gamma, and beta gradients with
          finite difference validation.

      - id: ENT-005
        name: 'Attention backward (Q,K,V chain rule)'
        estimated_hours: 40
        actual_hours: 4
        status: complete
        notes: >-
          CRITICAL: Core operation for transformer architectures. Implemented
          scaled dot-product attention with Q @ K^T / sqrt(d_k), row-wise
          softmax, and V multiplication. Property tests (1000+ cases) for Q, K,
          V gradients with finite difference validation.

      - id: ENT-006
        name: 'ReLU/GELU/Swish backward (8h each)'
        estimated_hours: 24
        actual_hours: 3
        status: complete
        notes: >-
          CRITICAL: Activation functions for neural networks. Implemented ReLU
          (1h), GELU, and Swish (2h) with property tests (1000+ cases), gradient
          validation via finite difference.

      - id: ENT-007
        name: 'Finite difference validation framework'
        estimated_hours: 16
        actual_hours: 2
        status: complete
        notes: 'Implemented with property tests'

      - id: ENT-008
        name: 'Mutation testing on backward ops (>80% kill)'
        estimated_hours: 8
        actual_hours: 1
        status: complete
        notes: >-
          QUALITY: Validated test suite quality. 93.4% kill rate (312/334
          mutants caught) using cargo-mutants. Exceeds 80% requirement. Survived
          mutants primarily in numerical precision areas within tolerance
          thresholds. Documented in docs/mutation-testing-ent-008.md.

  # Optimizers: SGD/Adam/AdamW, LR scheduling, gradient clipping, SIMD updates.
  # Ticket estimates sum to the 120-hour phase budget.
  - name: 'Phase 2: Optimizers'
    hours: 120
    status: complete
    tickets:
      - id: ENT-009
        name: 'SGD + momentum'
        estimated_hours: 16
        actual_hours: 1
        status: complete

      - id: ENT-010
        name: 'Adam (m/v state tracking)'
        estimated_hours: 24
        actual_hours: 2
        status: complete

      - id: ENT-011
        name: 'AdamW (decoupled weight decay)'
        estimated_hours: 16
        actual_hours: 1
        status: complete
        notes: >-
          CRITICAL: Modern optimizer for transformer training. Implemented with
          decoupled weight decay (applied directly to parameters, not
          gradients). Tests validate convergence, weight decay behavior, and
          difference from Adam with L2.

      - id: ENT-012
        name: 'Cosine LR scheduler'
        estimated_hours: 8
        actual_hours: 1
        status: complete
        notes: >-
          Learning rate scheduling for training. Implemented cosine annealing
          (smooth decay following cosine curve). Integrates with all optimizers
          via LRScheduler trait. Tests validate monotonic decrease and correct
          lr at key points.

      - id: ENT-013
        name: 'Gradient clipping (global norm)'
        estimated_hours: 8
        actual_hours: 1
        status: complete
        notes: >-
          CRITICAL: Prevents exploding gradients in deep networks. Implemented
          global norm clipping (scales all gradients if norm exceeds
          threshold). Preserves relative magnitudes. Essential for
          RNN/transformer training.

      - id: ENT-014
        name: 'Optimizer convergence property tests'
        estimated_hours: 32
        actual_hours: 2
        status: complete
        notes: >-
          Property tests for SGD, Adam, AdamW with Rosenbrock, ill-conditioned,
          high-dim, numerical stability scenarios. 29 tests, 1000 proptest
          cases each.

      - id: ENT-015
        name: 'SIMD-accelerated param updates via Trueno'
        estimated_hours: 16
        actual_hours: 1
        status: complete
        notes: >-
          SIMD operations via Trueno for AXPY, Adam, AdamW. 18 tests including
          property tests (500 cases) for numerical equivalence.

  # LoRA/QLoRA adapters: layers, target selection, persistence, benchmarks.
  # Ticket estimates sum to the 144-hour phase budget.
  - name: 'Phase 3: LoRA'
    hours: 144
    status: complete
    tickets:
      - id: ENT-016
        name: 'LoRA layer (A, B matrices + merge)'
        estimated_hours: 32
        actual_hours: 2
        status: complete
        notes: >-
          LoRA layer with A,B matrices, merge/unmerge, scaling. 53 tests
          including 5 property tests (200 cases each) for mathematical
          correctness.

      - id: ENT-017
        name: 'QLoRA (4-bit base + dequant-on-fly)'
        estimated_hours: 40
        actual_hours: 2
        status: complete
        notes: >-
          4-bit quantized base weights with on-the-fly dequantization. 11 tests
          including 5 property tests (200 cases each) for quantization
          correctness.

      - id: ENT-018
        name: 'Target module selection (q/k/v/o_proj)'
        estimated_hours: 16
        actual_hours: 1
        status: complete
        notes: >-
          LoRAConfig with target module selection, layer filtering, all_linear
          mode. 15 tests including 5 property tests (200 cases each).

      - id: ENT-019
        name: 'Adapter save/load (separate from base)'
        estimated_hours: 24
        actual_hours: 1
        status: complete
        notes: >-
          JSON adapter serialization with round-trip preservation. 10 tests
          including 4 property tests (100 cases each).

      - id: ENT-020
        name: 'Memory benchmarks (QLoRA vs full FP16)'
        estimated_hours: 16
        actual_hours: 1
        status: complete
        notes: >-
          Memory comparison benchmarks for LoRA vs QLoRA. 11 tests including 3
          property tests (100 cases each).

      - id: ENT-021
        name: 'Gradient flow tests (frozen base + trainable adapters)'
        estimated_hours: 16
        actual_hours: 1
        status: complete
        notes: >-
          Gradient flow validation: frozen base, trainable A/B. 14 tests
          including 4 property tests (100 cases each).

  # Quantization: QAT fake-quantize, PTQ calibration, GGUF block formats,
  # granularity modes, and error/accuracy measurement.
  # Ticket estimates sum to the 136-hour phase budget.
  - name: 'Phase 4: Quantization'
    hours: 136
    status: complete
    tickets:
      - id: ENT-022
        name: 'Fake quantize (STE backward)'
        estimated_hours: 24
        actual_hours: 2
        status: complete
        notes: >-
          Fake quantization for QAT with STE backward. 17 tests including 5
          property tests (200 cases each).

      - id: ENT-023
        name: 'PTQ calibration (min-max, percentile)'
        estimated_hours: 32
        actual_hours: 2
        status: complete
        notes: >-
          PTQ calibration with min-max, percentile, and moving average methods.
          15 tests including 5 property tests (200 cases each).

      - id: ENT-024
        name: 'Q4_0/Q8_0 bit packing → GGUF'
        estimated_hours: 40
        actual_hours: 2
        status: complete
        notes: >-
          GGUF-compatible Q4_0/Q8_0 quantization formats with block-wise
          quantization. 17 tests including 6 property tests (200 cases each).

      - id: ENT-025
        name: 'Per-channel vs per-tensor quantization'
        estimated_hours: 16
        actual_hours: 1
        status: complete
        notes: >-
          Per-tensor, per-channel, and per-group quantization with
          symmetric/asymmetric modes. 16 tests including 6 property tests (200
          cases each).

      - id: ENT-026
        name: 'Quantization error property tests'
        estimated_hours: 16
        actual_hours: 1
        status: complete
        notes: >-
          Error analysis with MSE/MAE/SQNR metrics, scale sensitivity, outlier
          impact. 17 tests including 7 property tests (200 cases each).

      - id: ENT-027
        name: 'Accuracy degradation benchmarks'
        estimated_hours: 8
        actual_hours: 1
        status: complete
        notes: >-
          Benchmark suite for accuracy degradation with synthetic workloads,
          weight generators, and quality metrics. 15 tests including 5 property
          tests (100 cases each).

  # Model merging: TIES, DARE, SLERP, commutativity checks, and ensembles.
  # Ticket estimates sum to the 96-hour phase budget.
  - name: 'Phase 5: Model Merging'
    hours: 96
    status: complete
    tickets:
      - id: ENT-028
        name: 'TIES (trim + sign election + merge)'
        estimated_hours: 32
        actual_hours: 1
        status: complete
        notes: >-
          TIES merge with trim, sign election, and merge. 17 tests including 8
          property tests (200 cases each).

      - id: ENT-029
        name: 'DARE (dropout + rescale)'
        estimated_hours: 24
        actual_hours: 1
        status: complete
        notes: >-
          DARE merge with dropout and rescale. 16 tests including 8 property
          tests (200 cases each).

      - id: ENT-030
        name: 'SLERP (spherical interp for 2 models)'
        estimated_hours: 24
        actual_hours: 1
        status: complete
        notes: >-
          SLERP merge with spherical interpolation. 17 tests including 9
          property tests (200 cases each).

      - id: ENT-031
        name: 'Merge commutativity property tests'
        estimated_hours: 8
        actual_hours: 1
        status: complete
        notes: >-
          Comprehensive commutativity tests for SLERP, DARE, TIES. 25 tests
          including 13 property tests (200 cases each). Tests: commutativity,
          permutation invariance, identity, boundary conditions.

      - id: ENT-032
        name: 'Multi-model ensemble (>2 models)'
        estimated_hours: 8
        actual_hours: 1
        status: complete
        notes: >-
          Unified ensemble API with WeightedAverage, IterativeSlerp,
          Hierarchical strategies. 21 tests including 6 property tests (200
          cases each). 108 total merge tests.

  # Declarative config: YAML schema, feature-type inference, validation, CLI.
  # Ticket estimates sum to the 64-hour phase budget.
  - name: 'Phase 6: Declarative Config'
    hours: 64
    status: complete
    tickets:
      - id: ENT-033
        name: 'YAML schema + serde deserialization'
        estimated_hours: 16
        actual_hours: 1
        status: complete
        notes: >-
          Property tests for YAML round-trip serialization, validation edge
          cases, JSON interop. 27 tests (20 property @ 200 cases, 7 edge case).
          69 total config tests.

      - id: ENT-034
        name: 'Auto-feature type inference from data'
        estimated_hours: 24
        actual_hours: 1
        status: complete
        notes: >-
          FeatureType inference from ColumnStats: numeric, categorical, text,
          datetime, embedding, targets. 29 tests (12 property @ 200 cases). 98
          total config tests.

      - id: ENT-035
        name: 'Config validation (types, paths, ranges)'
        estimated_hours: 16
        actual_hours: 1
        status: complete
        notes: >-
          Extended validation with LR bounds, LoRA alpha/dropout/targets,
          seq_len, save_interval, lr_scheduler. 38 tests (17 unit + 21 property
          @ 200 cases). 142 total config tests.

      - id: ENT-036
        name: 'Single-command training entry point'
        estimated_hours: 8
        actual_hours: 1
        status: complete
        notes: >-
          Full CLI with clap: train/validate/info/quantize/merge subcommands.
          34 tests (20 unit + 14 property @ 200 cases). 176 total config tests,
          610 total tests.

  # Distillation: KD loss, multi-teacher ensembles, layer-wise progression.
  # Ticket estimates sum to the 64-hour phase budget.
  - name: 'Phase 7: Distillation'
    hours: 64
    status: complete
    tickets:
      - id: ENT-037
        name: 'KD loss (temperature-scaled softmax)'
        estimated_hours: 16
        actual_hours: 1
        status: complete
        notes: >-
          DistillationLoss with temperature scaling, KL divergence,
          cross-entropy blend. 9 tests including softmax, KL divergence
          validation.

      - id: ENT-038
        name: 'Multi-teacher ensemble distillation'
        estimated_hours: 24
        actual_hours: 1
        status: complete
        notes: >-
          EnsembleDistiller with weighted/uniform combining, probability-based
          averaging. 11 tests including edge cases.

      - id: ENT-039
        name: 'Progressive distillation (layer-wise)'
        estimated_hours: 16
        actual_hours: 1
        status: complete
        notes: >-
          ProgressiveDistiller with MSE/cosine similarity layer-wise losses,
          weighted layer combinations. 13 tests.

      - id: ENT-040
        name: 'Distillation effectiveness property tests'
        estimated_hours: 8
        actual_hours: 1
        status: complete
        notes: >-
          12 property tests @ default proptest cases covering loss
          non-negativity, temperature smoothing, alpha weights, ensemble
          averaging, MSE/cosine symmetry. 44 total distill tests.

# Summary Statistics
# Roll-up of the per-phase and per-ticket figures listed under `phases`.
summary:
  # Equals project.total_hours and the sum of the seven phase `hours` budgets.
  total_estimated_hours: 824
  # Sum of every ticket's `actual_hours`
  # (by phase: 22 + 9 + 8 + 9 + 5 + 4 + 4 = 61).
  total_actual_hours: 61
  completion_percentage: 100.0
  tickets_complete: 40
  tickets_in_progress: 0
  tickets_pending: 0