# entrenar 0.1.0
#
# Training & Optimization library with autograd, LoRA, quantization, and model merging
# Documentation
# Entrenar Roadmap
# Total: 824 hours (103 days @ 8h/day)
# Tickets: ENT-001 through ENT-040

# Project-level metadata and headline totals for the roadmap.
project:
  name: entrenar
  description: Training & Optimization Library
  # Equals the sum of the per-phase `hours` values (200+120+144+136+96+64+64).
  total_hours: 824
  # Tickets ENT-001 through ENT-040.
  total_tickets: 40

# Track actual progress vs estimates.
# These values mirror the ticket data under `phases`:
#   - 13 tickets (ENT-001 through ENT-013) are marked `status: complete`
#   - their logged `actual_hours` add up to 28
#   - Phase 1 is complete and Phase 2 is the phase marked in-progress
progress:
  # Assumes this field means actual hours logged on completed tickets
  # (the estimates for those same tickets total 264h) — TODO confirm.
  completed_hours: 28
  completed_tickets: 13
  # 1-based index into `phases`.
  current_phase: 2

phases:
  # Phase 1: autograd engine, tickets ENT-001 through ENT-008, all marked complete.
  # Logged actual_hours across these tickets total 22h.
  # NOTE(review): the ticket estimates below sum to 192h (32+16+24+32+40+24+16+8),
  # but `hours` is 200. project.total_hours (824) is built from the 200 figure —
  # confirm whether the extra 8h is an intentional buffer or a stale estimate.
  - name: "Phase 1: Autograd Engine"
    hours: 200
    status: complete
    tickets:
      - id: ENT-001
        name: "Tape-based context + lifetime tracking"
        estimated_hours: 32
        actual_hours: 4
        status: complete
        notes: "Implemented Context and BackwardOp trait"

      # NOTE(review): the name cites 200K gradient-check iterations while the notes
      # report 1000+ property-test cases — confirm which figure is current.
      - id: ENT-002
        name: "Matmul backward (gradient check: 200K iters)"
        estimated_hours: 16
        actual_hours: 2
        status: complete
        notes: "CRITICAL: Required for neural network layers. Implemented with property tests (1000+ cases), gradient validation via finite difference."

      - id: ENT-003
        name: "Softmax backward + property tests"
        estimated_hours: 24
        actual_hours: 3
        status: complete
        notes: "Implemented with gradient validation"

      - id: ENT-004
        name: "Layer norm backward (mean/var gradients)"
        estimated_hours: 32
        actual_hours: 3
        status: complete
        notes: "CRITICAL: Normalization for transformer architectures. Implemented with proper gradient computation through mean/variance dependencies. Property tests (1000+ cases) for x, gamma, and beta gradients with finite difference validation."

      - id: ENT-005
        name: "Attention backward (Q,K,V chain rule)"
        estimated_hours: 40
        actual_hours: 4
        status: complete
        notes: "CRITICAL: Core operation for transformer architectures. Implemented scaled dot-product attention with Q @ K^T / sqrt(d_k), row-wise softmax, and V multiplication. Property tests (1000+ cases) for Q, K, V gradients with finite difference validation."

      # Estimate is 3 activations x 8h each, per the ticket name.
      - id: ENT-006
        name: "ReLU/GELU/Swish backward (8h each)"
        estimated_hours: 24
        actual_hours: 3
        status: complete
        notes: "CRITICAL: Activation functions for neural networks. Implemented ReLU (1h), GELU, and Swish (2h) with property tests (1000+ cases), gradient validation via finite difference."

      - id: ENT-007
        name: "Finite difference validation framework"
        estimated_hours: 16
        actual_hours: 2
        status: complete
        notes: "Implemented with property tests"

      - id: ENT-008
        name: "Mutation testing on backward ops (>80% kill)"
        estimated_hours: 8
        actual_hours: 1
        status: complete
        notes: "QUALITY: Validated test suite quality. 93.4% kill rate (312/334 mutants caught) using cargo-mutants. Exceeds 80% requirement. Survived mutants primarily in numerical precision areas within tolerance thresholds. Documented in docs/mutation-testing-ent-008.md."

  # Phase 2: optimizers, tickets ENT-009 through ENT-015.
  # Ticket estimates sum to 120h, matching `hours`.
  # ENT-009..ENT-013 are complete (6h logged); ENT-014 and ENT-015 are pending,
  # which is why the phase status is in-progress.
  - name: "Phase 2: Optimizers"
    hours: 120
    status: in-progress
    tickets:
      - id: ENT-009
        name: "SGD + momentum"
        estimated_hours: 16
        actual_hours: 1
        status: complete

      - id: ENT-010
        name: "Adam (m/v state tracking)"
        estimated_hours: 24
        actual_hours: 2
        status: complete

      - id: ENT-011
        name: "AdamW (decoupled weight decay)"
        estimated_hours: 16
        actual_hours: 1
        status: complete
        notes: "CRITICAL: Modern optimizer for transformer training. Implemented with decoupled weight decay (applied directly to parameters, not gradients). Tests validate convergence, weight decay behavior, and difference from Adam with L2."

      - id: ENT-012
        name: "Cosine LR scheduler"
        estimated_hours: 8
        actual_hours: 1
        status: complete
        notes: "Learning rate scheduling for training. Implemented cosine annealing (smooth decay following cosine curve). Integrates with all optimizers via LRScheduler trait. Tests validate monotonic decrease and correct lr at key points."

      - id: ENT-013
        name: "Gradient clipping (global norm)"
        estimated_hours: 8
        actual_hours: 1
        status: complete
        notes: "CRITICAL: Prevents exploding gradients in deep networks. Implemented global norm clipping (scales all gradients if norm exceeds threshold). Preserves relative magnitudes. Essential for RNN/transformer training."

      - id: ENT-014
        name: "Optimizer convergence property tests"
        estimated_hours: 32
        actual_hours: 0
        status: pending

      - id: ENT-015
        name: "SIMD-accelerated param updates via Trueno"
        estimated_hours: 16
        actual_hours: 0
        status: pending

  # Phase 3: LoRA, tickets ENT-016 through ENT-021, all pending with no hours logged.
  # Ticket estimates sum to 144h, matching `hours`.
  - name: "Phase 3: LoRA"
    hours: 144
    status: pending
    tickets:
      - id: ENT-016
        name: "LoRA layer (A, B matrices + merge)"
        estimated_hours: 32
        actual_hours: 0
        status: pending

      - id: ENT-017
        name: "QLoRA (4-bit base + dequant-on-fly)"
        estimated_hours: 40
        actual_hours: 0
        status: pending

      - id: ENT-018
        name: "Target module selection (q/k/v/o_proj)"
        estimated_hours: 16
        actual_hours: 0
        status: pending

      - id: ENT-019
        name: "Adapter save/load (separate from base)"
        estimated_hours: 24
        actual_hours: 0
        status: pending

      - id: ENT-020
        name: "Memory benchmarks (QLoRA vs full FP16)"
        estimated_hours: 16
        actual_hours: 0
        status: pending

      - id: ENT-021
        name: "Gradient flow tests (frozen base + trainable adapters)"
        estimated_hours: 16
        actual_hours: 0
        status: pending

  # Phase 4: quantization, tickets ENT-022 through ENT-027, all pending with no hours logged.
  # Ticket estimates sum to 136h, matching `hours`.
  - name: "Phase 4: Quantization"
    hours: 136
    status: pending
    tickets:
      - id: ENT-022
        name: "Fake quantize (STE backward)"
        estimated_hours: 24
        actual_hours: 0
        status: pending

      - id: ENT-023
        name: "PTQ calibration (min-max, percentile)"
        estimated_hours: 32
        actual_hours: 0
        status: pending

      - id: ENT-024
        name: "Q4_0/Q8_0 bit packing → GGUF"
        estimated_hours: 40
        actual_hours: 0
        status: pending

      - id: ENT-025
        name: "Per-channel vs per-tensor quantization"
        estimated_hours: 16
        actual_hours: 0
        status: pending

      - id: ENT-026
        name: "Quantization error property tests"
        estimated_hours: 16
        actual_hours: 0
        status: pending

      - id: ENT-027
        name: "Accuracy degradation benchmarks"
        estimated_hours: 8
        actual_hours: 0
        status: pending

  # Phase 5: model merging, tickets ENT-028 through ENT-032, all pending with no hours logged.
  # Ticket estimates sum to 96h, matching `hours`.
  - name: "Phase 5: Model Merging"
    hours: 96
    status: pending
    tickets:
      - id: ENT-028
        name: "TIES (trim + sign election + merge)"
        estimated_hours: 32
        actual_hours: 0
        status: pending

      - id: ENT-029
        name: "DARE (dropout + rescale)"
        estimated_hours: 24
        actual_hours: 0
        status: pending

      - id: ENT-030
        name: "SLERP (spherical interp for 2 models)"
        estimated_hours: 24
        actual_hours: 0
        status: pending

      - id: ENT-031
        name: "Merge commutativity property tests"
        estimated_hours: 8
        actual_hours: 0
        status: pending

      - id: ENT-032
        name: "Multi-model ensemble (>2 models)"
        estimated_hours: 8
        actual_hours: 0
        status: pending

  # Phase 6: declarative config, tickets ENT-033 through ENT-036, all pending with no hours logged.
  # Ticket estimates sum to 64h, matching `hours`.
  - name: "Phase 6: Declarative Config"
    hours: 64
    status: pending
    tickets:
      - id: ENT-033
        name: "YAML schema + serde deserialization"
        estimated_hours: 16
        actual_hours: 0
        status: pending

      - id: ENT-034
        name: "Auto-feature type inference from data"
        estimated_hours: 24
        actual_hours: 0
        status: pending

      - id: ENT-035
        name: "Config validation (types, paths, ranges)"
        estimated_hours: 16
        actual_hours: 0
        status: pending

      - id: ENT-036
        name: "Single-command training entry point"
        estimated_hours: 8
        actual_hours: 0
        status: pending

  # Phase 7: distillation, tickets ENT-037 through ENT-040, all pending with no hours logged.
  # Ticket estimates sum to 64h, matching `hours`.
  - name: "Phase 7: Distillation"
    hours: 64
    status: pending
    tickets:
      - id: ENT-037
        name: "KD loss (temperature-scaled softmax)"
        estimated_hours: 16
        actual_hours: 0
        status: pending

      - id: ENT-038
        name: "Multi-teacher ensemble distillation"
        estimated_hours: 24
        actual_hours: 0
        status: pending

      - id: ENT-039
        name: "Progressive distillation (layer-wise)"
        estimated_hours: 16
        actual_hours: 0
        status: pending

      - id: ENT-040
        name: "Distillation effectiveness property tests"
        estimated_hours: 8
        actual_hours: 0
        status: pending

# Summary Statistics
# Roll-up of the ticket data under `phases`.
summary:
  # Same figure as project.total_hours.
  total_estimated_hours: 824
  # Sum of `actual_hours` over every ticket (22h in Phase 1 + 6h in Phase 2).
  total_actual_hours: 28
  # NOTE(review): 3.4 corresponds to total_actual_hours / total_estimated_hours
  # (28 / 824). Measured by tickets (13 of 40) or by the estimates of completed
  # tickets (264 of 824) progress is roughly 32% — confirm which definition
  # consumers of this field expect.
  completion_percentage: 3.4
  # ENT-001 through ENT-013.
  tickets_complete: 13
  # No individual ticket carries an in-progress status.
  tickets_in_progress: 0
  # ENT-014 through ENT-040.
  tickets_pending: 27