sensorlm-rs 0.1.0

//! Training loop using the Burn `Learner` abstraction.
//!
//! # Workflow
//!
//! 1. Build synthetic (or real) [`DataLoader`]s.
//! 2. Initialise [`SensorLMModel`] and [`AdamConfig`] optimiser.
//! 3. Construct the [`RsqrtScheduler`] learning-rate schedule.
//! 4. Call `LearnerBuilder::build(model, optim, scheduler).fit(train, valid)`.

use burn::{
    data::dataloader::DataLoaderBuilder,
    module::Module,
    optim::AdamConfig,
    record::{CompactRecorder, FullPrecisionSettings, BinFileRecorder},
    tensor::backend::AutodiffBackend,
    train::{
        metric::LossMetric,
        renderer::{MetricState, MetricsRenderer, TrainingProgress},
        LearnerBuilder,
    },
};
use indicatif::{MultiProgress, ProgressBar, ProgressStyle};
use std::path::Path;
use std::time::Instant;

use crate::config::{SensorLMConfig, TrainingConfig};
use crate::data::dataset::SyntheticSensorDataset;
use crate::model::sensorlm::{SensorLMBatcher, SensorLMModel};
use crate::training::scheduler::RsqrtScheduler;
use crate::error::Result;

// ===========================================================================
// Custom MetricsRenderer — drives indicatif progress bars
// ===========================================================================
//
// Burn's default `CliMetricsRenderer` (used when the `tui` feature is off)
// contains a literal `dbg!(item)` call that dumps structs to stderr on every
// step.  Replacing it with our own renderer stops that noise and gives us
// proper per-step progress bars ticked at the right time: AFTER each GPU
// forward+backward pass completes, not during dataset prefetch.

/// Renders training progress as two `indicatif` progress bars (train + valid).
///
/// The most recent numeric metric value reported for each phase is cached in
/// `train_loss` / `valid_loss` and shown in the corresponding bar's message
/// the next time that bar is rendered.
struct SensorLMRenderer {
    _multi:     MultiProgress,   // kept alive so bars stay grouped in the terminal
    // Progress bar advanced by `render_train`.
    train_bar:  ProgressBar,
    // Progress bar advanced by `render_valid`.
    valid_bar:  ProgressBar,
    // Last numeric value received via `update_train`, if any.
    train_loss: Option<f64>,
    // Last numeric value received via `update_valid`, if any.
    valid_loss: Option<f64>,
    // Time of construction or of the previous `render_train` call; used to
    // report seconds per training step.
    step_start: Instant,
}

impl SensorLMRenderer {
    /// Build a renderer with a "Train" bar of `train_steps` ticks and a
    /// "Valid" bar of `valid_steps` ticks, both attached to one shared
    /// [`MultiProgress`] and using the same template.
    ///
    /// # Panics
    ///
    /// Panics if the hard-coded progress template fails to parse.
    fn new(train_steps: usize, valid_steps: usize) -> Self {
        let group = MultiProgress::new();

        let bar_style = ProgressStyle::with_template(
            "{prefix:.bold.cyan} [{bar:45.green/dim}] \
             {pos:>3}/{len} \
             {elapsed_precise} eta {eta_precise}  \
             {msg}",
        )
        .unwrap()
        .progress_chars("█▉▊▋▌▍▎▏ ");

        // Both bars are configured identically apart from length and prefix.
        let make_bar = |steps: usize, label: &'static str| {
            let bar = group.add(ProgressBar::new(steps as u64));
            bar.set_style(bar_style.clone());
            bar.set_prefix(label);
            bar
        };

        // Creation order fixes the on-screen order: train above valid.
        let train_bar = make_bar(train_steps, "Train");
        let valid_bar = make_bar(valid_steps, "Valid");

        Self {
            _multi: group,
            train_bar,
            valid_bar,
            train_loss: None,
            valid_loss: None,
            step_start: Instant::now(),
        }
    }
}

impl MetricsRenderer for SensorLMRenderer {
    /// Called by Burn whenever a metric value is updated during training.
    ///
    /// Only numeric metric states are recorded; the latest value is shown as
    /// the training loss on the next [`render_train`](Self::render_train).
    fn update_train(&mut self, state: MetricState) {
        if let MetricState::Numeric(_entry, val) = state {
            self.train_loss = Some(val);
        }
    }

    /// Called by Burn whenever a metric value is updated during validation.
    ///
    /// Only numeric metric states are recorded; the latest value is shown as
    /// the validation loss on the next [`render_valid`](Self::render_valid).
    fn update_valid(&mut self, state: MetricState) {
        if let MetricState::Numeric(_entry, val) = state {
            self.valid_loss = Some(val);
        }
    }

    /// Called once per completed training step (after GPU backward pass).
    ///
    /// Updates the train bar's length and position and sets its message to
    /// the latest loss (if any), the wall-clock time since the previous call,
    /// and the epoch counter.
    fn render_train(&mut self, item: TrainingProgress) {
        let step = item.iteration as u64;
        let processed = item.progress.items_processed as u64;
        let items_total = item.progress.items_total as u64;

        let total = if items_total == 0 || step == 0 {
            // Nothing to infer a step count from: keep the configured length.
            self.train_bar.length().unwrap_or(0)
        } else if processed >= items_total {
            // Every item has been consumed, so this step is the last one.
            // Dividing here would under-estimate the batch size when the
            // final batch is partial and leave the bar short of 100 %.
            step
        } else {
            // items_total is in samples; divide by batch to get steps
            let batch = (processed as f64 / step as f64).round() as u64;
            items_total.div_ceil(batch.max(1))
        };

        self.train_bar.set_length(total);
        self.train_bar.set_position(step);

        // Time since the previous render (or since construction for the
        // first step), then restart the stopwatch for the next step.
        let elapsed = self.step_start.elapsed().as_secs_f64();
        self.step_start = Instant::now();

        let msg = match self.train_loss {
            Some(l) => format!(
                "loss {l:.4}  ({elapsed:.1}s/step)  epoch {}/{}",
                item.epoch, item.epoch_total
            ),
            None => format!(
                "{elapsed:.1}s/step  epoch {}/{}",
                item.epoch, item.epoch_total
            ),
        };
        self.train_bar.set_message(msg);
    }

    /// Called once per completed validation step.
    ///
    /// The bar length set at construction is kept; the position is clamped
    /// to it so the bar never overshoots.
    fn render_valid(&mut self, item: TrainingProgress) {
        let step  = item.iteration;
        let total = self.valid_bar.length().unwrap_or(0);
        self.valid_bar.set_position(step.min(total as usize) as u64);

        let msg = match self.valid_loss {
            Some(l) => format!("loss {l:.4}"),
            None    => String::new(),
        };
        self.valid_bar.set_message(msg);
    }
}

/// Memory breakdown for the sensor-encoder attention.
///
/// All figures are estimates assuming 4-byte (`f32`) elements and cover only
/// the attention score / softmax-weight tensors.
struct AttnMemEstimate {
    /// Bytes for a single attention-score dispatch:
    /// `B × H × min(chunk, N) × N × 4`.
    /// Governs per-GPU-kernel compute time; very large values risk OS TDR.
    per_dispatch_bytes: u64,
    /// Estimated bytes of attention tensors (scores + softmax weights) kept
    /// for **one transformer layer**: `2 × B × H × N × N × 4`.
    ///
    /// This is independent of the chunk size: the chunks together cover
    /// exactly `N` query rows.
    per_layer_bwd_bytes: u64,
    /// `depth × per_layer_bwd_bytes`: every layer's attention tensors alive
    /// at once.  This is the figure the pre-flight guard in `train` compares
    /// against its budget.
    all_layers_bwd_bytes: u64,
}

/// Compute attention-memory estimates for the sensor encoder.
///
/// # Parameters
///
/// * `batch_size`  – samples per batch (`B`).
/// * `depth`       – number of transformer layers.
/// * `num_heads`   – attention heads per layer (`H`).
/// * `num_patches` – sequence length in patches (`N`).
/// * `chunk_size`  – query-chunk length; `0` means full (unchunked)
///   attention.  Values larger than `num_patches` are treated as
///   `num_patches`, because a chunk can never hold more than `N` query rows.
///
/// # Formula correctness note
///
/// The per-layer estimate counts **all query positions × all key positions**,
/// regardless of chunk size.  Queries are sliced into windows
/// `start..end.min(seq)`, so the last chunk is shorter than `chunk_size`
/// whenever `chunk_size` does not divide `N`.  The total number of query
/// positions is exactly `N`, giving:
///
/// ```text
/// per_layer = 2 (scores + attn) × B × H × N × N × 4 bytes
/// ```
///
/// The naive formula `2 × num_chunks × chunk × N` overcounts by
/// `ceil(N/chunk) × chunk / N ≥ 1`, which can be 20 % or more for large
/// chunks.  We use the exact `N²` formula instead.
///
/// All products saturate at `u64::MAX` instead of overflowing.
fn estimate_attn_memory(
    batch_size: usize,
    depth: usize,
    num_heads: usize,
    num_patches: usize,
    chunk_size: usize,
) -> AttnMemEstimate {
    const F32_BYTES: u64 = 4;

    let batch = batch_size as u64;
    let heads = num_heads as u64;
    let patches = num_patches as u64;

    // A chunk of 0 means "no chunking"; a chunk longer than the sequence is
    // effectively the same thing, so never count more than N query rows.
    let effective_chunk = if chunk_size == 0 {
        patches
    } else {
        (chunk_size as u64).min(patches)
    };

    // Per-dispatch: the (B, H, chunk, N) tensor size.
    // The last chunk may be shorter — effective_chunk is the upper bound.
    let per_dispatch_bytes = batch
        .saturating_mul(heads)
        .saturating_mul(effective_chunk)
        .saturating_mul(patches)
        .saturating_mul(F32_BYTES);

    // Per-layer: 2 tensors × B × H × N × N × 4 bytes.
    // This equals full-attention memory and is INDEPENDENT of chunk_size —
    // the sum of all chunk rows is exactly N (last chunk fills the remainder).
    let per_layer_bwd_bytes = 2u64
        .saturating_mul(batch)
        .saturating_mul(heads)
        .saturating_mul(patches)
        .saturating_mul(patches)
        .saturating_mul(F32_BYTES);

    AttnMemEstimate {
        per_dispatch_bytes,
        per_layer_bwd_bytes,
        all_layers_bwd_bytes: (depth as u64).saturating_mul(per_layer_bwd_bytes),
    }
}

/// Default guard limit for the **all-layers** attention-tensor peak when
/// `--vram-gb` is not specified.  Calibrated for 16 GB GPUs (the lowest
/// common denominator for serious training work).
///
/// The value is compared against an estimate expressed in GiB (bytes divided
/// by 1024³).
///
/// The all-layers figure is the correct metric because Burn's forward pass
/// builds autodiff tape for every transformer layer before `backward()` runs.
/// At the forward→backward boundary all `depth` layers' chunk tensors are
/// simultaneously in memory.
const ALL_LAYERS_LIMIT_GB: f64 = 11.0; // safe for 16 GB; use --vram-gb for more

/// Fraction of GPU VRAM to allocate to attention tensors when `--vram-gb` is
/// given.  The remaining 30 % covers: model weights + gradients + Adam states
/// (~16 bytes/param) and non-attention activations (Q/K/V projections, MLP,
/// layer-norm intermediates).
const ATTN_VRAM_FRACTION: f64 = 0.70;

/// Hard upper bound for a single GPU dispatch (attention score tensor).
/// Dispatches larger than this risk hitting the OS GPU watchdog (TDR) timeout.
/// 512 MB is generous for Apple Silicon Metal (actual TDR threshold is ~5–15 s
/// of GPU work, corresponding to several GB of data); 200 MB is safe on all
/// platforms.
///
/// `optimal_chunk_size` uses this bound to pick the attention chunk length.
const DISPATCH_LIMIT_BYTES: u64 = 512 * 1024 * 1024; // 512 MB (MiB)

/// Soft warning threshold for per-dispatch tensor size.
///
/// 0.5 GB is appropriate for Apple Silicon (Metal has a much more lenient
/// GPU watchdog than Windows discrete GPUs).  Discrete GPU users on Linux
/// or Windows should set `RUST_LOG=warn` and watch for TDR events if they
/// see long step times.
///
/// Note: 0.5 GiB is the same size as [`DISPATCH_LIMIT_BYTES`], and the
/// warning in `train` fires only when the dispatch is strictly larger, so a
/// chunk size that respects the hard limit never triggers it.
const PER_DISPATCH_WARN_GB: f64 = 0.5;

/// Pick the `attn_chunk_size` to use for a given batch configuration.
///
/// The goal is the largest chunk (fewest GPU command-buffer submissions)
/// whose attention-score dispatch, `B × H × chunk × N × 4` bytes, stays
/// within [`DISPATCH_LIMIT_BYTES`].
///
/// # Returns
///
/// * `0` when full attention fits within the limit (1 submission per layer
///   instead of `ceil(N / chunk)`), or when any dimension is zero.
/// * Otherwise the largest multiple of 64 that fits, but never less than 16.
///   The floor of 16 is applied even if 16 rows exceed the dispatch limit.
fn optimal_chunk_size(batch_size: usize, num_heads: usize, num_patches: usize) -> usize {
    // Bytes contributed by ONE query row of a chunk: B × H × N × sizeof(f32).
    let bytes_per_row = [batch_size, num_heads, num_patches]
        .iter()
        .fold(4u64, |acc, &dim| acc.saturating_mul(dim as u64));

    if bytes_per_row == 0 {
        return 0;
    }

    // chunk ≤ LIMIT / (B × H × N × 4)
    let rows_that_fit = DISPATCH_LIMIT_BYTES / bytes_per_row;

    if rows_that_fit < num_patches as u64 {
        // Align down to a multiple of 64, then enforce the minimum of 16.
        let rows = rows_that_fit as usize;
        let aligned = rows - rows % 64;
        if aligned < 16 { 16 } else { aligned }
    } else {
        // The whole sequence fits in one dispatch — no chunking needed.
        0
    }
}

/// Compute the largest batch size whose **all-layers** attention tape fits
/// within `limit_gb`.
///
/// Uses the exact `N²` formula (independent of chunk_size):
/// `all_layers = depth × 2 × B × H × N × N × 4`
/// → `B_max = limit_bytes / (depth × 2 × H × N² × 4)`
///
/// # Parameters
///
/// * `depth`       – number of transformer layers.
/// * `num_heads`   – attention heads per layer (`H`).
/// * `num_patches` – sequence length in patches (`N`).
/// * `limit_gb`    – attention budget in GiB (multiplied by 2³⁰ to get bytes).
///
/// # Returns
///
/// * `usize::MAX` when the per-sample cost is zero (any dimension is `0`).
/// * Otherwise `B_max`, floored at `1` — so the result may still exceed the
///   budget when even a single sample does not fit; callers must re-check.
///
/// The per-sample product saturates at `u64::MAX` rather than overflowing.
fn max_safe_batch(depth: usize, num_heads: usize, num_patches: usize, limit_gb: f64) -> usize {
    const BYTES_PER_GIB: f64 = (1u64 << 30) as f64;

    // Float → int casts saturate, so negative or NaN budgets become 0 bytes.
    let limit_bytes = (limit_gb * BYTES_PER_GIB) as u64;

    let patches = num_patches as u64;
    let per_sample = (depth as u64)
        .saturating_mul(2)
        .saturating_mul(num_heads as u64)
        .saturating_mul(patches)
        .saturating_mul(patches)
        .saturating_mul(4);
    if per_sample == 0 {
        return usize::MAX;
    }

    let batches = (limit_bytes / per_sample).max(1);
    // Clamp instead of truncating on targets where usize is narrower than u64.
    usize::try_from(batches).unwrap_or(usize::MAX)
}

/// Train a SensorLM model.
///
/// Replace [`SyntheticSensorDataset`] with a real [`CsvSensorDataset`] in
/// production.
pub fn train<B: AutodiffBackend>(
    mut model_cfg: SensorLMConfig,
    mut train_cfg: TrainingConfig,
) -> Result<()>
where
    B::Device: Clone + Default + Send + Sync + std::fmt::Debug + 'static,
    B::InnerBackend: burn::tensor::backend::Backend<Device = B::Device>,
{
    // -----------------------------------------------------------------------
    // Pre-flight memory guard
    //
    // KEY FACT: Burn's forward pass builds autodiff tape for EVERY transformer
    // layer before loss.backward() executes.  At the forward→backward boundary
    // ALL `depth` layers' chunk tensors (attention scores + softmax weights)
    // are simultaneously in GPU memory.  The correct metric is therefore
    // `all_layers_bwd`, NOT `per_layer_bwd`.
    //
    // Memory layout at peak (end of forward pass):
    //   all_layers_attn  = depth × 2 × ceil(N/chunk) × B × H × chunk × N × 4
    //   static           ≈ params × 16 bytes  (weights + grad + Adam m1/m2)
    //   other_activations ≈ depth × B × N × d × 12 bytes  (Q/K/V + MLP; small)
    //
    // We budget ATTN_VRAM_FRACTION (70%) of VRAM for attention, leaving 30%
    // for static + activations.  Without --vram-gb we use ALL_LAYERS_LIMIT_GB
    // (calibrated for 16 GB GPUs).
    // -----------------------------------------------------------------------
    let num_patches = model_cfg.sensor_encoder.num_patches();

    // ---- derive attention budget from VRAM --------------------------------
    let attn_limit_gb: f64 = match train_cfg.vram_gb {
        Some(vram) => {
            let limit = vram * ATTN_VRAM_FRACTION;
            eprintln!(
                "[sensorlm] VRAM budget: {vram:.0} GB \
                 → attention limit: {limit:.2} GB (= VRAM × {ATTN_VRAM_FRACTION})"
            );
            limit
        }
        None => ALL_LAYERS_LIMIT_GB,
    };

    // ---- auto-cap batch_size when --vram-gb was given --------------------
    if train_cfg.vram_gb.is_some() {
        let safe = max_safe_batch(
            model_cfg.sensor_encoder.depth,
            model_cfg.sensor_encoder.num_heads,
            num_patches,
            attn_limit_gb,
        );
        if train_cfg.batch_size > safe {
            eprintln!(
                "[sensorlm] Auto-reducing batch_size {} → {safe} \
                 (largest that fits in {attn_limit_gb:.2} GB attention budget).",
                train_cfg.batch_size,
            );
            train_cfg.batch_size = safe;
        } else {
            eprintln!(
                "[sensorlm] batch_size={} fits  (max safe for this VRAM: {safe}).",
                train_cfg.batch_size,
            );
        }
    }

    // ---- auto-tune chunk_size to minimise GPU command-buffer submissions --
    //
    // With B=2 and chunk=64 the chunked attention produces
    // ceil(2448/64)=39 chunks × 3 ops × 12 layers = 1 404 GPU submissions
    // per forward pass.  WGPU submits each as a separate Metal command buffer;
    // with a tiny dispatch (14 MB) the GPU idles between submissions causing
    // "Device::maintain: waiting for submission index N" spam and very slow
    // throughput.
    //
    // After the batch is fixed we pick the LARGEST chunk that keeps each
    // dispatch ≤ 512 MB (safe on Metal).  At B=2 this is full attention
    // (0 = no chunking), reducing submissions from 1 404 → 36 per forward.
    {
        let new_chunk = optimal_chunk_size(
            train_cfg.batch_size,
            model_cfg.sensor_encoder.num_heads,
            num_patches,
        );
        let old_chunk = model_cfg.sensor_encoder.attn_chunk_size;
        if new_chunk != old_chunk {
            let old_subs = if old_chunk == 0 { 1 } else { num_patches.div_ceil(old_chunk) };
            let new_subs = if new_chunk == 0 { 1 } else { num_patches.div_ceil(new_chunk) };
            eprintln!(
                "[sensorlm] Auto-tuning attn_chunk_size {old_chunk} → {new_chunk} \
                 ({old_subs} → {new_subs} GPU submissions/layer, \
                 dispatch ≤ {} MB).",
                DISPATCH_LIMIT_BYTES / (1024 * 1024),
            );
            model_cfg.sensor_encoder.attn_chunk_size = new_chunk;
        }
    }
    // Re-borrow enc after mutating model_cfg.
    let enc = &model_cfg.sensor_encoder;

    // ---- compute estimates for the (possibly adjusted) batch_size --------
    let mem = estimate_attn_memory(
        train_cfg.batch_size,
        enc.depth,
        enc.num_heads,
        num_patches,
        enc.attn_chunk_size,
    );
    let gb = |b: u64| b as f64 / (1024.0_f64.powi(3));
    let dispatch_gb   = gb(mem.per_dispatch_bytes);
    let per_layer_gb  = gb(mem.per_layer_bwd_bytes);
    let all_layers_gb = gb(mem.all_layers_bwd_bytes);

    eprintln!(
        "[sensorlm] Sensor encoder: N={num_patches} patches, \
         depth={}, heads={}, chunk_size={}, batch={}",
        enc.depth, enc.num_heads, enc.attn_chunk_size, train_cfg.batch_size,
    );
    eprintln!("[sensorlm] Attention VRAM (score/weight tensors only; add ~1–2 GB for weights+Adam+activations):");
    eprintln!("[sensorlm]   per GPU dispatch : {dispatch_gb:.3} GB  (TDR risk if > {PER_DISPATCH_WARN_GB} GB)");
    eprintln!("[sensorlm]   per layer tape   : {per_layer_gb:.2} GB  × {} layers", enc.depth);
    eprintln!("[sensorlm]   ALL layers peak  : {all_layers_gb:.2} GB  ← actual training peak  (limit: {attn_limit_gb:.2} GB)");

    // ---- soft TDR warning ------------------------------------------------
    if dispatch_gb > PER_DISPATCH_WARN_GB {
        eprintln!(
            "[sensorlm] ⚠  Per-dispatch ({dispatch_gb:.2} GB) > {PER_DISPATCH_WARN_GB} GB — \
             GPU watchdog (TDR) risk. Reduce attn_chunk_size (current: {}).",
            enc.attn_chunk_size,
        );
    }

    // ---- hard guard on ALL-layers peak -----------------------------------
    if all_layers_gb > attn_limit_gb {
        let safe_batch = max_safe_batch(
            enc.depth,
            enc.num_heads,
            num_patches,
            attn_limit_gb,
        );
        let safe_chunk = (enc.attn_chunk_size / 2).max(16);
        let vram_hint = if train_cfg.vram_gb.is_none() {
            "Specify your GPU memory with --vram-gb <GB> to auto-select the \
             right batch size, or pass --no-vram-check to skip this guard."
                .to_string()
        } else {
            format!("Pass --no-vram-check to proceed despite the estimate, or lower --batch-size to {safe_batch}.")
        };

        let msg = format!(
            "All-layers attention peak ({all_layers_gb:.2} GB) exceeds \
             the budget ({attn_limit_gb:.2} GB).\n\
             \n\
             WHY: Burn builds autodiff tape for all {depth} transformer layers \
             during the forward pass.  At the forward→backward boundary all \
             {depth} layers' chunk tensors are simultaneously in GPU memory — \
             the peak is depth × per-layer, not just per-layer.\n\
             \n\
             Largest safe batch for this model + VRAM: {safe_batch}\n\
             \n\
             Options:\n\
             • --vram-gb <GB>       tell the tool your GPU — batch auto-selected\n\
             • --batch-size {safe_batch:<4}      largest batch that fits\n\
             • --model-size tiny    ~11 M params, much lower attention memory\n\
             • --model-size small   ~44 M params, moderate memory\n\
             • attn_chunk_size {safe_chunk}  halving chunk halves per-layer tape\n\
             • --no-vram-check      bypass guard (crashes are your responsibility)\n\
             \n\
             {vram_hint}",
            depth = enc.depth,
        );

        if train_cfg.skip_vram_check {
            eprintln!("[sensorlm] ⚠  Guard exceeded but --no-vram-check set:\n{msg}");
            eprintln!("[sensorlm] ⚠  Proceeding — monitor GPU memory carefully.");
        } else {
            return Err(crate::error::SensorLMError::Other(anyhow::anyhow!("{msg}")));
        }
    }

    let device = B::Device::default();
    let max_seq_len = train_cfg.caption_key.max_tokens();

    // -----------------------------------------------------------------------
    // Datasets (synthetic – replace with CsvSensorDataset for real data)
    // -----------------------------------------------------------------------
    let train_samples = train_cfg.batch_size * 20;
    let valid_samples = train_cfg.batch_size * 4;

    let train_dataset = SyntheticSensorDataset::new(train_samples, train_cfg.seed, max_seq_len);
    let valid_dataset = SyntheticSensorDataset::new(valid_samples, train_cfg.seed + 1, max_seq_len);

    // -----------------------------------------------------------------------
    // Step counts
    // -----------------------------------------------------------------------
    let num_workers = train_cfg.num_workers.max(1);
    let train_steps = train_samples / train_cfg.batch_size;
    let valid_steps = valid_samples / train_cfg.batch_size;

    eprintln!(
        "[sensorlm] Training plan: {train_steps} train steps + \
         {valid_steps} validation steps per epoch  \
         (dataset: {train_samples} train / {valid_samples} valid samples)"
    );

    // -----------------------------------------------------------------------
    // Batchers
    // -----------------------------------------------------------------------
    let batcher_train = SensorLMBatcher::<B>::new(
        device.clone(),
        model_cfg.sensor_encoder.time_steps,
        model_cfg.sensor_encoder.num_channels,
        max_seq_len,
    );
    let batcher_valid = SensorLMBatcher::<B::InnerBackend>::new(
        device.clone(),
        model_cfg.sensor_encoder.time_steps,
        model_cfg.sensor_encoder.num_channels,
        max_seq_len,
    );

    // Burn's PartialDataset::split divides by num_workers — 0 would panic.
    let train_loader = DataLoaderBuilder::new(batcher_train)
        .batch_size(train_cfg.batch_size)
        .shuffle(train_cfg.seed)
        .num_workers(num_workers)
        .build(train_dataset);

    let valid_loader = DataLoaderBuilder::new(batcher_valid)
        .batch_size(train_cfg.batch_size)
        .num_workers(num_workers)
        .build(valid_dataset);

    // -----------------------------------------------------------------------
    // Model and optimiser
    // -----------------------------------------------------------------------
    let model = SensorLMModel::<B>::new(&model_cfg, &device);

    let optimizer = AdamConfig::new()
        .with_beta_1(train_cfg.beta1 as f32)
        .with_beta_2(train_cfg.beta2 as f32)
        .with_epsilon(train_cfg.epsilon as f32)
        .with_weight_decay(Some(burn::optim::decay::WeightDecayConfig::new(
            train_cfg.weight_decay, // f64 penalty
        )))
        .init();

    // -----------------------------------------------------------------------
    // LR scheduler (rsqrt with warm-up and cool-down)
    // -----------------------------------------------------------------------
    let lr_scheduler = RsqrtScheduler::new(
        train_cfg.lr,
        train_cfg.total_steps,
        train_cfg.warmup_fraction,
        train_cfg.cooldown_fraction,
    );

    // -----------------------------------------------------------------------
    // Learner
    // -----------------------------------------------------------------------
    std::fs::create_dir_all(&train_cfg.artifact_dir)?;

    // SensorLMRenderer replaces Burn's default CliMetricsRenderer which
    // contains `dbg!(item)` calls that dump raw structs on every step.
    let renderer = SensorLMRenderer::new(train_steps, valid_steps);

    let builder = LearnerBuilder::new(&train_cfg.artifact_dir)
        .metric_train_numeric(LossMetric::<B>::new())
        .metric_valid_numeric(LossMetric::<B::InnerBackend>::new())
        .with_file_checkpointer(CompactRecorder::new())
        .renderer(renderer)
        .devices(vec![device])
        .num_epochs(1);

    let builder = if train_cfg.show_summary { builder.summary() } else { builder };

    let learner = builder.build(model, optimizer, lr_scheduler);

    let _trained_model = learner.fit(train_loader, valid_loader);

    eprintln!(
        "\n[sensorlm] Training complete — \
         {train_steps} train + {valid_steps} valid steps."
    );
    Ok(())
}

/// Save a trained model to disk using full-precision binary format.
pub fn save_model<B: AutodiffBackend>(
    model: SensorLMModel<B>,
    path: &Path,
) -> Result<()> {
    let recorder = BinFileRecorder::<FullPrecisionSettings>::new();
    model
        .save_file(path, &recorder)
        .map_err(|e| crate::error::SensorLMError::Other(anyhow::anyhow!("{e}")))?;
    Ok(())
}

/// Load a model from a checkpoint saved with [`save_model`].
pub fn load_model<B: AutodiffBackend>(
    cfg: &SensorLMConfig,
    path: &Path,
    device: &B::Device,
) -> Result<SensorLMModel<B>> {
    let recorder = BinFileRecorder::<FullPrecisionSettings>::new();
    let model = SensorLMModel::<B>::new(cfg, device)
        .load_file(path, &recorder, device)
        .map_err(|e| crate::error::SensorLMError::Other(anyhow::anyhow!("{e}")))?;
    Ok(model)
}