aprender-core 0.50.0

//! Tests for nn::optim optimizers
//! PMAT-085: Extracted from optim.rs for file health

pub(crate) use super::*;
pub(crate) use crate::autograd::clear_graph;

/// One plain SGD step on `sum(x²)` must move each element by `lr * grad`.
#[test]
fn test_sgd_basic() {
    clear_graph();

    let mut weights = Tensor::from_slice(&[1.0, 2.0, 3.0]).requires_grad();
    let weights_id = weights.id();

    // Loss is the sum of squares, so the expected gradient is 2x per element.
    let objective = weights.pow(2.0).sum();
    objective.backward();

    let grad = get_grad(weights_id).expect("Should have gradient");
    assert_eq!(grad.data(), &[2.0, 4.0, 6.0]);

    let mut opt = SGD::new(vec![&mut weights], 0.1);
    opt.step_with_params(&mut [&mut weights]);

    // [1, 2, 3] - 0.1 * [2, 4, 6] = [0.8, 1.6, 2.4]
    let want = [0.8, 1.6, 2.4];
    for (p, e) in weights.data().iter().zip(want.iter()) {
        assert!((p - e).abs() < 1e-5, "Expected {e}, got {p}");
    }
}

/// Two consecutive momentum-SGD steps on `x²`, checked against hand-computed values.
#[test]
fn test_sgd_with_momentum() {
    clear_graph();

    let mut x = Tensor::from_slice(&[1.0]).requires_grad();

    // Step 1: grad = 2 * 1.0 = 2.0
    let first_loss = x.pow(2.0).sum();
    first_loss.backward();

    let mut opt = SGD::with_momentum(vec![&mut x], 0.1, 0.9);
    opt.step_with_params(&mut [&mut x]);

    // velocity = 0.9 * 0 + 2.0 = 2.0, so x = 1.0 - 0.1 * 2.0 = 0.8
    let after_first = x.data()[0];
    assert!((after_first - 0.8).abs() < 1e-5);

    // Step 2: reset the graph and differentiate at the updated value.
    clear_graph();
    let second_loss = x.pow(2.0).sum();
    second_loss.backward();

    opt.step_with_params(&mut [&mut x]);

    // grad = 2 * 0.8 = 1.6
    // velocity = 0.9 * 2.0 + 1.6 = 3.4, so x = 0.8 - 0.1 * 3.4 = 0.46
    let after_second = x.data()[0];
    assert!((after_second - 0.46).abs() < 1e-5);
}

/// A single Adam step on `sum(x²)` must lower both (positive) parameters.
#[test]
fn test_adam_basic() {
    clear_graph();

    let mut theta = Tensor::from_slice(&[1.0, 2.0]).requires_grad();

    let objective = theta.pow(2.0).sum();
    objective.backward();

    let mut opt = Adam::new(vec![&mut theta], 0.1);
    opt.step_with_params(&mut [&mut theta]);

    // Only the direction of the update is checked, not its magnitude.
    let first = theta.data()[0];
    assert!(first < 1.0);
    let second = theta.data()[1];
    assert!(second < 2.0);
}

/// Adam must drive `x` from 5.0 to within 0.1 of the minimum of `x²` in 100 steps.
#[test]
fn test_adam_convergence() {
    clear_graph();

    let mut x = Tensor::from_slice(&[5.0]).requires_grad();
    let mut opt = Adam::new(vec![&mut x], 0.5);

    // Each iteration: fresh graph, forward x², backward, one optimizer step.
    (0..100).for_each(|_| {
        clear_graph();
        let objective = x.pow(2.0).sum();
        objective.backward();
        opt.step_with_params(&mut [&mut x]);
    });

    // The minimum of x² is at x = 0.
    let final_x = x.data()[0];
    assert!(
        final_x.abs() < 0.1,
        "Parameter should converge to 0, got {}",
        final_x
    );
}

/// AdamW with weight decay enabled must shrink the parameter by more than the
/// same optimizer with decay explicitly disabled.
///
/// The previous version only asserted `param < 10.0`, which also holds when
/// weight decay is ignored entirely, so it could not detect a broken decay term.
#[test]
fn test_adamw_weight_decay() {
    // Baseline: one AdamW step from 10.0 on x² with weight decay switched off.
    clear_graph();
    let mut baseline = Tensor::from_slice(&[10.0]).requires_grad();
    let loss = baseline.pow(2.0).sum();
    loss.backward();

    let mut plain = AdamW::new(vec![&mut baseline], 0.1).weight_decay(0.0);
    plain.step_with_params(&mut [&mut baseline]);
    let without_decay = baseline.data()[0];

    // Same starting point, loss and learning rate, but with weight decay on.
    clear_graph();
    let mut param = Tensor::from_slice(&[10.0]).requires_grad();
    let loss = param.pow(2.0).sum();
    loss.backward();

    let mut adamw = AdamW::new(vec![&mut param], 0.1).weight_decay(0.1);
    adamw.step_with_params(&mut [&mut param]);
    let with_decay = param.data()[0];

    // The step must still move downhill...
    assert!(
        with_decay < 10.0,
        "AdamW step should decrease the parameter, got {with_decay}"
    );
    // ...and the decay term must contribute on top of the gradient step.
    assert!(
        with_decay < without_decay,
        "weight decay had no effect: with decay {with_decay}, without decay {without_decay}"
    );
}

/// A single RMSprop step on `x²` must lower a positive parameter.
#[test]
fn test_rmsprop_basic() {
    clear_graph();

    let mut x = Tensor::from_slice(&[3.0]).requires_grad();

    let objective = x.pow(2.0).sum();
    objective.backward();

    let mut opt = RMSprop::new(vec![&mut x], 0.1);
    opt.step_with_params(&mut [&mut x]);

    // Only the direction of the update is checked.
    let updated = x.data()[0];
    assert!(updated < 3.0);
}

/// `SGD::zero_grad` must remove the gradient that `backward()` stored for the parameter.
#[test]
fn test_zero_grad() {
    clear_graph();

    let mut weights = Tensor::from_slice(&[1.0, 2.0]).requires_grad();
    let weights_id = weights.id();

    let objective = weights.pow(2.0).sum();
    objective.backward();

    // Precondition: backward produced a gradient for this tensor id.
    let had_grad = get_grad(weights_id).is_some();
    assert!(had_grad);

    let mut opt = SGD::new(vec![&mut weights], 0.1);
    opt.zero_grad();

    // After zero_grad the lookup must come back empty.
    let cleared = get_grad(weights_id).is_none();
    assert!(cleared);
}

/// `SGD::lr` reports the constructor value and follows `set_lr`.
#[test]
fn test_learning_rate_change() {
    let mut weights = Tensor::from_slice(&[1.0]).requires_grad();
    let mut opt = SGD::new(vec![&mut weights], 0.1);

    let initial_lr = opt.lr();
    assert!((initial_lr - 0.1).abs() < 1e-6);

    opt.set_lr(0.01);
    let changed_lr = opt.lr();
    assert!((changed_lr - 0.01).abs() < 1e-6);
}

/// One Nesterov-momentum SGD step on `x²`, checked against a hand-computed value.
#[test]
fn test_sgd_nesterov() {
    clear_graph();

    let mut x = Tensor::from_slice(&[2.0]).requires_grad();

    let objective = x.pow(2.0).sum();
    objective.backward();

    let mut opt = SGD::with_momentum(vec![&mut x], 0.1, 0.9).nesterov();
    opt.step_with_params(&mut [&mut x]);

    // Expected look-ahead update: x -= lr * (momentum * velocity + grad)
    //   grad     = 2 * 2 = 4
    //   velocity = 0.9 * 0 + 4 = 4
    //   x        = 2 - 0.1 * (0.9 * 4 + 4) = 2 - 0.76 = 1.24
    let updated = x.data()[0];
    assert!(
        (updated - 1.24).abs() < 1e-5,
        "Nesterov update failed: {}",
        updated
    );
}

/// One SGD step with weight decay on `x²`, checked against a hand-computed value.
#[test]
fn test_sgd_weight_decay() {
    clear_graph();

    let mut x = Tensor::from_slice(&[5.0]).requires_grad();

    let objective = x.pow(2.0).sum();
    objective.backward();

    let mut opt = SGD::new(vec![&mut x], 0.1).weight_decay(0.1);
    opt.step_with_params(&mut [&mut x]);

    // Expected: the decay term is added to the raw gradient before the step.
    //   grad = 2 * 5 = 10, effective grad = 10 + 0.1 * 5 = 10.5
    //   x    = 5 - 0.1 * 10.5 = 3.95
    let updated = x.data()[0];
    assert!(
        (updated - 3.95).abs() < 1e-5,
        "Weight decay update failed: {}",
        updated
    );
}

/// Adam built with `.betas(0.8, 0.99)` must still lower a positive parameter on `x²`.
#[test]
fn test_adam_with_custom_betas() {
    clear_graph();

    let mut x = Tensor::from_slice(&[1.0]).requires_grad();

    let objective = x.pow(2.0).sum();
    objective.backward();

    let mut opt = Adam::new(vec![&mut x], 0.1).betas(0.8, 0.99);
    opt.step_with_params(&mut [&mut x]);

    // Only the direction of the update is checked.
    let updated = x.data()[0];
    assert!(updated < 1.0);
}

/// Adam built with `.eps(1e-6)` must still lower a positive parameter on `x²`.
#[test]
fn test_adam_with_eps() {
    clear_graph();

    let mut x = Tensor::from_slice(&[1.0]).requires_grad();

    let objective = x.pow(2.0).sum();
    objective.backward();

    let mut opt = Adam::new(vec![&mut x], 0.1).eps(1e-6);
    opt.step_with_params(&mut [&mut x]);

    let updated = x.data()[0];
    assert!(updated < 1.0);
}

/// Adam built with `.weight_decay(0.1)` must lower a positive parameter on `x²`.
///
/// This is a smoke test: it checks the direction of the step only and does not
/// compare against a run without weight decay.
#[test]
fn test_adam_with_weight_decay() {
    clear_graph();

    let mut x = Tensor::from_slice(&[10.0]).requires_grad();

    let objective = x.pow(2.0).sum();
    objective.backward();

    let mut opt = Adam::new(vec![&mut x], 0.1).weight_decay(0.1);
    opt.step_with_params(&mut [&mut x]);

    let updated = x.data()[0];
    assert!(updated < 10.0);
}

/// AdamW configured through the chained `.betas(..)` and `.eps(..)` builders
/// must still lower a positive parameter on `x²`.
#[test]
fn test_adamw_with_custom_betas_and_eps() {
    clear_graph();

    let mut x = Tensor::from_slice(&[3.0]).requires_grad();

    let objective = x.pow(2.0).sum();
    objective.backward();

    let mut opt = AdamW::new(vec![&mut x], 0.1).betas(0.85, 0.995).eps(1e-7);
    opt.step_with_params(&mut [&mut x]);

    let updated = x.data()[0];
    assert!(updated < 3.0);
}

/// `AdamW::lr` reports the constructor value and follows `set_lr`.
#[test]
fn test_adamw_lr_methods() {
    let mut weights = Tensor::from_slice(&[1.0]).requires_grad();
    let mut opt = AdamW::new(vec![&mut weights], 0.01);

    let initial_lr = opt.lr();
    assert!((initial_lr - 0.01).abs() < 1e-6);

    opt.set_lr(0.001);
    let changed_lr = opt.lr();
    assert!((changed_lr - 0.001).abs() < 1e-6);
}

/// `AdamW::zero_grad` must remove the gradient that `backward()` stored for the parameter.
#[test]
fn test_adamw_zero_grad() {
    clear_graph();

    let mut weights = Tensor::from_slice(&[2.0]).requires_grad();
    let weights_id = weights.id();

    let objective = weights.pow(2.0).sum();
    objective.backward();

    // Precondition: a gradient exists before zeroing.
    let had_grad = get_grad(weights_id).is_some();
    assert!(had_grad);

    let mut opt = AdamW::new(vec![&mut weights], 0.1);
    opt.zero_grad();

    let cleared = get_grad(weights_id).is_none();
    assert!(cleared);
}

/// Calling `step()` on AdamW, with no backward pass run in this test, must mark
/// the optimizer as initialized and advance its step counter to 1.
#[test]
fn test_adamw_step_trait() {
    let mut weights = Tensor::from_slice(&[1.0]).requires_grad();
    let mut opt = AdamW::new(vec![&mut weights], 0.1);

    // Exercises the `Optimizer` trait's `step` rather than `step_with_params`.
    opt.step();

    assert!(opt.initialized);
    assert_eq!(opt.t, 1);
}

/// RMSprop built with `.alpha(0.9)` must lower a positive parameter on `x²`.
#[test]
fn test_rmsprop_with_alpha() {
    clear_graph();

    let mut x = Tensor::from_slice(&[2.0]).requires_grad();

    let objective = x.pow(2.0).sum();
    objective.backward();

    let mut opt = RMSprop::new(vec![&mut x], 0.1).alpha(0.9);
    opt.step_with_params(&mut [&mut x]);

    let updated = x.data()[0];
    assert!(updated < 2.0);
}

/// RMSprop built with `.eps(1e-6)` must lower a positive parameter on `x²`.
#[test]
fn test_rmsprop_with_eps() {
    clear_graph();

    let mut x = Tensor::from_slice(&[2.0]).requires_grad();

    let objective = x.pow(2.0).sum();
    objective.backward();

    let mut opt = RMSprop::new(vec![&mut x], 0.1).eps(1e-6);
    opt.step_with_params(&mut [&mut x]);

    let updated = x.data()[0];
    assert!(updated < 2.0);
}

/// RMSprop with `.momentum(0.9)` must keep lowering a positive parameter on
/// `x²` across two consecutive steps.
#[test]
fn test_rmsprop_with_momentum() {
    clear_graph();

    let mut x = Tensor::from_slice(&[3.0]).requires_grad();

    // Step 1
    let first_loss = x.pow(2.0).sum();
    first_loss.backward();

    let mut opt = RMSprop::new(vec![&mut x], 0.1).momentum(0.9);
    opt.step_with_params(&mut [&mut x]);

    let after_first = x.data()[0];
    assert!(after_first < 3.0);

    // Step 2: fresh graph, same optimizer instance.
    clear_graph();
    let second_loss = x.pow(2.0).sum();
    second_loss.backward();

    opt.step_with_params(&mut [&mut x]);

    let after_second = x.data()[0];
    assert!(after_second < after_first);
}

/// RMSprop built with `.weight_decay(0.1)` must lower a positive parameter on `x²`.
///
/// This is a smoke test: only the direction of the step is checked.
#[test]
fn test_rmsprop_with_weight_decay() {
    clear_graph();

    let mut x = Tensor::from_slice(&[5.0]).requires_grad();

    let objective = x.pow(2.0).sum();
    objective.backward();

    let mut opt = RMSprop::new(vec![&mut x], 0.1).weight_decay(0.1);
    opt.step_with_params(&mut [&mut x]);

    let updated = x.data()[0];
    assert!(updated < 5.0);
}

/// `RMSprop::lr` reports the constructor value and follows `set_lr`.
#[test]
fn test_rmsprop_lr_methods() {
    let mut weights = Tensor::from_slice(&[1.0]).requires_grad();
    let mut opt = RMSprop::new(vec![&mut weights], 0.01);

    let initial_lr = opt.lr();
    assert!((initial_lr - 0.01).abs() < 1e-6);

    opt.set_lr(0.001);
    let changed_lr = opt.lr();
    assert!((changed_lr - 0.001).abs() < 1e-6);
}

/// `RMSprop::zero_grad` must remove the gradient that `backward()` stored for the parameter.
#[test]
fn test_rmsprop_zero_grad() {
    clear_graph();

    let mut weights = Tensor::from_slice(&[2.0]).requires_grad();
    let weights_id = weights.id();

    let objective = weights.pow(2.0).sum();
    objective.backward();

    // Precondition: a gradient exists before zeroing.
    let had_grad = get_grad(weights_id).is_some();
    assert!(had_grad);

    let mut opt = RMSprop::new(vec![&mut weights], 0.1);
    opt.zero_grad();

    let cleared = get_grad(weights_id).is_none();
    assert!(cleared);
}

/// Calling `step()` on RMSprop, with no backward pass run in this test, must
/// mark the optimizer as initialized.
#[test]
fn test_rmsprop_step_trait() {
    let mut weights = Tensor::from_slice(&[1.0]).requires_grad();
    let mut opt = RMSprop::new(vec![&mut weights], 0.1);

    opt.step();

    assert!(opt.initialized);
}

/// Calling `step()` on SGD, with no backward pass run in this test, must mark
/// the optimizer as initialized.
#[test]
fn test_sgd_step_trait() {
    let mut weights = Tensor::from_slice(&[1.0]).requires_grad();
    let mut opt = SGD::new(vec![&mut weights], 0.1);

    opt.step();

    assert!(opt.initialized);
}

/// Calling `step()` on Adam, with no backward pass run in this test, must mark
/// the optimizer as initialized and advance its step counter to 1.
#[test]
fn test_adam_step_trait() {
    let mut weights = Tensor::from_slice(&[1.0]).requires_grad();
    let mut opt = Adam::new(vec![&mut weights], 0.1);

    opt.step();

    assert!(opt.initialized);
    assert_eq!(opt.t, 1);
}

// Further optimizer test suites. Each module is loaded from the file named in
// its `#[path]` attribute rather than from the default module location.
#[path = "tests_adam.rs"]
mod tests_adam;
#[path = "tests_adamw_contract.rs"]
mod tests_adamw_contract;
#[path = "tests_large_tensors.rs"]
mod tests_large_tensors;
#[path = "tests_state_resize.rs"]
mod tests_state_resize;

/// REGRESSION (direct): running `Linear::forward` followed by `backward()` must
/// leave a gradient on the layer's WEIGHT, not only on its bias.
///
/// The defect being guarded: the `weight_t` transpose edge cached at
/// construction time is wiped by `clear_graph()`, so `weight` has no path to a
/// gradient and `get_grad(weight.id())` returns `None`.
/// Falsifies proof obligation GRAD-FLOW of `contracts/nn-training-gradient-path-v1.yaml`.
#[test]
fn nn_linear_backward_populates_weight_grad() {
    use crate::autograd::{clear_graph, get_grad, Tensor};
    use crate::nn::{Linear, Module};

    // Stand-in for the per-step reset a training loop performs.
    clear_graph();

    let linear = Linear::with_seed(4, 3, Some(1));
    let input = Tensor::from_vec(vec![0.5; 8], &[2, 4]);
    let output = linear.forward(&input);
    output.sum().backward();

    let weight_has_grad = get_grad(linear.weight().id()).is_some();
    assert!(
        weight_has_grad,
        "Linear weight received NO gradient through forward — the autograd path to \
         `weight` is broken (cached construction-time transpose wiped by clear_graph)."
    );

    // The bias is optional; when present it must have received a gradient too.
    if let Some(bias) = linear.bias() {
        let bias_has_grad = get_grad(bias.id()).is_some();
        assert!(bias_has_grad, "Linear bias received no gradient either.");
    }
}

/// REGRESSION: a 2-layer MLP trained with the canonical loop (`clear_graph` →
/// forward → backward → `SGD::step_with_params`) MUST converge.
///
/// Protects the Linear weight-gradient-path fix (linear.rs forward). A `Linear`
/// caches `weight_t = weight.transpose()` when it is built, and the per-step
/// `clear_graph()` of a training loop removes that construction-time transpose
/// edge. If forward reused the cached transpose, `weight` would have no route
/// to a gradient (`get_grad(weight.id())` is `None`), only the biases would be
/// updated, and the loss would barely move. Forward now re-derives the
/// transpose from the live weight while grad-tracking, so the weights learn.
/// The test is deterministic: data and layers are seeded.
#[test]
fn nn_mlp_training_converges() {
    use crate::autograd::{clear_graph, Tensor};
    use crate::nn::{Linear, MSELoss, Module, ReLU, Sequential, SGD};

    let (n, din, dh) = (256usize, 16usize, 8usize);

    // Seeded xorshift-style generator; the top 24 bits are scaled to [-0.5, 0.5).
    let mut state: u64 = 7;
    let mut rng = || {
        state ^= state << 13;
        state ^= state >> 7;
        state ^= state << 17;
        ((state >> 40) as f32 / (1u64 << 24) as f32) - 0.5
    };

    // Hidden linear map that generates the regression targets.
    let true_w: Vec<f32> = (0..din).map(|_| rng()).collect();

    // Each sample is `din` random features; its target is their dot product with `true_w`.
    let mut features = Vec::new();
    let mut targets = Vec::new();
    for _ in 0..n {
        let mut dot = 0.0;
        for &wj in &true_w {
            let v = rng();
            features.push(v);
            dot += v * wj;
        }
        targets.push(dot);
    }

    let x = Tensor::from_vec(features, &[n, din]);
    let y = Tensor::from_vec(targets, &[n, 1]);

    let mut model = Sequential::new()
        .add(Linear::with_seed(din, dh, Some(1)))
        .add(ReLU)
        .add(Linear::with_seed(dh, 1, Some(2)));
    let crit = MSELoss::new();
    let mut sgd = SGD::new(model.parameters_mut(), 0.1);

    // `first` is the loss before any update, `last` the loss seen on the final step.
    let mut first = 0.0;
    let mut last = 0.0;
    for step in 0..500 {
        clear_graph();
        let prediction = model.forward(&x);
        let loss = crit.forward(&prediction, &y);
        last = loss.item();
        if step == 0 {
            first = last;
        }
        loss.backward();
        let mut params = model.parameters_mut();
        sgd.step_with_params(&mut params);
    }

    assert!(
        first > 0.05,
        "sanity: initial MSE should be non-trivial, got {first}"
    );
    // With frozen weights (the defect) only the biases learn and the loss drops
    // by roughly 50%; a working weight gradient path drops it by more than 98%.
    // Requiring an 80% drop separates the two cases with a wide margin.
    assert!(
        last < first * 0.2,
        "MLP training did not converge: MSE {first:.5} -> {last:.5} (need < {:.5}). \
         Likely a regression of the Linear live-transpose gradient-path fix.",
        first * 0.2
    );
}