// tokitai-operator 0.1.0

//! Linear regression dataset generator.
//!
//! Generates `n_samples` pairs `(x, y)` with:
//! - `x in R^{in_dim}`, each coordinate drawn i.i.d. uniform on `[-0.5, 0.5)`
//! - `y in R^{out_dim}` defined as `y = X w_star + epsilon`
//! - `w_star` is a fixed `in_dim x out_dim` matrix, deterministic from
//!   the seed, with entries drawn i.i.d. from a zero-mean Gaussian with
//!   standard deviation `1 / sqrt(in_dim)` (He-ish initialization to
//!   keep `y` roughly unit-scaled).
//! - `epsilon` is small Gaussian noise with stddev `1e-2` so the
//!   closed-form linear regression is a true solve, not a
//!   memorization.
//!
//! The dataset is designed so that an off-the-shelf linear
//! regression (closed form) recovers `w_star` to within
//! `O(epsilon * sqrt(in_dim / n_samples))`. The smoke test in
//! `tests/synth_data_smoke.rs::regression_dataset_solves_with_linear_regression`
//! verifies this contract.

use rand::Rng;
use rand::SeedableRng;
use rand::rngs::StdRng;

/// One `(input, target)` pair for linear regression.
///
/// The first element is the feature vector `x` and the second is the
/// target vector `y`, in that order.
///
/// `Vec<f32>` is used for ergonomics in tests and small toy
/// training loops. The training step driver in `src/training/` already
/// consumes `Vec<f32>` activations end-to-end.
pub type RegressionSample = (Vec<f32>, Vec<f32>);

/// Generate a synthetic linear regression dataset `y = X w_star + eps`.
///
/// A single RNG seeded with `seed` drives everything: the weights
/// `w_star` are sampled first, then every sample draws its `in_dim`
/// features followed by one noise value per output.
///
/// # Arguments
/// - `n_samples`: number of `(x, y)` rows to produce. Should be
///   `> in_dim` for the closed-form solve to be well-conditioned.
/// - `in_dim`: number of input features (length of each `x`).
/// - `out_dim`: number of regression targets (length of each `y`).
/// - `seed`: 64-bit seed for the RNG. Identical
///   `(n_samples, in_dim, out_dim, seed)` arguments reproduce the
///   same dataset (for a given version of the `rand` crate).
///
/// # Returns
/// A vector holding `n_samples` pairs, each with `x.len() == in_dim`
/// and `y.len() == out_dim`. An empty vector is returned when
/// `n_samples == 0`.
///
/// # Panics
/// Panics if `in_dim` or `out_dim` is zero while `n_samples` is
/// non-zero.
pub fn make_regression_dataset(
    n_samples: usize,
    in_dim: usize,
    out_dim: usize,
    seed: u64,
) -> Vec<RegressionSample> {
    // Stddev of the additive Gaussian noise on the targets; small but
    // non-zero so the closed form is a genuine least-squares fit.
    const NOISE_STD: f32 = 1e-2;

    // An empty request short-circuits before the dimension checks.
    if n_samples == 0 {
        return Vec::new();
    }
    assert!(in_dim >= 1, "in_dim must be >= 1");
    assert!(out_dim >= 1, "out_dim must be >= 1");

    let mut rng = StdRng::seed_from_u64(seed);

    // w_star entries are zero-mean Gaussian with stddev 1/sqrt(in_dim),
    // sampled through the local Box-Muller helper so `rand_distr` is
    // not needed. Storage is flat: the `in_dim` weights feeding output
    // `j` occupy `w_star[j * in_dim..(j + 1) * in_dim]`.
    let weight_std = (in_dim as f32).sqrt().recip();
    let weight_count = in_dim * out_dim;
    let w_star: Vec<f32> = (0..weight_count)
        .map(|_| gauss(&mut rng, 0.0f32, weight_std))
        .collect();

    let mut samples: Vec<RegressionSample> = Vec::with_capacity(n_samples);
    for _ in 0..n_samples {
        // Features: each coordinate uniform on the half-open [-0.5, 0.5).
        let x: Vec<f32> = (0..in_dim)
            .map(|_| rng.gen_range(-0.5f32..0.5f32))
            .collect();

        // Targets: one dot product per output, then a fresh noise draw.
        // The fold starts from 0.0 and adds left to right so the
        // floating-point summation order is fixed.
        let y: Vec<f32> = w_star
            .chunks_exact(in_dim)
            .map(|w_row| {
                let dot = x
                    .iter()
                    .zip(w_row)
                    .fold(0.0f32, |acc, (xi, wi)| acc + xi * wi);
                dot + gauss(&mut rng, 0.0f32, NOISE_STD)
            })
            .collect();

        samples.push((x, y));
    }
    samples
}

/// Draw a single value from a Gaussian with the given `mean` and
/// standard deviation `std`, via the Box-Muller transform.
///
/// Box-Muller yields two normals per pair of uniforms; only the cosine
/// branch is kept and the sine branch is never computed. Each call
/// therefore consumes exactly two uniform draws from `rng`, which is
/// plenty for dataset generation.
#[inline]
fn gauss<R: Rng>(rng: &mut R, mean: f32, std: f32) -> f32 {
    // The radial uniform is bounded away from 0 so `ln` stays finite.
    let radial_uniform: f32 = rng.gen_range((1.0e-7f32)..1.0f32);
    let angular_uniform: f32 = rng.gen_range(0.0f32..1.0f32);

    let radius = (-2.0f32 * radial_uniform.ln()).sqrt();
    let angle = 2.0f32 * std::f32::consts::PI * angular_uniform;

    // Standard normal draw, then shift and scale to N(mean, std).
    let standard_normal = radius * angle.cos();
    mean + std * standard_normal
}