tokitai-operator 0.1.0

Verified DL kernel compiler: formally-checked GEMM, p-adic, sheaf, contract-carrying ops. Paper-artifact grade.
Documentation
//! Quality decision dataset generator.
//!
//! Mirrors the input/output schema of the real `tokitai-search`
//! `QualityDecisionRecord -> QualityOutcomeRecord` join (see
//! `crates/training/` in the parent repo):
//!
//! - Input is 96-dim: a concatenation of
//!   - the 74-dim categorical one-hot (9+11+14+7+5+16+8+4), and
//!   - the 22-dim numerical vector (milli-unit, mostly `i16`).
//! - Output is 20-dim: a concatenation of
//!   - the 12-way `outcome_kind` one-hot, and
//!   - 8 aux metric regression targets (in `[0, 1]`).
//!
//! The 74 categorical dims are encoded as **soft** one-hots: we
//! allow up to 2 active bits per block (the integer index in `0..D`
//! gets a `1.0`, and an adjacent bit is sometimes flipped to `0.7`
//! to model label noise / fuzzy boundaries). This keeps the f32
//! interface simple while preserving the "one-hot-like" structure
//! the router/embedding layers expect.
//!
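//! The 22 numerical dims start from an i.i.d. uniform draw on
//! `[0, 1000)` milli-units, are shifted by a small per-feature bias
//! (drawn once per dataset and shared by every sample, so samples
//! in the stream have non-trivial correlation) plus a small
//! per-draw Gaussian jitter, and are then divided by 1000 and
//! clamped to `[0, 1]` (the real `extract_quality_signals.rs`
//! normalizes milli-units to [0, 1] before training).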
//!
//! ## Per-expert non-linear generation
//!
//! A 4-way linear router projection `r = R . x_collapsed` picks the
//! "true" expert (argmax of r). `x_collapsed` is the 8-dim
//! projection of the 96-dim input through a fixed 96x8 matrix
//! `P_routed` (so the router sees a stable low-dim view of the
//! features). Each expert `e in 0..4` owns:
//! - a `12 x 8` outcome-projection matrix `W_outcome_e`, and
//! - an `8 x 8` aux-projection matrix `W_aux_e`.
//!
//! The outcome target is `softmax(W_outcome_e . x_collapsed + b_e)`
//! (12-dim softmax over the 12 outcome classes) — this is the
//! non-linearity. The aux target is `sigmoid(W_aux_e . x_collapsed +
//! b_aux_e)` clipped to `[0, 1]`.

use rand::Rng;
use rand::SeedableRng;
use rand::rngs::StdRng;

use crate::synth_data::{QUALITY_INPUT_DIM, QUALITY_OUTPUT_DIM};

/// Number of categorical one-hot dimensions (matches
/// `tokitai-search::crates::training::CATEGORICAL_DIMS`).
///
/// This is the sum of the eight categorical block widths used by the
/// generator: `9 + 11 + 14 + 7 + 5 + 16 + 8 + 4 = 74`.
pub const CATEGORICAL_DIMS: usize = 74;
/// Number of numerical dimensions (matches
/// `tokitai-search::crates::training::NUMERICAL_DIMS`).
///
/// These follow the categorical entries in each sample's input vector.
pub const NUMERICAL_DIMS: usize = 22;
/// Number of outcome classes. The leading `OUTCOME_DIMS` entries of each
/// generated target hold a softmax distribution over these classes.
pub const OUTCOME_DIMS: usize = 12;
/// Number of aux metric regression targets. They occupy the trailing
/// `AUX_DIMS` entries of each generated target.
pub const AUX_DIMS: usize = 8;
/// Number of experts in the synthetic generation process.
pub const N_EXPERTS: usize = 4;
/// Dimensionality of the collapsed feature view used by the router
/// and the per-expert projections.
///
/// Private: it is an internal detail of the generator, not part of the
/// dataset schema.
const ROUTED_DIM: usize = 8;

/// One synthetic quality-decision sample.
///
/// Samples are produced by `make_quality_decision_dataset`; the widths
/// quoted below are the ones that generator emits.
#[derive(Debug, Clone)]
pub struct QualitySample {
    /// 96-dim input vector: the 74 categorical soft one-hot entries
    /// followed by the 22 numerical features.
    pub input: Vec<f32>,
    /// 20-dim target vector: a 12-way softmax distribution over the
    /// outcome classes followed by 8 aux metrics in `[0, 1]`.
    pub target: Vec<f32>,
    /// The "true" expert assignment in `0..N_EXPERTS`. Useful for
    /// debugging the MoE router and for stratified sampling.
    pub expert_assignment: usize,
    /// Sequential sample index (0..n_samples).
    pub index: usize,
}

impl QualitySample {
    /// Runtime shape check.
    ///
    /// Returns `true` when `input` and `target` have exactly the widths
    /// given by `QUALITY_INPUT_DIM` and `QUALITY_OUTPUT_DIM` (imported
    /// from `crate::synth_data`), and `false` otherwise.
    pub fn shape_ok(&self) -> bool {
        let expected = (QUALITY_INPUT_DIM, QUALITY_OUTPUT_DIM);
        let actual = (self.input.len(), self.target.len());
        actual == expected
    }
}

/// Build a deterministic quality-decision dataset.
///
/// # Arguments
/// - `n_samples`: how many samples to generate.
/// - `seed`: 64-bit seed. The same `(n_samples, seed)` always
///   produces the same dataset (modulo the public
///   `regression_dataset_is_deterministic` invariant).
///
/// # Returns
/// A vector of `n_samples` [`QualitySample`]s, each with
/// `input.len() == 96` and `target.len() == 20`.
pub fn make_quality_decision_dataset(n_samples: usize, seed: u64) -> Vec<QualitySample> {
    if n_samples == 0 {
        return Vec::new();
    }

    let mut rng = StdRng::seed_from_u64(seed);

    // Fixed router projection: P_routed is 96 x 8. The collapsed
    // view is what the synthetic router (and each expert) sees.
    // P_routed entries ~ N(0, 1/sqrt(96)) (He-ish for a 96-wide
    // input).
    let p_routed = gauss_matrix(&mut rng, QUALITY_INPUT_DIM, ROUTED_DIM, 1.0 / 96f32.sqrt());

    // Fixed router R: ROUTED_DIM x N_EXPERTS, entries ~ N(0, 1).
    let router_w = gauss_matrix(&mut rng, ROUTED_DIM, N_EXPERTS, 1.0);

    // Per-expert outcome projection: N_EXPERTS matrices, each
    // (OUTCOME_DIMS x ROUTED_DIM), entries ~ N(0, 1).
    let mut outcome_w: Vec<Vec<f32>> = Vec::with_capacity(N_EXPERTS);
    let mut outcome_b: Vec<f32> = Vec::with_capacity(N_EXPERTS * OUTCOME_DIMS);
    for _ in 0..N_EXPERTS {
        outcome_w.push(gauss_matrix(&mut rng, OUTCOME_DIMS, ROUTED_DIM, 1.0));
        for _ in 0..OUTCOME_DIMS {
            outcome_b.push(gauss(&mut rng, 0.0, 0.1));
        }
    }

    // Per-expert aux projection: N_EXPERTS matrices, each
    // (AUX_DIMS x ROUTED_DIM), entries ~ N(0, 1).
    let mut aux_w: Vec<Vec<f32>> = Vec::with_capacity(N_EXPERTS);
    let mut aux_b: Vec<f32> = Vec::with_capacity(N_EXPERTS * AUX_DIMS);
    for _ in 0..N_EXPERTS {
        aux_w.push(gauss_matrix(&mut rng, AUX_DIMS, ROUTED_DIM, 1.0));
        for _ in 0..AUX_DIMS {
            aux_b.push(gauss(&mut rng, 0.0, 0.1));
        }
    }

    // Sample bias applied to each numerical feature so adjacent
    // samples in the stream have non-trivial correlation. A scalar
    // per numerical dim, drawn once.
    let num_bias: Vec<f32> = (0..NUMERICAL_DIMS)
        .map(|_| gauss(&mut rng, 0.0, 50.0))
        .collect();

    let mut out: Vec<QualitySample> = Vec::with_capacity(n_samples);
    for idx in 0..n_samples {
        // 1) Build the 74-dim categorical one-hot.
        // We pick one index per block (8 blocks) and set a strong
        // 1.0 there; with 15% probability we also flip an adjacent
        // index to 0.7 (label noise / soft boundary).
        let block_dims: [usize; 8] = [9, 11, 14, 7, 5, 16, 8, 4];
        let mut cat: Vec<f32> = vec![0.0f32; CATEGORICAL_DIMS];
        let mut off = 0usize;
        for &d in &block_dims {
            let primary = rng.gen_range(0..d);
            cat[off + primary] = 1.0;
            if d > 1 && rng.gen_range(0.0f32..1.0f32) < 0.15 {
                let secondary = (primary + 1) % d;
                cat[off + secondary] = 0.7;
            }
            off += d;
        }
        debug_assert_eq!(off, CATEGORICAL_DIMS);

        // 2) Build the 22-dim numerical vector normalized to [0, 1].
        let mut num: Vec<f32> = Vec::with_capacity(NUMERICAL_DIMS);
        for j in 0..NUMERICAL_DIMS {
            // Base draw on [0, 1000] milli-units, plus per-feature
            // bias and a small jitter; divide by 1000 to match the
            // real extract_quality_signals normalization.
            let base: f32 = rng.gen_range(0.0f32..1000.0f32);
            let jitter: f32 = gauss(&mut rng, 0.0, 5.0);
            let v = ((base + num_bias[j] + jitter) / 1000.0f32).clamp(0.0, 1.0);
            num.push(v);
        }

        // 3) Concatenate categorical and numerical into the 96-dim
        //    input.
        let mut input: Vec<f32> = Vec::with_capacity(QUALITY_INPUT_DIM);
        input.extend_from_slice(&cat);
        input.extend_from_slice(&num);
        debug_assert_eq!(input.len(), QUALITY_INPUT_DIM);

        // 4) Compute the collapsed 8-dim view via P_routed.
        let x_collapsed = mat_vec(&p_routed, &input, QUALITY_INPUT_DIM, ROUTED_DIM);

        // 5) Pick the true expert via the 4-way router projection.
        let r = mat_vec(&router_w, &x_collapsed, ROUTED_DIM, N_EXPERTS);
        let expert = argmax(&r);

        // 6) Build the 12-dim outcome target with softmax. Add a
        //    small Gaussian jitter to the logits so the Brier-MSE
        //    head has a meaningful calibration target (not a
        //    deterministic one-hot the model can memorize).
        let w_e = &outcome_w[expert];
        let b_e = &outcome_b[expert * OUTCOME_DIMS..(expert + 1) * OUTCOME_DIMS];
        let mut logits = mat_vec(w_e, &x_collapsed, ROUTED_DIM, OUTCOME_DIMS);
        for j in 0..OUTCOME_DIMS {
            logits[j] += b_e[j] + gauss(&mut rng, 0.0, 0.1);
        }
        let outcome_soft = softmax(&logits);

        // 7) Build the 8-dim aux target with sigmoid. Flip the
        //    target with probability 0.05 so the aux regression
        //    head isn't deterministic.
        let w_aux_e = &aux_w[expert];
        let b_aux_e = &aux_b[expert * AUX_DIMS..(expert + 1) * AUX_DIMS];
        let mut aux_logits = mat_vec(w_aux_e, &x_collapsed, ROUTED_DIM, AUX_DIMS);
        for j in 0..AUX_DIMS {
            aux_logits[j] += b_aux_e[j];
        }
        let aux_target: Vec<f32> = aux_logits
            .iter()
            .map(|v| {
                let p = sigmoid(*v);
                let p = if rng.gen_range(0.0f32..1.0f32) < 0.05 {
                    1.0 - p
                } else {
                    p
                };
                p.clamp(0.0, 1.0)
            })
            .collect();

        // 8) Concatenate outcome + aux into the 20-dim target.
        let mut target: Vec<f32> = Vec::with_capacity(QUALITY_OUTPUT_DIM);
        target.extend_from_slice(&outcome_soft);
        target.extend_from_slice(&aux_target);
        debug_assert_eq!(target.len(), QUALITY_OUTPUT_DIM);

        out.push(QualitySample {
            input,
            target,
            expert_assignment: expert,
            index: idx,
        });
    }
    out
}

// ----- helpers -----

/// Draw one sample from N(mean, std) via the Box-Muller transform.
///
/// Consumes exactly two uniform draws from `rng`: the first (kept away
/// from zero by a `1e-7` lower bound so the logarithm stays finite)
/// sets the radius, the second sets the angle. Only the cosine branch
/// of the transform is used.
#[inline]
fn gauss<R: Rng>(rng: &mut R, mean: f32, std: f32) -> f32 {
    use std::f32::consts::TAU;

    let radial_u: f32 = rng.gen_range((1.0e-7f32)..1.0f32);
    let angular_u: f32 = rng.gen_range(0.0f32..1.0f32);

    let radius = (-2.0f32 * radial_u.ln()).sqrt();
    let angle = TAU * angular_u;
    let unit_normal = radius * angle.cos();

    mean + std * unit_normal
}

/// Draw a flat buffer of `rows * cols` i.i.d. N(0, std) values, to be
/// interpreted as a `rows x cols` matrix.
///
/// Entries are drawn one at a time, in storage order, from `rng`.
fn gauss_matrix<R: Rng>(rng: &mut R, rows: usize, cols: usize, std: f32) -> Vec<f32> {
    let n_entries = rows * cols;
    (0..n_entries)
        .map(|_| gauss(&mut *rng, 0.0, std))
        .collect()
}

/// Matrix-vector product `y = M . x`.
///
/// `m` holds a `rows x cols` matrix in row-major order and `x` supplies
/// at least `cols` entries. Note the argument order: the column count
/// comes *before* the row count. The result has `rows` entries; each is
/// accumulated left to right in `f32`, starting from `0.0`.
///
/// Panics on out-of-bounds indexing if `m` or `x` is too short for the
/// requested dimensions.
fn mat_vec(m: &[f32], x: &[f32], cols: usize, rows: usize) -> Vec<f32> {
    (0..rows)
        .map(|row| {
            let row_start = row * cols;
            (0..cols).fold(0.0f32, |dot, k| dot + m[row_start + k] * x[k])
        })
        .collect()
}

/// Softmax over a slice, returned as a freshly allocated vector.
///
/// The largest logit is subtracted before exponentiating to keep the
/// exponentials from overflowing. If the exponentials do not sum to a
/// positive number (for example when every logit is `-inf`), the result
/// falls back to the uniform distribution.
fn softmax(logits: &[f32]) -> Vec<f32> {
    let peak = logits.iter().copied().fold(f32::NEG_INFINITY, f32::max);
    let exps: Vec<f32> = logits.iter().map(|&z| (z - peak).exp()).collect();
    let total: f32 = exps.iter().sum();

    if total > 0.0 {
        exps.into_iter().map(|e| e / total).collect()
    } else {
        // Degenerate input: spread the mass evenly.
        let n = exps.len();
        vec![1.0f32 / (n as f32); n]
    }
}

/// Logistic sigmoid in fp32.
///
/// Inputs strictly above `40` return exactly `1.0` and inputs strictly
/// below `-40` return exactly `0.0`; everything else (including the
/// boundary values themselves) goes through `1 / (1 + e^-x)`.
fn sigmoid(x: f32) -> f32 {
    const SATURATION: f32 = 40.0;
    match x {
        v if v > SATURATION => 1.0,
        v if v < -SATURATION => 0.0,
        v => 1.0 / (1.0 + (-v).exp()),
    }
}

/// Position of the largest element, with the earliest position winning
/// ties.
///
/// The running best starts at index `0` with value `-inf` and is only
/// replaced by a strictly greater value, so an empty slice, or one
/// containing nothing greater than `-inf` (e.g. all `NaN`), yields `0`.
fn argmax(xs: &[f32]) -> usize {
    let start = (0usize, f32::NEG_INFINITY);
    let (winner, _) = xs
        .iter()
        .copied()
        .enumerate()
        .fold(start, |(lead, lead_val), (pos, val)| {
            if val > lead_val {
                (pos, val)
            } else {
                (lead, lead_val)
            }
        });
    winner
}