keyhog-scanner 0.5.40

//! Mixture-of-experts model weights for the ML secret scorer.
//!
//! Architecture: gate Linear(42,6) → Softmax plus 6 experts of
//! Linear(42,32) → ReLU → Linear(32,16) → ReLU → Linear(16,1)
//!
//! Generated by: `python3 ml/train_classifier.py --features 42 --write`
//! Serialized as little-endian `f32` values so parsing is deterministic across
//! host architectures. The 42nd input is the decode-structure feature (see
//! `ml_features.rs`), which lets the model discount base64/hex-wrapped binary.

include!(concat!(env!("OUT_DIR"), "/model_version.rs"));

// Raw bytes of the trained model, embedded into the binary at compile time.
// `all_weights` decodes them as consecutive little-endian `f32` values.
const WEIGHTS: &[u8] = include_bytes!("weights.bin");

// Layer widths of the network described in the module docs: 42 input
// features, 6 experts (and therefore 6 gate outputs), and expert hidden
// layers of 32 and 16 units.
const INPUT_DIM: usize = 42;
const EXPERT_COUNT: usize = 6;
const EXPERT_FC1_OUT: usize = 32;
const EXPERT_FC2_OUT: usize = 16;

// Number of `f32` values in the gate layer: weight matrix, then bias vector.
const GATE_W_COUNT: usize = INPUT_DIM * EXPERT_COUNT;
const GATE_B_COUNT: usize = EXPERT_COUNT;

// Number of `f32` values in each field of ONE expert. The fields are stored
// back to back in exactly this order (fc1 weight, fc1 bias, fc2 weight,
// fc2 bias, fc3 weight, fc3 bias); the `expert_*` accessors rely on it.
const EXPERT_FC1_W_COUNT: usize = INPUT_DIM * EXPERT_FC1_OUT;
const EXPERT_FC1_B_COUNT: usize = EXPERT_FC1_OUT;
const EXPERT_FC2_W_COUNT: usize = EXPERT_FC1_OUT * EXPERT_FC2_OUT;
const EXPERT_FC2_B_COUNT: usize = EXPERT_FC2_OUT;
const EXPERT_FC3_W_COUNT: usize = EXPERT_FC2_OUT;
const EXPERT_FC3_B_COUNT: usize = 1;
// Size of one expert's contiguous parameter region (1921 values with the
// dimensions above).
const EXPERT_PARAM_COUNT: usize = EXPERT_FC1_W_COUNT
    + EXPERT_FC1_B_COUNT
    + EXPERT_FC2_W_COUNT
    + EXPERT_FC2_B_COUNT
    + EXPERT_FC3_W_COUNT
    + EXPERT_FC3_B_COUNT;

// Offsets into the parsed buffer, measured in `f32` elements (not bytes):
// gate weights first, then gate bias, then the experts one after another.
const GATE_W_OFF: usize = 0;
const GATE_B_OFF: usize = GATE_W_OFF + GATE_W_COUNT;
const EXPERTS_OFF: usize = GATE_B_OFF + GATE_B_COUNT;
// Expected number of `f32` values in `weights.bin` (11784 with the dimensions
// above, i.e. 47136 bytes); `all_weights` asserts the file matches.
const TOTAL_F32_COUNT: usize = EXPERTS_OFF + EXPERT_COUNT * EXPERT_PARAM_COUNT;

/// Decode the embedded `weights.bin` into `f32` values, once per process.
///
/// The first call parses the bytes and caches the result in a `OnceLock`;
/// later calls return the cached slice.
///
/// # Panics
///
/// Panics if the embedded file is not exactly `TOTAL_F32_COUNT * 4` bytes long.
fn all_weights() -> &'static [f32] {
    static PARSED: std::sync::OnceLock<Box<[f32]>> = std::sync::OnceLock::new();

    let parsed = PARSED.get_or_init(|| {
        // Integrity is enforced by length only. A content hash (e.g. SHA-256)
        // is deliberately not used: every retrain yields different bytes, so a
        // pinned hash would break the retrain -> deploy workflow. The length
        // check still rejects truncated files and files produced for a
        // different architecture, without blocking model updates.
        assert_eq!(
            WEIGHTS.len(),
            TOTAL_F32_COUNT * 4,
            "weights.bin size does not match expected f32 count"
        );

        // Each 4-byte group is one little-endian `f32`, independent of the
        // host's byte order.
        let mut floats = Vec::with_capacity(TOTAL_F32_COUNT);
        for word in WEIGHTS.chunks_exact(4) {
            let mut le_bytes = [0u8; 4];
            le_bytes.copy_from_slice(word);
            floats.push(f32::from_le_bytes(le_bytes));
        }
        floats.into_boxed_slice()
    });

    &parsed[..]
}

/// Borrow `count` consecutive values of the parsed weight buffer, starting at
/// element index `offset` (an `f32` index, not a byte offset).
///
/// # Panics
///
/// Panics if `offset..offset + count` is out of bounds for the parsed buffer,
/// or if the embedded weights fail the size check in `all_weights`.
fn load_f32_slice(offset: usize, count: usize) -> &'static [f32] {
    let weights = all_weights();
    let end = offset + count;
    &weights[offset..end]
}

/// One expert's three dense layers as `&'static` slices into the parsed weight
/// buffer. Sliced ONCE at first model access (see [`model`]) instead of being
/// re-bounds-checked and re-sliced out of the flat buffer on every candidate.
///
/// `fc1_weight_t`/`fc2_weight_t` are the dense-layer weights in COLUMN-major
/// (transposed) layout — input `k`'s fan-out to every output packed contiguously
/// at `[k*OUT .. (k+1)*OUT]`. The forward pass (`ml_scorer.rs`) runs these layers
/// "output-stationary": for each input `k`, scale that contiguous output row by
/// `input[k]` and accumulate into the `OUT` running sums. The inner loop over
/// outputs is dependency-free (each `acc[o]` is independent), so the compiler
/// auto-vectorizes it across the output lanes — WITHOUT reassociating any single
/// output's reduction. Each `acc[o]` still sums its inputs in `k = 0,1,..,IN-1`
/// order with separate round(mul) then round(add) steps (no FMA fusion), so the
/// result is BIT-IDENTICAL to the row-major scalar dot product, not merely close.
/// (A previous AVX2+FMA attempt fused the multiply-add and reassociated lanes,
/// which was sub-ULP DIFFERENT and regressed ~30 ML-gated contracts; this layout
/// gets the SIMD width with none of that divergence.) `fc3_weight` stays ROW-major
/// (a single output — nothing to vectorize across), as does the 6-output gate.
///
/// Note that the two transposed fields are not views into the parsed buffer:
/// [`model`] builds them with `transpose_static`, which copies the values into
/// a separately allocated buffer that is leaked for the life of the process.
/// The bias fields and `fc3_weight` are direct slices of the parsed buffer.
pub(crate) struct ExpertWeights {
    /// Transposed fc1 weights, `INPUT_DIM * EXPERT_FC1_OUT` values; the weight
    /// from input `k` to output `o` is at `[k * EXPERT_FC1_OUT + o]`.
    pub fc1_weight_t: &'static [f32],
    /// fc1 bias, `EXPERT_FC1_OUT` values.
    pub fc1_bias: &'static [f32],
    /// Transposed fc2 weights, `EXPERT_FC1_OUT * EXPERT_FC2_OUT` values; the
    /// weight from input `k` to output `o` is at `[k * EXPERT_FC2_OUT + o]`.
    pub fc2_weight_t: &'static [f32],
    /// fc2 bias, `EXPERT_FC2_OUT` values.
    pub fc2_bias: &'static [f32],
    /// Output-layer weights in their stored (untransposed) order,
    /// `EXPERT_FC2_OUT` values.
    pub fc3_weight: &'static [f32],
    /// Output-layer bias: the single value of the expert's fc3 bias slice.
    pub fc3_bias: f32,
}

/// The full mixture-of-experts model with every weight/bias slice resolved
/// once. The CPU forward pass borrows `&'static MoeModel` a single time per
/// call and indexes these fields directly, so the 37 per-candidate accessor +
/// `OnceLock`-acquire + re-slice calls collapse to one acquire and zero
/// re-slicing. The slices are the SAME bytes the per-layer accessors return,
/// so scores are bit-identical to the pre-hoist path. (For the transposed
/// `fc1_weight_t`/`fc2_weight_t` fields of each expert this means the same
/// values copied verbatim into a reordered buffer, not the same memory.)
pub(crate) struct MoeModel {
    /// Gate-layer weights, as returned by `gate_weight()`
    /// (`INPUT_DIM * EXPERT_COUNT` values).
    pub gate_weight: &'static [f32],
    /// Gate-layer bias, as returned by `gate_bias()` (`EXPERT_COUNT` values).
    pub gate_bias: &'static [f32],
    /// Per-expert layers; element `i` is built from the `expert_*` accessors
    /// called with `expert_idx == i`.
    pub experts: [ExpertWeights; EXPERT_COUNT],
}

/// Reorder a row-major `rows x cols` matrix into column-major order and hand
/// the result back as a `'static` slice.
///
/// The element stored at `src[o * cols + k]` (row `o`, column `k`) ends up at
/// index `k * rows + o` of the returned slice. Values are copied as-is with no
/// arithmetic, so the output holds exactly the input's values in a different
/// order — the layout the output-stationary forward kernel in `ml_scorer.rs`
/// consumes.
///
/// The backing allocation is intentionally leaked: `model()` calls this once
/// per layer during its one-time initialization and the model lives for the
/// whole process, so this is a one-off allocation rather than a per-scan one.
///
/// # Panics
///
/// Panics if `src.len() != rows * cols`.
fn transpose_static(src: &[f32], rows: usize, cols: usize) -> &'static [f32] {
    assert_eq!(src.len(), rows * cols, "transpose dimensions must match");

    // Emit the output front to back: for each source column, walk down the
    // rows. Appending in this order places element (row, col) at
    // `col * rows + row`.
    let mut transposed = Vec::with_capacity(src.len());
    for col in 0..cols {
        transposed.extend((0..rows).map(|row| src[row * cols + col]));
    }

    Box::leak(transposed.into_boxed_slice())
}

/// Return the process-wide hoisted MoE model. The slices are computed exactly
/// once (behind the same `OnceLock` as the parsed buffer) and shared for the
/// life of the process; every subsequent call is a single atomic-acquire load
/// of an already-initialized pointer.
pub(crate) fn model() -> &'static MoeModel {
    static MODEL: std::sync::OnceLock<MoeModel> = std::sync::OnceLock::new();
    MODEL.get_or_init(|| {
        // Touch the parsed buffer once so the per-layer accessors below resolve
        // against an initialized `&'static [f32]`.
        let _ = all_weights();
        let experts = std::array::from_fn(|expert_idx| ExpertWeights {
            fc1_weight_t: transpose_static(
                expert_fc1_weight(expert_idx),
                EXPERT_FC1_OUT,
                INPUT_DIM,
            ),
            fc1_bias: expert_fc1_bias(expert_idx),
            fc2_weight_t: transpose_static(
                expert_fc2_weight(expert_idx),
                EXPERT_FC2_OUT,
                EXPERT_FC1_OUT,
            ),
            fc2_bias: expert_fc2_bias(expert_idx),
            fc3_weight: expert_fc3_weight(expert_idx),
            fc3_bias: expert_fc3_bias(expert_idx)[0],
        });
        MoeModel {
            gate_weight: gate_weight(),
            gate_bias: gate_bias(),
            experts,
        }
    })
}

/// Return the full flattened weight buffer (used by GPU batch inference).
///
/// The buffer holds `TOTAL_F32_COUNT` values in file order: gate weights,
/// gate bias, then each expert's parameters one expert after another.
///
/// Only the GPU MoE backend consumes the flat buffer. The CPU path uses the
/// per-layer accessors below, so leaving this symbol exported in the lean
/// build would surface as dead code. Gating it on `feature = "gpu"` keeps
/// the build clean without an `#[allow(dead_code)]` evasion.
///
/// # Panics
///
/// Panics if the embedded `weights.bin` does not have the expected size.
///
/// # Examples
///
/// ```rust,ignore
/// use keyhog_scanner::ml_weights::all_weights_slice;
/// assert!(!all_weights_slice().is_empty());
/// ```
#[cfg(feature = "gpu")]
pub fn all_weights_slice() -> &'static [f32] {
    all_weights()
}

/// Return the gate-layer weight matrix as a flat row-major slice.
///
/// The slice holds `GATE_W_COUNT` (`INPUT_DIM * EXPERT_COUNT`) values and
/// starts at the very beginning of the parsed weight buffer.
///
/// # Panics
///
/// Panics if the embedded `weights.bin` does not have the expected size.
///
/// # Examples
///
/// ```rust,ignore
/// use keyhog_scanner::ml_weights::gate_weight;
/// assert!(!gate_weight().is_empty());
/// ```
pub fn gate_weight() -> &'static [f32] {
    load_f32_slice(GATE_W_OFF, GATE_W_COUNT)
}

/// Return the gate-layer bias vector.
///
/// The slice holds `GATE_B_COUNT` (`EXPERT_COUNT`) values, one per expert, and
/// immediately follows the gate weights in the parsed weight buffer.
///
/// # Panics
///
/// Panics if the embedded `weights.bin` does not have the expected size.
///
/// # Examples
///
/// ```rust,ignore
/// use keyhog_scanner::ml_weights::gate_bias;
/// assert!(!gate_bias().is_empty());
/// ```
pub fn gate_bias() -> &'static [f32] {
    load_f32_slice(GATE_B_OFF, GATE_B_COUNT)
}

/// Element index (in `f32`s) at which the parameter region of expert
/// `expert_idx` begins inside the parsed weight buffer.
///
/// # Panics
///
/// Panics if `expert_idx >= EXPERT_COUNT`.
fn expert_base_offset(expert_idx: usize) -> usize {
    assert!(expert_idx < EXPERT_COUNT, "expert index out of range");
    // Experts are laid out back to back, each occupying EXPERT_PARAM_COUNT values.
    let offset_within_experts = expert_idx * EXPERT_PARAM_COUNT;
    EXPERTS_OFF + offset_within_experts
}

/// Return the first dense-layer (fc1) weights of expert `expert_idx`.
///
/// The slice holds `EXPERT_FC1_W_COUNT` (`INPUT_DIM * EXPERT_FC1_OUT`) values
/// and is the first field of the expert's parameter region. `model()` treats
/// it as a row-major `EXPERT_FC1_OUT x INPUT_DIM` matrix when transposing it.
///
/// # Panics
///
/// Panics if `expert_idx >= EXPERT_COUNT`, or if the embedded `weights.bin`
/// does not have the expected size.
///
/// # Examples
///
/// ```rust,ignore
/// use keyhog_scanner::ml_weights::expert_fc1_weight;
/// assert!(!expert_fc1_weight(0).is_empty());
/// ```
pub fn expert_fc1_weight(expert_idx: usize) -> &'static [f32] {
    // fc1 weights sit at the very start of the expert's region.
    load_f32_slice(expert_base_offset(expert_idx), EXPERT_FC1_W_COUNT)
}

/// Return the first dense-layer (fc1) bias vector of expert `expert_idx`.
///
/// The slice holds `EXPERT_FC1_B_COUNT` (`EXPERT_FC1_OUT`) values and directly
/// follows the fc1 weights inside the expert's parameter region.
///
/// # Panics
///
/// Panics if `expert_idx >= EXPERT_COUNT`, or if the embedded `weights.bin`
/// does not have the expected size.
///
/// # Examples
///
/// ```rust,ignore
/// use keyhog_scanner::ml_weights::expert_fc1_bias;
/// assert!(!expert_fc1_bias(0).is_empty());
/// ```
pub fn expert_fc1_bias(expert_idx: usize) -> &'static [f32] {
    // Fields stored ahead of the fc1 bias within one expert's region.
    const FIELD_OFFSET: usize = EXPERT_FC1_W_COUNT;

    let start = expert_base_offset(expert_idx) + FIELD_OFFSET;
    load_f32_slice(start, EXPERT_FC1_B_COUNT)
}

/// Return the second dense-layer (fc2) weights of expert `expert_idx`.
///
/// The slice holds `EXPERT_FC2_W_COUNT` (`EXPERT_FC1_OUT * EXPERT_FC2_OUT`)
/// values and follows the fc1 weights and fc1 bias inside the expert's
/// parameter region. `model()` treats it as a row-major
/// `EXPERT_FC2_OUT x EXPERT_FC1_OUT` matrix when transposing it.
///
/// # Panics
///
/// Panics if `expert_idx >= EXPERT_COUNT`, or if the embedded `weights.bin`
/// does not have the expected size.
///
/// # Examples
///
/// ```rust,ignore
/// use keyhog_scanner::ml_weights::expert_fc2_weight;
/// assert!(!expert_fc2_weight(0).is_empty());
/// ```
pub fn expert_fc2_weight(expert_idx: usize) -> &'static [f32] {
    // Fields stored ahead of the fc2 weights within one expert's region.
    const FIELD_OFFSET: usize = EXPERT_FC1_W_COUNT + EXPERT_FC1_B_COUNT;

    let start = expert_base_offset(expert_idx) + FIELD_OFFSET;
    load_f32_slice(start, EXPERT_FC2_W_COUNT)
}

/// Return the second dense-layer (fc2) bias vector of expert `expert_idx`.
///
/// The slice holds `EXPERT_FC2_B_COUNT` (`EXPERT_FC2_OUT`) values and directly
/// follows the fc2 weights inside the expert's parameter region.
///
/// # Panics
///
/// Panics if `expert_idx >= EXPERT_COUNT`, or if the embedded `weights.bin`
/// does not have the expected size.
///
/// # Examples
///
/// ```rust,ignore
/// use keyhog_scanner::ml_weights::expert_fc2_bias;
/// assert!(!expert_fc2_bias(0).is_empty());
/// ```
pub fn expert_fc2_bias(expert_idx: usize) -> &'static [f32] {
    // Fields stored ahead of the fc2 bias within one expert's region.
    const FIELD_OFFSET: usize = EXPERT_FC1_W_COUNT + EXPERT_FC1_B_COUNT + EXPERT_FC2_W_COUNT;

    let start = expert_base_offset(expert_idx) + FIELD_OFFSET;
    load_f32_slice(start, EXPERT_FC2_B_COUNT)
}

/// Return the output-layer (fc3) weights of expert `expert_idx`.
///
/// The slice holds `EXPERT_FC3_W_COUNT` (`EXPERT_FC2_OUT`) values — one weight
/// per fc2 output — and follows the fc2 bias inside the expert's parameter
/// region.
///
/// # Panics
///
/// Panics if `expert_idx >= EXPERT_COUNT`, or if the embedded `weights.bin`
/// does not have the expected size.
///
/// # Examples
///
/// ```rust,ignore
/// use keyhog_scanner::ml_weights::expert_fc3_weight;
/// assert!(!expert_fc3_weight(0).is_empty());
/// ```
pub fn expert_fc3_weight(expert_idx: usize) -> &'static [f32] {
    // Fields stored ahead of the fc3 weights within one expert's region.
    const FIELD_OFFSET: usize =
        EXPERT_FC1_W_COUNT + EXPERT_FC1_B_COUNT + EXPERT_FC2_W_COUNT + EXPERT_FC2_B_COUNT;

    let start = expert_base_offset(expert_idx) + FIELD_OFFSET;
    load_f32_slice(start, EXPERT_FC3_W_COUNT)
}

/// Return the output-layer (fc3) bias of expert `expert_idx`.
///
/// The slice holds `EXPERT_FC3_B_COUNT` values (a single element) and is the
/// last field of the expert's parameter region.
///
/// # Panics
///
/// Panics if `expert_idx >= EXPERT_COUNT`, or if the embedded `weights.bin`
/// does not have the expected size.
///
/// # Examples
///
/// ```rust,ignore
/// use keyhog_scanner::ml_weights::expert_fc3_bias;
/// assert!(!expert_fc3_bias(0).is_empty());
/// ```
pub fn expert_fc3_bias(expert_idx: usize) -> &'static [f32] {
    // Fields stored ahead of the fc3 bias within one expert's region.
    const FIELD_OFFSET: usize = EXPERT_FC1_W_COUNT
        + EXPERT_FC1_B_COUNT
        + EXPERT_FC2_W_COUNT
        + EXPERT_FC2_B_COUNT
        + EXPERT_FC3_W_COUNT;

    let start = expert_base_offset(expert_idx) + FIELD_OFFSET;
    load_f32_slice(start, EXPERT_FC3_B_COUNT)
}