// tokitai-operator 0.1.0

//! Architecture descriptor.
//!
//! The descriptor records the depth, hidden dim, expert count,
//! top-k, and the per-expert MLP shape. Serialized as
//! `arch.json`; loaded by `src/infer.rs` to reconstruct the
//! `MoEModel` for inference.
//!
// `ModelArch` — a serializable description of the model's
// architecture (NOT its trained weights). Pair this with a checkpoint
// of values to fully reconstruct a trained model.
//
// Fields are stored as a flat struct (no nested types) so the JSON
// is trivial to inspect and to merge with config blobs in the
// tokitai-search integration.
//
// The `seed` field captures the seed used to initialize the model —
// `build` re-runs the same deterministic initializer, so the resulting
// fresh model has bit-for-bit the same initial weights as a model
// built from scratch with the same arch. NOTE: `from_moe_model` cannot
// recover the original training seed (the `MoEModel` does not store
// it), so it sets `seed` to 0. This is fine for the common case
// of saving the arch once at training start, then loading it later
// to build a fresh model with the same architecture.
//
// `ModelKind` and `RouterKind` were added in the 3-way ablation
// refactor (sheaf+padic vs softmax-only vs dense). They are placed
// at the END of the struct so that arch.json files written by
// earlier versions (without these fields) deserialise cleanly via
// the `#[serde(default)]` attribute — see [`Default`] for
// `ModelKind` / `RouterKind` below. The historical
// sheaf+padic / MoE combination is the default, so old arch.json
// files describe the same architecture they always did once the
// new fields are filled in with their defaults.

use serde::{Deserialize, Serialize};

use crate::moe_model::{MoEModel, MoESize};
use crate::{Error, Result};

pub use super::dense::{DenseConfig, DenseModel, QualityModel};

/// Which model family this arch describes.
///
/// Two families exist: the MoE (1 router + `N_EXPERTS` experts) and
/// the Dense MLP stack (no router, no per-expert dispatch). The
/// Dense family exists for the 3-way ablation study that compares
/// MoE-with-sheaf+padic, MoE-with-softmax-only, and a Dense MLP
/// at matched parameter budget. See
/// `docs/tokitai-search/docs/COMPETITIVE_BENCHMARK_PLAN.md` for
/// the experimental design; this enum selects which branch an
/// arch belongs to.
///
/// [`ModelArch::build`] refuses `Dense` archs with an error;
/// [`ModelArch::build_quality_model`] handles both variants.
///
/// Append new variants at the END of this enum so existing variants
/// keep their position and arch.json files written by earlier
/// versions keep deserializing the same way.
/// NOTE(review): serde's default JSON representation identifies unit
/// variants by name, so ordering mainly matters for index-based
/// formats — confirm before relying on it.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum ModelKind {
    /// 1 router + N_EXPERTS experts (default; historical behavior).
    MoE,
    /// A single stack of FFN sub-blocks with no routing machinery
    /// and no MoE dispatch. Used as the "matched parameter budget"
    /// baseline in the ablation study.
    Dense,
}

impl Default for ModelKind {
    fn default() -> Self {
        // MoE is the historical default — the original arch.json
        // files (which lack an explicit `model_kind` field) must
        // deserialize to the same architecture they always did.
        ModelKind::MoE
    }
}

/// Which router kernel the MoE model uses.
///
/// Three router variants are defined; the ablation study compares
/// them against a Dense baseline at matched parameter budget:
///
/// - `SheafPadic`: the historical MoE router. Router output is
///   the post-mask softmax probabilities, but the router is
///   additionally constrained by the sheaf overlap-check (a
///   structural prior on which experts are allowed to be active
///   together) and the p-adic encode/decode (a precision prior
///   on the router weights). This is the variant in
///   `docs/PATENT_DISCLOSURE.md`.
/// - `SheafOnly`: sheaf overlap-check is on, p-adic encode/decode
///   is off. Useful for disentangling the contribution of each
///   structural prior in the ablation.
/// - `SoftmaxOnly`: pure post-mask softmax router. Both sheaf
///   and p-adic constraints are disabled (`overlap-check penalty
///   coefficient = 0`, no p-adic encode/decode). This is the
///   "no structural prior" baseline against which the sheaf+padic
///   and sheaf-only variants are compared.
///
/// The value is currently descriptive only: [`ModelArch::build`]
/// does not read `router_kind` when constructing the `MoEModel`.
///
/// For `ModelKind::Dense` the `router_kind` field is recorded in
/// the arch for reproducibility but is semantically unused (Dense
/// models have no router). The default value is `SheafPadic` for
/// historical arch.json compatibility.
///
/// Append new variants at the END of this enum so existing variants
/// keep their position and arch.json files written by earlier
/// versions keep deserializing the same way.
/// NOTE(review): serde's default JSON representation identifies unit
/// variants by name, so ordering mainly matters for index-based
/// formats — confirm before relying on it.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum RouterKind {
    /// Sheaf overlap-check + p-adic encode/decode (default;
    /// historical behavior).
    SheafPadic,
    /// Sheaf overlap-check only; p-adic encode/decode disabled.
    SheafOnly,
    /// Pure post-mask softmax router. Both sheaf and p-adic
    /// constraints are disabled.
    SoftmaxOnly,
}

impl Default for RouterKind {
    fn default() -> Self {
        // SheafPadic is the historical default — the original
        // arch.json files (which lack an explicit `router_kind`
        // field) must deserialize to the same architecture they
        // always did.
        RouterKind::SheafPadic
    }
}

/// A self-describing snapshot of the model architecture (not its
/// trained weights).
///
/// For MoE archs the `input_dim` / `hidden_dim` / `output_dim` /
/// `n_experts` / `top_k` / `expert_depth` fields are redundant with
/// `size` (they're derivable from `size.dims()` and the topology
/// constants), but we keep them explicit so that:
///
///   1. A loaded arch is fully self-describing without needing to
///      recompile against the same `topology` constants.
///   2. `build` can reject malformed arch files where the explicit
///      dims don't agree with the size variant.
///
/// For Dense archs (see [`ModelArch::from_dense`]) the same fields
/// are reused with Dense meanings, documented per field below.
///
/// Field order is part of the on-disk layout: `model_kind` and
/// `router_kind` were appended last and carry `#[serde(default)]`
/// so files that predate them still deserialize.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ModelArch {
    /// Size variant (Tiny / Medium / Full). For Dense archs this is
    /// a `MoESize::Tiny` placeholder that the Dense build path does
    /// not consult.
    pub size: MoESize,
    /// Raw input feature dim (default: 96).
    pub input_dim: usize,
    /// Hidden dim of each expert's MLP body (matches `size.dims().0`).
    /// For Dense archs: the residual stream width.
    pub hidden_dim: usize,
    /// Output dim of each expert's head (matches the topology's
    /// `OUT_DIM`, default: 20).
    pub output_dim: usize,
    /// Number of experts in the MoE (topology `N_EXPERTS`, default: 4).
    /// Dense archs store the sentinel 0 ("no MoE dispatch").
    pub n_experts: usize,
    /// Top-K routing fan-out (topology `TOP_K`, default: 2).
    /// Dense archs store the sentinel 0 ("no router").
    pub top_k: usize,
    /// Number of hidden MLP blocks per expert (matches
    /// `size.dims().1`). We call this "depth" to mirror the
    /// `docs/MOE_DESIGN.md` vocabulary. For Dense archs: the number
    /// of FFN sub-blocks in the body.
    pub expert_depth: usize,
    /// RNG seed used to initialize parameters in [`ModelArch::build`].
    /// `from_moe_model` sets this to 0 because the live `MoEModel`
    /// does not retain its construction seed.
    pub seed: u64,
    /// Model family selector. New arch.json files (post-ablation
    /// refactor) write this field; older files that lack it
    /// deserialize to `ModelKind::MoE`, because `#[serde(default)]`
    /// fills the field from `ModelKind::default()`. See [`ModelKind`].
    #[serde(default)]
    pub model_kind: ModelKind,
    /// Router kernel selector. New arch.json files write this
    /// field; older files that lack it deserialize to
    /// `RouterKind::SheafPadic`, because `#[serde(default)]` fills
    /// the field from `RouterKind::default()`. See [`RouterKind`].
    /// Unused for `ModelKind::Dense` (recorded for reproducibility
    /// only).
    #[serde(default)]
    pub router_kind: RouterKind,
}

impl Default for ModelArch {
    /// The default arch is `from_size(MoESize::Tiny, 0)`: the Tiny
    /// size variant, seed 0, and whatever `model_kind` /
    /// `router_kind` that constructor assigns (the historical
    /// `ModelKind::MoE` + `RouterKind::SheafPadic` pair).
    fn default() -> Self {
        let (size, seed) = (MoESize::Tiny, 0);
        Self::from_size(size, seed)
    }
}

impl ModelArch {
    /// Extract the arch from a live `MoEModel`. The returned `seed`
    /// is always 0 (the live model does not store the seed it was
    /// built with); if you need bit-for-bit reproducibility of the
    /// initial weights, store the original seed out-of-band and
    /// overwrite the `seed` field on the returned arch.
    ///
    /// The new `model_kind` / `router_kind` fields are set to their
    /// historical defaults (`MoE` / `SheafPadic`) because the live
    /// `MoEModel` does not record which kernel it was built with.
    /// If the live model was actually built with `RouterKind::SoftmaxOnly`
    /// (e.g. via the `tiny_softmax_moe` factory), the caller must
    /// overwrite the `router_kind` field on the returned arch.
    pub fn from_moe_model(model: &MoEModel) -> Self {
        let (hidden, depth) = model.size.dims();
        Self {
            size: model.size,
            input_dim: crate::moe_model::topology::IN_DIM,
            hidden_dim: hidden,
            output_dim: crate::moe_model::topology::OUT_DIM,
            n_experts: crate::moe_model::topology::N_EXPERTS,
            top_k: crate::moe_model::topology::TOP_K,
            expert_depth: depth,
            seed: 0,
            model_kind: ModelKind::MoE,
            router_kind: RouterKind::SheafPadic,
        }
    }

    /// Build a `ModelArch` directly from a `MoESize` + seed, without
    /// needing to construct a live `MoEModel`. This is the right
    /// constructor for code paths that want to *describe* the
    /// architecture (e.g., writing `arch.json` in a dry-run training
    /// loop, or for the `Full` size where actually building the
    /// model would allocate ~10GB of AdamW state). The returned
    /// arch is otherwise identical to `from_moe_model` for the same
    /// `size`.
    pub fn from_size(size: MoESize, seed: u64) -> Self {
        let (hidden, depth) = size.dims();
        Self {
            size,
            input_dim: crate::moe_model::topology::IN_DIM,
            hidden_dim: hidden,
            output_dim: crate::moe_model::topology::OUT_DIM,
            n_experts: crate::moe_model::topology::N_EXPERTS,
            top_k: crate::moe_model::topology::TOP_K,
            expert_depth: depth,
            seed,
            model_kind: ModelKind::MoE,
            router_kind: RouterKind::SheafPadic,
        }
    }

    /// Describe a `DenseModel` built from `cfg` and `seed`.
    ///
    /// Every arch.json carries the same field set, so the
    /// MoE-flavoured fields are reused with Dense meanings:
    ///
    /// - `size` is a `MoESize::Tiny` placeholder; the Dense build
    ///   path does not consult it.
    /// - `hidden_dim` holds the residual stream width
    ///   (`cfg.hidden_dim`), not the wider FFN intermediate.
    /// - `expert_depth` holds the FFN sub-block count (`cfg.n_blocks`).
    /// - `n_experts` and `top_k` are the sentinel 0 (no dispatch,
    ///   no router).
    /// - `router_kind` is recorded as `SheafPadic`, the historical
    ///   fallback, even though Dense models have no router.
    ///
    /// Note that `cfg.intermediate` is NOT recorded: the arch has no
    /// field for it, and `build_quality_model` always rebuilds Dense
    /// models with an intermediate of 4096.
    pub fn from_dense(cfg: &DenseConfig, seed: u64) -> Self {
        // Sentinels that mark the arch as "no MoE dispatch / no router".
        let no_experts = 0;
        let no_top_k = 0;
        // Dense meanings of the shared shape fields.
        let residual_width = cfg.hidden_dim;
        let n_sub_blocks = cfg.n_blocks;

        Self {
            model_kind: ModelKind::Dense,
            router_kind: RouterKind::SheafPadic,
            size: MoESize::Tiny,
            input_dim: cfg.input_dim,
            hidden_dim: residual_width,
            output_dim: cfg.output_dim,
            n_experts: no_experts,
            top_k: no_top_k,
            expert_depth: n_sub_blocks,
            seed,
        }
    }

    /// Reconstruct a fresh `MoEModel` from this arch. The new model
    /// has the documented architecture and the same initial weights
    /// that a direct `MoEModel::new(size, seed)` call would produce
    /// — i.e., a freshly seeded model, NOT the trained weights.
    /// Trained weights come from a separate checkpoint file.
    ///
    /// Returns an error if the explicit dim fields disagree with
    /// `size.dims()` or the topology constants (catches typos in
    /// hand-edited arch JSON), or if the arch describes a Dense
    /// model (use [`ModelArch::build_quality_model`] for that).
    pub fn build(&self) -> Result<MoEModel> {
        if self.model_kind == ModelKind::Dense {
            return Err(Error::backend(format!(
                "arch: build() refuses ModelKind::Dense; use build_quality_model() \
                 to construct the matching DenseModel (size={:?}, \
                 model_kind={:?}, router_kind={:?})",
                self.size, self.model_kind, self.router_kind
            )));
        }
        let (hidden, depth) = self.size.dims();
        if self.hidden_dim != hidden {
            return Err(Error::backend(format!(
                "arch: hidden_dim {} does not match size {:?} dims {}",
                self.hidden_dim, self.size, hidden
            )));
        }
        if self.expert_depth != depth {
            return Err(Error::backend(format!(
                "arch: expert_depth {} does not match size {:?} dims {}",
                self.expert_depth, self.size, depth
            )));
        }
        if self.input_dim != crate::moe_model::topology::IN_DIM {
            return Err(Error::backend(format!(
                "arch: input_dim {} does not match topology IN_DIM {}",
                self.input_dim,
                crate::moe_model::topology::IN_DIM
            )));
        }
        if self.output_dim != crate::moe_model::topology::OUT_DIM {
            return Err(Error::backend(format!(
                "arch: output_dim {} does not match topology OUT_DIM {}",
                self.output_dim,
                crate::moe_model::topology::OUT_DIM
            )));
        }
        if self.n_experts != crate::moe_model::topology::N_EXPERTS {
            return Err(Error::backend(format!(
                "arch: n_experts {} does not match topology N_EXPERTS {}",
                self.n_experts,
                crate::moe_model::topology::N_EXPERTS
            )));
        }
        if self.top_k != crate::moe_model::topology::TOP_K {
            return Err(Error::backend(format!(
                "arch: top_k {} does not match topology TOP_K {}",
                self.top_k,
                crate::moe_model::topology::TOP_K
            )));
        }
        // Note: the `router_kind` field is *recorded* in the arch
        // for reproducibility, but the existing `MoEModel::new`
        // path always uses the sheaf+padic-flavoured router
        // topology. The SoftmaxOnly variant is forward-looking:
        // once the router's forward logic can be parameterised
        // on `RouterKind`, this `build` will dispatch on
        // `self.router_kind` to pick the right kernel. For now
        // the field is logged-and-ignored (still a valid arch).
        Ok(MoEModel::new(self.size, self.seed))
    }

    /// Reconstruct a fresh `QualityModel` (enum wrapping `MoEModel`
    /// or `DenseModel`) from this arch. Callers that must support
    /// both model families should use this; [`ModelArch::build`]
    /// only supports the MoE family.
    ///
    /// - `ModelKind::MoE`: delegates to [`ModelArch::build`] and
    ///   wraps the result in `QualityModel::MoE`. Any error from
    ///   `build` is returned unchanged, and `router_kind` is ignored
    ///   just as it is there.
    /// - `ModelKind::Dense`: requires the `n_experts == 0` and
    ///   `top_k == 0` sentinels, then builds a `DenseConfig` from
    ///   `input_dim`, `hidden_dim` (body width), `output_dim` and
    ///   `expert_depth` (sub-block count) and constructs a
    ///   `DenseModel` with `self.seed`.
    ///
    /// The Dense `intermediate` dim is always 4096 (the canonical
    /// `tiny_dense` value). The on-disk schema has no field for it,
    /// so a non-4096 intermediate cannot be expressed — and therefore
    /// cannot be detected or rejected — here; a follow-up is expected
    /// to widen the schema.
    ///
    /// # Errors
    ///
    /// Returns `Error::backend` if a Dense arch has a non-zero
    /// `n_experts` or `top_k` (checked in that order), or whatever
    /// [`ModelArch::build`] returns for an MoE arch.
    pub fn build_quality_model(&self) -> Result<QualityModel> {
        match self.model_kind {
            ModelKind::MoE => self.build().map(QualityModel::MoE),
            ModelKind::Dense => {
                // Both routing fields must hold the Dense sentinel 0.
                let sentinels = [("n_experts", self.n_experts), ("top_k", self.top_k)];
                for &(field, got) in &sentinels {
                    if got != 0 {
                        return Err(Error::backend(format!(
                            "arch: ModelKind::Dense requires {}=0, got {}",
                            field, got
                        )));
                    }
                }

                let dense_cfg = DenseConfig {
                    input_dim: self.input_dim,
                    hidden_dim: self.hidden_dim,
                    // Not carried by arch.json; fixed at the
                    // canonical `tiny_dense` intermediate.
                    intermediate: 4096,
                    output_dim: self.output_dim,
                    n_blocks: self.expert_depth,
                };
                let dense = DenseModel::new(dense_cfg, self.seed);
                Ok(QualityModel::Dense(dense))
            }
        }
    }
}