// nornir 0.4.24 - Docs.rs

// Embedding-**model** registry — the interchangeable set of models behind the
// [`super::store::Embedder`] trait (plan item G3).
//
// Historically the embedder was hard-wired to `jina-embeddings-v2-base-code`
// (768-dim). This registry makes the *model* a selectable axis: a small table
// of known models, each carrying everything the rest of the system needs —
// the HuggingFace repo + file paths (for `build.rs` to fetch), the output
// dimensionality (so the index + warehouse are dimension-aware), the max
// context, and a stable `profile_id` that namespaces the model in the
// `model_profile` warehouse key.
//
// ## Why this file is `include!`-able with **no deps**
//
// `build.rs` needs the *same* registry the crate uses (so the model it fetches
// is exactly the model the embedder loads). Build scripts can't depend on the
// crate's modules, so the registry is written as a self-contained, `std`-only
// table that `build.rs` pulls in via `include!`. Keep it that way: no `use` of
// sibling modules, no external crates — only `&'static str` / `usize` data and
// pure lookups.
//
// ## Selection
//
// [`selected`] resolves the active model from `$NORNIR_EMBED_MODEL` (a model
// `id`, e.g. `jina-v2-base-code`), falling back to [`DEFAULT_MODEL_ID`]. The
// **same** env var is read by `build.rs`, so the fetched weights and the
// loaded model never disagree.
//
// ## Re-embed awareness (IMPORTANT)
//
// Each model has a distinct `profile_id`, which flows into
// [`super::store::ModelProfile`] and thus into the warehouse `model_profile`
// key (a hash over `model_name` + weight/tokenizer SHAs + `dim` + …). **Vectors
// from two different models therefore land under different `model_profile`
// keys and never mix** — a search reconstructs only the snapshot whose
// `model_profile` matches the query embedder. The practical consequence:
// **switching `$NORNIR_EMBED_MODEL` requires a fresh `nornir vector index`** at
// the new model before semantic search returns results; old vectors aren't
// reused (different dim, different semantics) and aren't silently blended in.

/// A single entry in the embedding-model registry.
///
/// Every field is build-time-stable `&'static str` / `usize` data, which is
/// what lets the type be `Copy` and live in a `const` table. The runtime
/// weight/tokenizer SHAs (which also enter the `model_profile`) are *not*
/// stored here: they are resolved separately by `build.rs` and passed via
/// `rustc-env`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct EmbedModel {
    /// Short, stable selector — this is the value users put in
    /// `$NORNIR_EMBED_MODEL`.
    pub id: &'static str,
    /// Fully-qualified HuggingFace name (`org/name`). Recorded in the
    /// `ModelProfile` and shown in diagnostics.
    pub model_name: &'static str,
    /// HuggingFace repo the ONNX export and the tokenizer are fetched from.
    pub hf_repo: &'static str,
    /// Path of the ONNX model file, relative to the repo root.
    pub onnx_path: &'static str,
    /// Path of the tokenizer file, relative to the repo root.
    pub tokenizer_path: &'static str,
    /// Dimensionality of the output embedding. This is per-model, **not** a
    /// global constant: the index, the warehouse `dim` column, and the pooling
    /// all key off it.
    pub dim: usize,
    /// Maximum number of input tokens; longer inputs are truncated.
    pub max_tokens: usize,
    /// Name of this model's sub-directory under the model cache root. One
    /// directory per model lets several models coexist in the cache.
    pub cache_subdir: &'static str,
}

/// Default model id. Unchanged behaviour: jina-embeddings-v2-base-code, 768-dim.
///
/// NOTE(review): none of the functions in this file read this constant —
/// `default_model()` returns the first `MODELS` entry instead. The two agree
/// today (the first entry's `id` is this string); keep them in sync when
/// editing either.
pub const DEFAULT_MODEL_ID: &str = "jina-v2-base-code";

/// The known embedding models. The first entry is the default and the only one
/// `build.rs` fetches unless `$NORNIR_EMBED_MODEL` selects another. Additional
/// entries make the registry + selection plumbing real and dimension-aware even
/// before their weights are fetched.
///
/// Invariants to preserve when editing this table:
/// - Keep the default model first: `default_model()` returns entry `0`.
/// - Keep every `id` unique: `by_id` returns the first match, so a duplicate
///   id further down would be unreachable.
/// - Keep every `id` lower-case: `selected_id_from_env` lower-cases the env
///   value and `by_id` compares exactly, so an id containing upper-case
///   letters could never be selected through the environment.
pub const MODELS: &[EmbedModel] = &[
    // ---- default: code-specialised, 768-dim (QK-LayerNorm "code" arch) ----
    // This entry's `id` is the same string as `DEFAULT_MODEL_ID`.
    EmbedModel {
        id: "jina-v2-base-code",
        model_name: "jinaai/jina-embeddings-v2-base-code",
        hf_repo: "jinaai/jina-embeddings-v2-base-code",
        onnx_path: "onnx/model.onnx",
        tokenizer_path: "tokenizer.json",
        dim: 768,
        max_tokens: 8192,
        cache_subdir: "jina-embeddings-v2-base-code",
    },
    // ---- alt 1: smaller general-text model, 384-dim ----
    // Registry data only by default (not fetched unless selected). Demonstrates
    // a *different dimensionality* flowing through the index + warehouse key.
    EmbedModel {
        id: "minilm-l6-v2",
        model_name: "sentence-transformers/all-MiniLM-L6-v2",
        hf_repo: "sentence-transformers/all-MiniLM-L6-v2",
        onnx_path: "onnx/model.onnx",
        tokenizer_path: "tokenizer.json",
        dim: 384,
        max_tokens: 256,
        cache_subdir: "all-MiniLM-L6-v2",
    },
    // ---- alt 2: larger general-text model, 768-dim ----
    // Same dimensionality as the default but a different model, so it still
    // has its own `id` and its own `cache_subdir`.
    EmbedModel {
        id: "bge-base-en-v1.5",
        model_name: "BAAI/bge-base-en-v1.5",
        hf_repo: "BAAI/bge-base-en-v1.5",
        onnx_path: "onnx/model.onnx",
        tokenizer_path: "tokenizer.json",
        dim: 768,
        max_tokens: 512,
        cache_subdir: "bge-base-en-v1.5",
    },
];

/// Finds the registry entry whose `id` equals `id` exactly.
///
/// The comparison is case-sensitive and does no trimming. Returns `None` when
/// nothing in [`MODELS`] matches; the table is scanned front to back, so the
/// earliest matching entry is the one returned.
pub fn by_id(id: &str) -> Option<&'static EmbedModel> {
    for model in MODELS {
        if model.id == id {
            return Some(model);
        }
    }
    None
}

/// Returns the default model, i.e. the first entry of [`MODELS`].
///
/// This cannot fail in practice: the registry is a non-empty `const` table,
/// so the `expect` below only guards against someone emptying it.
pub fn default_model() -> &'static EmbedModel {
    MODELS
        .first()
        .expect("embedding model registry is non-empty by construction")
}

/// Reads the model id requested through `$NORNIR_EMBED_MODEL`.
///
/// The value is returned *normalised* — surrounding whitespace trimmed and
/// ASCII letters lower-cased — but it is **not** checked against the registry,
/// so callers can still produce a precise error for an unknown id.
///
/// Returns `None` (meaning "use the default") when the variable is unset,
/// empty or whitespace-only, or when `std::env::var` cannot read it as
/// Unicode.
pub fn selected_id_from_env() -> Option<String> {
    let raw = std::env::var("NORNIR_EMBED_MODEL").ok()?;
    let id = raw.trim();
    if id.is_empty() {
        None
    } else {
        Some(id.to_ascii_lowercase())
    }
}

/// Resolves the active embedding model.
///
/// If `$NORNIR_EMBED_MODEL` yields no id (see [`selected_id_from_env`]), the
/// default model is returned. Otherwise the id is looked up in the registry.
///
/// # Errors
///
/// Returns a human-readable message naming the rejected id and listing every
/// registered id when the environment names a model that is not in
/// [`MODELS`].
pub fn selected() -> Result<&'static EmbedModel, String> {
    // No (usable) env value: fall back to the default model.
    let requested = match selected_id_from_env() {
        Some(id) => id,
        None => return Ok(default_model()),
    };

    if let Some(model) = by_id(&requested) {
        return Ok(model);
    }

    // Unknown id: build the list of valid choices only on this failure path.
    let known_ids = MODELS.iter().map(|m| m.id).collect::<Vec<&str>>().join(", ");
    Err(format!(
        "NORNIR_EMBED_MODEL=`{requested}` is not a known embedding model; \
         known ids: {known_ids}"
    ))
}