// nornir 0.4.4 - Docs.rs

//! Shared embedder support — model constants, `ModelProfile`, and the
//! tokenize-independent post-processing (mean-pool + L2-normalize) used by
//! **both** the tract (`embed`) and ort (`embed_ort`) backends running the
//! same `jina-embeddings-v2-base-code` ONNX model.
//!
//! Keeping this in one place guarantees the two runtimes produce the *same*
//! `model_profile` and the same vectors for the same text, so embeddings made
//! by one backend are reusable by the other (the warehouse dedup key is
//! backend-agnostic).
//!
//! Compiled when either embed feature is on.

use super::store::ModelProfile;

/// Model identifier string recorded as `model_name` in the [`ModelProfile`]
/// built by [`profile`].
pub const MODEL_NAME: &str = "jinaai/jina-embeddings-v2-base-code";
/// Width of one hidden-state row and length of the pooled embedding returned
/// by [`pool_and_normalize`]; also recorded as `dim` in the profile.
pub const DIM: usize = 768;
/// jina v2 supports 8192 tokens; we truncate beyond this.
/// [`prepare_tokens`] applies this as the upper bound on sequence length.
pub const MAX_TOKENS: usize = 8192;

/// Where `build.rs` placed the weights, and their content hashes.
///
/// Captured at compile time from the `NORNIR_MODEL_DIR` environment variable.
/// [`model_dir`] uses it only as the last fallback.
pub const MODEL_DIR: &str = env!("NORNIR_MODEL_DIR");
/// Compile-time value of `NORNIR_MODEL_WEIGHTS_SHA`; recorded as
/// `weights_sha` in the profile.
pub const WEIGHTS_SHA: &str = env!("NORNIR_MODEL_WEIGHTS_SHA");
/// Compile-time value of `NORNIR_MODEL_TOKENIZER_SHA`; recorded as
/// `tokenizer_sha` in the profile.
pub const TOKENIZER_SHA: &str = env!("NORNIR_MODEL_TOKENIZER_SHA");

/// Resolve the model directory **at runtime** (the baked [`MODEL_DIR`] points at
/// the *building* user's `~/.cache`, which a service user — e.g. `nornir` —
/// can't read; that EACCES is exactly what broke server-side Vector.Search).
///
/// Precedence:
///   1. `$NORNIR_MODEL_DIR` when set and non-empty (runtime override — the
///      service install can set this). The value is taken as-is, without
///      checking that the directory exists, and does not have to be valid
///      Unicode.
///   2. `/opt/nornir/models` if it contains `tokenizer.json` (shared,
///      world-readable, mirrors the `/opt/nornir/cuda` convention). Only the
///      tokenizer file is probed; the weights are not checked here.
///   3. the build-time cache (interactive use on the building user's box).
pub fn model_dir() -> std::path::PathBuf {
    // `var_os` rather than `var`: `var` reports a non-Unicode value as an
    // error, which would silently discard a perfectly valid filesystem path.
    let runtime_override = std::env::var_os("NORNIR_MODEL_DIR").filter(|dir| !dir.is_empty());
    if let Some(dir) = runtime_override {
        return std::path::PathBuf::from(dir);
    }

    let shared = std::path::Path::new("/opt/nornir/models");
    if shared.join("tokenizer.json").exists() {
        return shared.to_path_buf();
    }

    std::path::PathBuf::from(MODEL_DIR)
}

/// Build the [`ModelProfile`] describing this embedder; both backends report
/// this same identity.
///
/// `weights_sha` is the SHA of whichever model artifact build.rs fetched (the
/// ONNX export). That one file feeds tract and ort alike, so the profile — and
/// with it the dedup key — is identical across backends.
pub fn profile() -> ModelProfile {
    let model_name = String::from(MODEL_NAME);
    let weights_sha = String::from(WEIGHTS_SHA);
    let tokenizer_sha = String::from(TOKENIZER_SHA);

    ModelProfile {
        model_name,
        weights_sha,
        tokenizer_sha,
        pooling: String::from("mean"),
        normalize: true,
        dim: DIM,
        dtype: String::from("f32"),
    }
}

/// Average the per-token rows of a flattened `[1, n_tokens, DIM]` row-major
/// hidden state, then scale the result to unit L2 length.
///
/// Element `(t, d)` of the hidden state lives at `hidden[t * DIM + d]`.
/// Returns a vector of length `DIM`. With `n_tokens == 0` the result is the
/// all-zero vector (the divisor is clamped to 1 and a zero vector is not
/// rescaled).
///
/// # Panics
///
/// Panics if `hidden` holds fewer than `n_tokens * DIM` values; in debug
/// builds any length mismatch trips the assertion.
pub fn pool_and_normalize(hidden: &[f32], n_tokens: usize) -> Vec<f32> {
    debug_assert_eq!(hidden.len(), n_tokens * DIM);

    // Sum the token rows in order, one row at a time.
    let mut pooled = (0..n_tokens).fold(vec![0f32; DIM], |mut sums, token| {
        let start = token * DIM;
        let row = &hidden[start..start + DIM];
        sums.iter_mut()
            .zip(row)
            .for_each(|(sum, &value)| *sum += value);
        sums
    });

    // Turn the sums into means; `max(1)` avoids dividing by zero.
    let scale = 1.0 / n_tokens.max(1) as f32;
    pooled.iter_mut().for_each(|component| *component *= scale);

    l2_normalize(&mut pooled);
    pooled
}

/// Scale `v` in place so its Euclidean (L2) length becomes 1.
///
/// If the computed length is not strictly positive — an all-zero or empty
/// slice, or a NaN length — `v` is left untouched.
pub fn l2_normalize(v: &mut [f32]) {
    let mut sum_of_squares = 0f32;
    for &component in v.iter() {
        sum_of_squares += component * component;
    }

    let length = sum_of_squares.sqrt();
    if length > 0.0 {
        v.iter_mut().for_each(|component| *component /= length);
    }
}

/// Convert raw token ids into the `(input_ids, attention_mask)` pair for the
/// ONNX model.
///
/// The ids are truncated to at most [`MAX_TOKENS`] and widened to `i64`. The
/// attention mask is all ones and has the same length as the ids.
///
/// The output always holds at least one token. Empty input yields a single
/// placeholder id `0` with mask `1`, where slicing an empty `raw` to length 1
/// used to panic.
pub fn prepare_tokens(raw: &[u32]) -> (Vec<i64>, Vec<i64>) {
    if raw.is_empty() {
        // NOTE(review): id 0 is a placeholder so the sequence is never
        // zero-length; confirm it matches the tokenizer's padding id.
        return (vec![0], vec![1]);
    }

    let n = raw.len().min(MAX_TOKENS);
    // `i64::from` is a lossless widening; every `u32` fits in an `i64`.
    let ids: Vec<i64> = raw[..n].iter().map(|&id| i64::from(id)).collect();
    let mask = vec![1i64; n];
    (ids, mask)
}