ripvec-core 1.0.4

//! In-process reimplementation of the Model2Vec static embedder.
//!
//! Replaces the `model2vec-rs` 0.2 dependency. Reasons:
//!
//! 1. **Parallelism**: `model2vec_rs::StaticModel::encode_with_args` runs
//!    `pool_ids` in a serial inner loop and calls `tokenizers::Tokenizer::encode_batch_fast`
//!    (which spawns its own rayon pool internally). Calling that path
//!    from inside an outer rayon `par_chunks` produced ~60% `__psynch_cvwait`
//!    in our linux-corpus profile — nested rayon scopes parking on each
//!    other. This implementation: tokenize in fixed-size sub-batches
//!    (`BATCH_SIZE` texts per `encode_batch_fast` call), then mean-pool
//!    each sub-batch's encodings in parallel via a single `par_iter`.
//!    The two parallel phases run back to back. No nesting.
//!
//! 2. **ndarray version**: model2vec-rs pinned `ndarray 0.15`; ripvec-core
//!    uses `ndarray 0.17`. The two `Array2<f32>` types are not
//!    interchangeable. Owning the load path here lets us use the workspace
//!    ndarray directly.
//!
//! 3. **Allocator pressure**: model2vec-rs builds intermediate
//!    `Vec<String>` clones inside `encode_with_args`. The local
//!    implementation tokenizes from `&[&str]` references directly.
//!
//! The file format is the published Model2Vec layout (tokenizer.json +
//! model.safetensors + config.json). Local paths only — if Hub download
//! is needed, pre-stage the files via `curl` (see
//! `crates/ripvec-core/tests/ripvec_port_parity.rs` for the recipe).
//!
//! ## Behavioural parity
//!
//! Identical math to `model2vec_rs::StaticModel::encode_with_args`:
//!
//! - Truncate input strings by char count = `max_tokens * median_token_length`
//!   (HF tokenizers can be slow on huge strings).
//! - Tokenize via `tokenizers::Tokenizer::encode_batch_fast`.
//! - Drop UNK tokens.
//! - Truncate token ID list to `max_tokens`.
//! - Pool: for each token, look up the embedding row (optionally remapped
//!   via `token_mapping`), scale by the per-token weight (default 1.0),
//!   accumulate.
//! - Divide by token count; L2-normalize if `normalize` is set.
//!
//! Verified by the integration test
//! `crates/ripvec-core/tests/ripvec_port_parity.rs` which exercises the
//! end-to-end pipeline against `minishlab/potion-code-16M`.

use std::path::Path;

use anyhow::{Context, Result, anyhow};
use ndarray::Array2;
use rayon::prelude::*;
use safetensors::SafeTensors;
use safetensors::tensor::Dtype;
use serde_json::Value;
use tokenizers::Tokenizer;
use wide::f32x8;

/// Default token cap per chunk during embedding. Matches the
/// `model2vec_rs` default; CodeChunks are typically far below this.
///
/// The encode paths in this file use it twice per text: multiplied by
/// the median token length it forms the pre-tokenization character
/// budget, and on its own it is the hard cap on the filtered token-id
/// list.
pub const DEFAULT_MAX_TOKENS: usize = 512;

/// Tokenize sub-batch size used inside [`StaticEmbedModel::encode_batch`].
///
/// `tokenizers::encode_batch_fast` parallelizes internally via rayon.
/// One giant call across the full corpus dominates wall time in
/// `Encoding` allocation + internal chunk scheduling; 1024 mirrors
/// `model2vec_rs`'s internal default and measured noticeably faster
/// on a 92K-file linux-source corpus.
///
/// The input slice is split with `chunks(BATCH_SIZE)`, so the final
/// sub-batch may hold fewer texts than this.
const BATCH_SIZE: usize = 1024;

/// Loaded Model2Vec static embedder.
///
/// Constructed via [`StaticEmbedModel::from_path`]. Use
/// [`encode_query`](Self::encode_query) for a single text and
/// [`encode_batch`](Self::encode_batch) for many — the batch path is
/// where the parallel-pool win lives.
///
/// All fields are private and only read after construction.
pub struct StaticEmbedModel {
    /// Tokenizer used by both encode paths; loaded with padding and
    /// truncation disabled.
    tokenizer: Tokenizer,
    /// `(vocab_size, hidden_dim)` row-major embedding table.
    embeddings: Array2<f32>,
    /// Per-token scalar weight (typically present in quantized models).
    /// `None` means use 1.0 for every token. Indexed by the raw token id
    /// (not the remapped row index); ids past the end also fall back to 1.0.
    weights: Option<Vec<f32>>,
    /// Optional remap from token-id → embedding-row index.
    /// `None` means use the token-id directly. Token ids past the end of
    /// the remap table are likewise used directly as the row index.
    token_mapping: Option<Vec<usize>>,
    /// Whether to L2-normalize the pooled output. Read from `config.json`.
    normalize: bool,
    /// Median bytes-per-token across the tokenizer vocab. Used for the
    /// char-level truncation heuristic (avoids pathological tokenization
    /// of multi-MB strings). Note the unit mix: the value is measured in
    /// bytes but applied as a character count when truncating.
    median_token_length: usize,
    /// Token id to drop after tokenization (typically the BPE
    /// `[UNK]`/`<unk>` id). `None` if the tokenizer has no unk token.
    unk_token_id: Option<usize>,
}

impl StaticEmbedModel {
    /// Build a model from the three Model2Vec files inside directory `path`.
    ///
    /// Reads `tokenizer.json`, `model.safetensors`, and `config.json`
    /// (in that order) fully into memory and hands the bytes to
    /// [`from_bytes`](Self::from_bytes).
    ///
    /// `normalize_override` is forwarded unchanged: `Some(flag)` forces
    /// L2 normalization on or off, `None` defers to `config.json`.
    ///
    /// # Errors
    ///
    /// Fails if any of the three files cannot be read, or if
    /// [`from_bytes`](Self::from_bytes) rejects their contents.
    pub fn from_path(path: &Path, normalize_override: Option<bool>) -> Result<Self> {
        // One loader for all three files; each keeps its own failure message.
        let load = |file_name: &str, failure: &'static str| -> Result<Vec<u8>> {
            std::fs::read(path.join(file_name)).context(failure)
        };
        let tokenizer_json = load("tokenizer.json", "read tokenizer.json failed")?;
        let safetensors_blob = load("model.safetensors", "read model.safetensors failed")?;
        let config_json = load("config.json", "read config.json failed")?;
        Self::from_bytes(&tokenizer_json, &safetensors_blob, &config_json, normalize_override)
    }

    /// Load from in-memory bytes (e.g., for embedded resources or tests).
    ///
    /// Steps, in order:
    ///
    /// 1. Parse the tokenizer and switch off its padding/truncation layer.
    /// 2. Read `normalize` from `config.json` (defaults to `true` when the
    ///    key is absent or not a boolean); `normalize_override` wins if set.
    /// 3. Decode the embedding table (`F32`, `F16`, or `I8`) into an
    ///    `f32` matrix.
    /// 4. Decode the optional `weights` (`F64`/`F32`/`F16`) and `mapping`
    ///    (`I32`/`I64`) tensors.
    /// 5. Derive the median token length and unk id from the tokenizer.
    ///
    /// # Errors
    ///
    /// Returns an error when the tokenizer, config, or safetensors payload
    /// fails to parse; when no embedding tensor is found or it is not 2-D;
    /// when the embedding, `weights`, or `mapping` tensor has an unsupported
    /// dtype; or when `mapping` holds a value that is not a valid `usize`
    /// row index (e.g. a negative number).
    pub fn from_bytes(
        tokenizer_bytes: &[u8],
        model_bytes: &[u8],
        config_bytes: &[u8],
        normalize_override: Option<bool>,
    ) -> Result<Self> {
        let mut tokenizer = Tokenizer::from_bytes(tokenizer_bytes)
            .map_err(|e| anyhow!("tokenizer load failed: {e}"))?;
        // Disable padding/truncation. The published Model2Vec tokenizer
        // configs (e.g. minishlab/potion-code-16M) set
        // `padding.strategy = "BatchLongest"`, which causes
        // `encode_batch_fast` to pad every encoding in a batch up to
        // the longest. On a 250K-item batch this dominates wall time —
        // we measured 33s+ in `Encoding::pad` and 70% cvar parking
        // before disabling. We do our own per-token filtering and
        // length cap inside `pool_ids`/`filter_ids`, so the tokenizer's
        // pad/trunc layer is pure overhead.
        //
        // The `Result` from `with_truncation` is deliberately discarded:
        // we are only clearing the setting.
        tokenizer.with_padding(None).with_truncation(None).ok();

        let cfg: Value = serde_json::from_slice(config_bytes).context("config.json parse")?;
        let cfg_norm = cfg
            .get("normalize")
            .and_then(Value::as_bool)
            .unwrap_or(true);
        let normalize = normalize_override.unwrap_or(cfg_norm);

        let safet = SafeTensors::deserialize(model_bytes).context("safetensors deserialize")?;

        // The embedding tensor is named "embeddings" in canonical
        // Model2Vec packs, "0" in some sentence-transformers exports,
        // and "embedding.weight" in older variants. Try in that order.
        let embed_tensor = safet
            .tensor("embeddings")
            .or_else(|_| safet.tensor("0"))
            .or_else(|_| safet.tensor("embedding.weight"))
            .map_err(|_| anyhow!("embeddings tensor not found in safetensors"))?;
        let [rows, cols]: [usize; 2] = embed_tensor
            .shape()
            .try_into()
            .map_err(|_| anyhow!("embedding tensor is not 2-D"))?;
        let raw = embed_tensor.data();
        let floats: Vec<f32> = match embed_tensor.dtype() {
            Dtype::F32 => raw
                .chunks_exact(4)
                .map(|b| f32::from_le_bytes([b[0], b[1], b[2], b[3]]))
                .collect(),
            Dtype::F16 => raw
                .chunks_exact(2)
                .map(|b| half::f16::from_le_bytes([b[0], b[1]]).to_f32())
                .collect(),
            Dtype::I8 => raw.iter().map(|&b| f32::from(b.cast_signed())).collect(),
            other => return Err(anyhow!("unsupported embedding dtype: {other:?}")),
        };
        // `from_shape_vec` also validates that `rows * cols` matches the
        // number of decoded values.
        let embeddings = Array2::from_shape_vec((rows, cols), floats)
            .context("embedding matrix shape mismatch")?;

        // Optional "weights" tensor (per-token scales, in some packs).
        // An unsupported dtype is a load error rather than a silent
        // fallback to "no weights": dropping the scales would change every
        // pooled vector without any signal to the caller.
        let weights = match safet.tensor("weights") {
            Ok(t) => {
                let raw = t.data();
                let decoded: Vec<f32> = match t.dtype() {
                    Dtype::F64 => raw
                        .chunks_exact(8)
                        .map(|b| {
                            // Per-token weights only need f32 precision; f64
                            // values in published Model2Vec packs are
                            // always small constants well within f32 range.
                            #[expect(
                                clippy::cast_possible_truncation,
                                reason = "weights are bounded; f32 precision is sufficient downstream"
                            )]
                            let v = f64::from_le_bytes([
                                b[0], b[1], b[2], b[3], b[4], b[5], b[6], b[7],
                            ]) as f32;
                            v
                        })
                        .collect(),
                    Dtype::F32 => raw
                        .chunks_exact(4)
                        .map(|b| f32::from_le_bytes([b[0], b[1], b[2], b[3]]))
                        .collect(),
                    Dtype::F16 => raw
                        .chunks_exact(2)
                        .map(|b| half::f16::from_le_bytes([b[0], b[1]]).to_f32())
                        .collect(),
                    other => return Err(anyhow!("unsupported weights dtype: {other:?}")),
                };
                Some(decoded)
            }
            Err(_) => None,
        };

        // Optional "mapping" tensor (token-id → embedding row).
        // The dtype is checked before decoding so a 64-bit mapping is not
        // misread as pairs of 32-bit values, and every entry must convert
        // to `usize` — a negative entry is rejected here instead of
        // wrapping to a huge index that pooling would silently skip.
        let token_mapping = match safet.tensor("mapping") {
            Ok(t) => {
                let raw = t.data();
                let decoded: Vec<usize> = match t.dtype() {
                    Dtype::I32 => raw
                        .chunks_exact(4)
                        .map(|b| usize::try_from(i32::from_le_bytes([b[0], b[1], b[2], b[3]])))
                        .collect::<Result<Vec<usize>, _>>()
                        .map_err(|_| {
                            anyhow!("mapping tensor contains a negative or out-of-range row index")
                        })?,
                    Dtype::I64 => raw
                        .chunks_exact(8)
                        .map(|b| {
                            usize::try_from(i64::from_le_bytes([
                                b[0], b[1], b[2], b[3], b[4], b[5], b[6], b[7],
                            ]))
                        })
                        .collect::<Result<Vec<usize>, _>>()
                        .map_err(|_| {
                            anyhow!("mapping tensor contains a negative or out-of-range row index")
                        })?,
                    other => return Err(anyhow!("unsupported mapping dtype: {other:?}")),
                };
                Some(decoded)
            }
            Err(_) => None,
        };

        let (median_token_length, unk_token_id) = compute_metadata(&tokenizer)?;

        Ok(Self {
            tokenizer,
            embeddings,
            weights,
            token_mapping,
            normalize,
            median_token_length,
            unk_token_id,
        })
    }

    /// Width of one embedding row, i.e. the column count of the
    /// embedding table.
    #[must_use]
    pub fn hidden_dim(&self) -> usize {
        let (_rows, cols) = self.embeddings.dim();
        cols
    }

    /// Embed one text and return its pooled row vector.
    ///
    /// This is the query-time path: the text is clipped to the character
    /// budget, tokenized on its own with `encode_fast` (no batch setup),
    /// stripped of unk ids, capped at [`DEFAULT_MAX_TOKENS`], and pooled.
    ///
    /// If tokenization fails the result is an all-zero vector of length
    /// [`hidden_dim`](Self::hidden_dim) rather than an error.
    pub fn encode_query(&self, text: &str) -> Vec<f32> {
        let clipped = truncate_chars(text, DEFAULT_MAX_TOKENS, self.median_token_length);
        match self.tokenizer.encode_fast(clipped, false) {
            Ok(encoding) => {
                let kept = filter_ids(encoding.get_ids(), self.unk_token_id, DEFAULT_MAX_TOKENS);
                self.pool_ids(&kept)
            }
            Err(_) => vec![0.0; self.hidden_dim()],
        }
    }

    /// Encode a batch of texts, returning one vector per input in input
    /// order.
    ///
    /// Iterates over fixed-size sub-batches (`BATCH_SIZE = 1024`), each
    /// tokenized via `encode_batch_fast` (parallel internally inside
    /// tokenizers) and then mean-pooled via `par_iter` on the rayon
    /// pool. Calling one giant `encode_batch_fast` on a 250K-item
    /// corpus dominates wall time in `Encoding` allocation + internal
    /// chunk scheduling; the 1024-batch shape mirrors
    /// `model2vec_rs`'s internal default and measured noticeably
    /// faster on a 92K-file linux-source corpus.
    ///
    /// Texts are handed to the tokenizer as borrowed, pre-truncated
    /// `&str` slices; no per-text `String` copy is made.
    ///
    /// If tokenizing a sub-batch fails, every text in that sub-batch
    /// gets an all-zero vector and processing continues with the next
    /// sub-batch. An empty `texts` yields an empty result.
    pub fn encode_batch(&self, texts: &[&str]) -> Vec<Vec<f32>> {
        if texts.is_empty() {
            return Vec::new();
        }
        let mut out: Vec<Vec<f32>> = Vec::with_capacity(texts.len());
        for chunk in texts.chunks(BATCH_SIZE) {
            // Truncation only narrows the borrowed slice, so the tokenizer
            // can read straight from the caller's strings.
            let truncated: Vec<&str> = chunk
                .iter()
                .map(|&text| truncate_chars(text, DEFAULT_MAX_TOKENS, self.median_token_length))
                .collect();
            let Ok(encodings) = self.tokenizer.encode_batch_fast::<&str>(truncated, false) else {
                out.extend(std::iter::repeat_n(
                    vec![0.0; self.hidden_dim()],
                    chunk.len(),
                ));
                continue;
            };
            let pooled: Vec<Vec<f32>> = encodings
                .par_iter()
                .map(|enc| {
                    let ids = filter_ids(enc.get_ids(), self.unk_token_id, DEFAULT_MAX_TOKENS);
                    self.pool_ids(&ids)
                })
                .collect();
            out.extend(pooled);
        }
        out
    }

    /// Average the embedding rows selected by `ids` into a single vector.
    ///
    /// For each token id: resolve the row (through `token_mapping` when
    /// present and the id is in range, otherwise the id itself), skip it
    /// if the row lies outside the table, then add the row — scaled by
    /// the token's weight when one exists — into the running sum. The sum
    /// is divided by the number of rows actually added (at least 1, so
    /// empty input yields zeros) and, when `normalize` is set, rescaled
    /// to unit L2 length with the norm floored at `1e-12`.
    ///
    /// This is the hot kernel of the encoder: the accumulation is
    /// O(tokens × hidden_dim) and is hand-vectorized with `wide::f32x8`.
    /// For `potion-code-16M` (hidden_dim = 256) that is 32 8-wide adds
    /// per token instead of 256 scalar adds.
    ///
    /// NOTE(review): the previous doc quoted "3.5% self" alongside
    /// "~38s of 104s wall" for this kernel; those two figures do not
    /// agree with each other — confirm which one is right.
    ///
    /// The function itself is serial; callers parallelize across texts.
    ///
    /// # Panics
    ///
    /// Panics if the embedding matrix is not in standard (contiguous)
    /// layout, which `from_shape_vec` at load time rules out.
    fn pool_ids(&self, ids: &[u32]) -> Vec<f32> {
        let width = self.hidden_dim();
        let row_count = self.embeddings.nrows();
        // Flat view of the table so a row is a plain sub-slice.
        let table = self
            .embeddings
            .as_slice()
            .expect("embedding matrix is non-contiguous; static_model load invariant violated");
        let remap = self.token_mapping.as_deref();
        let token_weights = self.weights.as_deref();

        let mut pooled = vec![0.0_f32; width];
        let mut used: usize = 0;
        for token in ids.iter().map(|&id| id as usize) {
            let row_idx = remap
                .and_then(|m| m.get(token).copied())
                .unwrap_or(token);
            if row_idx >= row_count {
                // No such row in the table: the token contributes nothing
                // and is not counted in the mean.
                continue;
            }
            let offset = row_idx * width;
            let row = &table[offset..offset + width];
            // Weights are keyed by the raw token id, not the remapped row.
            let scale = token_weights
                .and_then(|w| w.get(token).copied())
                .unwrap_or(1.0);
            // Exact equality is deliberate: only a weight that is
            // precisely 1.0 may take the unscaled path; anything merely
            // close to 1.0 must still be multiplied in.
            #[expect(
                clippy::float_cmp,
                reason = "bit-exact 1.0 check is the intended fast-path gate"
            )]
            let unit_weight = scale == 1.0;
            if unit_weight {
                accumulate_f32x8(&mut pooled, row);
            } else {
                accumulate_scaled_f32x8(&mut pooled, row, scale);
            }
            used += 1;
        }

        let inv_count = 1.0 / used.max(1) as f32;
        scale_in_place_f32x8(&mut pooled, inv_count);
        if self.normalize {
            let inv_norm = 1.0 / l2_norm_f32x8(&pooled).max(1e-12);
            scale_in_place_f32x8(&mut pooled, inv_norm);
        }
        pooled
    }
}

/// Return the longest prefix of `s` holding at most
/// `max_tokens * median_len` characters (the product saturates instead
/// of overflowing).
///
/// The cut always lands on a `char` boundary, so the result is valid
/// UTF-8. Strings already within the budget are returned unchanged.
/// This is the pre-tokenization safety cap: BPE on a multi-MB string is
/// pathological.
fn truncate_chars(s: &str, max_tokens: usize, median_len: usize) -> &str {
    let char_budget = max_tokens.saturating_mul(median_len);
    // The char at position `char_budget` (if any) is the first one to drop;
    // its byte offset is where the kept prefix ends.
    match s.char_indices().nth(char_budget) {
        Some((cut, _)) => &s[..cut],
        None => s,
    }
}

// ---------------------------------------------------------------------------
// SIMD pool kernels.
//
// All four helpers process `f32x8` blocks (8 lanes) followed by a scalar
// tail for `len % 8`. f32x8 maps to two NEON `float32x4_t` registers on
// aarch64 and one AVX2 `__m256` register on x86_64; portable via the `wide`
// crate. The weighted accumulator and the L2 norm use `mul_add`, which
// lowers to FMA where available (vfmaq_f32 / vfmadd231ps).
//
// For the canonical `potion-code-16M` model (hidden_dim = 256, 8-divisible),
// the scalar tail is never entered.
// ---------------------------------------------------------------------------

/// Element-wise `acc += row`, eight lanes at a time.
///
/// The leading multiple-of-8 portion goes through `f32x8`; whatever is
/// left (`len % 8` elements) is added with scalar code. Both slices are
/// expected to have the same length; this is checked in debug builds only.
fn accumulate_f32x8(acc: &mut [f32], row: &[f32]) {
    debug_assert_eq!(acc.len(), row.len(), "pool dim mismatch");
    let simd_len = acc.len() / 8 * 8;
    let (acc_lanes, acc_rest) = acc.split_at_mut(simd_len);
    let (row_lanes, row_rest) = row.split_at(simd_len);
    acc_lanes
        .chunks_exact_mut(8)
        .zip(row_lanes.chunks_exact(8))
        .for_each(|(dst, src)| {
            let lhs = f32x8::from(<[f32; 8]>::try_from(&*dst).unwrap());
            let rhs = f32x8::from(<[f32; 8]>::try_from(src).unwrap());
            dst.copy_from_slice((lhs + rhs).as_array());
        });
    acc_rest
        .iter_mut()
        .zip(row_rest)
        .for_each(|(dst, &src)| *dst += src);
}

/// Element-wise `acc += row * scale`, eight lanes at a time.
///
/// The vector portion uses `mul_add` (fused multiply-add where the
/// target supports it); the `len % 8` remainder is a scalar
/// multiply-then-add. Both slices are expected to have the same length;
/// this is checked in debug builds only.
fn accumulate_scaled_f32x8(acc: &mut [f32], row: &[f32], scale: f32) {
    debug_assert_eq!(acc.len(), row.len(), "pool dim mismatch");
    let simd_len = acc.len() / 8 * 8;
    let (acc_lanes, acc_rest) = acc.split_at_mut(simd_len);
    let (row_lanes, row_rest) = row.split_at(simd_len);
    let scale_lanes = f32x8::splat(scale);
    acc_lanes
        .chunks_exact_mut(8)
        .zip(row_lanes.chunks_exact(8))
        .for_each(|(dst, src)| {
            let sum_so_far = f32x8::from(<[f32; 8]>::try_from(&*dst).unwrap());
            let incoming = f32x8::from(<[f32; 8]>::try_from(src).unwrap());
            // incoming * scale_lanes + sum_so_far in one fused step.
            dst.copy_from_slice(incoming.mul_add(scale_lanes, sum_so_far).as_array());
        });
    acc_rest
        .iter_mut()
        .zip(row_rest)
        .for_each(|(dst, &src)| *dst += src * scale);
}

/// Multiply every element of `v` by `factor` in place.
///
/// Full groups of eight are scaled through `f32x8`; the trailing
/// `len % 8` elements are scaled one by one.
fn scale_in_place_f32x8(v: &mut [f32], factor: f32) {
    let factor_lanes = f32x8::splat(factor);
    let mut blocks = v.chunks_exact_mut(8);
    for block in &mut blocks {
        let scaled = f32x8::from(<[f32; 8]>::try_from(&*block).unwrap()) * factor_lanes;
        block.copy_from_slice(scaled.as_array());
    }
    // Whatever did not fill a whole 8-lane block.
    for leftover in blocks.into_remainder() {
        *leftover *= factor;
    }
}

/// Euclidean (L2) length of `v`.
///
/// Squares are accumulated per lane with `mul_add` over full groups of
/// eight, the eight lane totals are summed, and then the squares of the
/// trailing `len % 8` elements are added before taking the square root.
/// An empty slice gives `0.0`.
fn l2_norm_f32x8(v: &[f32]) -> f32 {
    let blocks = v.chunks_exact(8);
    let leftover = blocks.remainder();
    let lane_totals = blocks.fold(f32x8::splat(0.0), |running, block| {
        let x = f32x8::from(<[f32; 8]>::try_from(block).unwrap());
        x.mul_add(x, running)
    });
    let simd_part: f32 = lane_totals.as_array().iter().sum();
    let sum_sq = leftover.iter().fold(simd_part, |total, &x| total + x * x);
    sum_sq.sqrt()
}

/// Copy `ids` into a fresh `Vec`, leaving out every occurrence of the
/// unk id (when one is given) and keeping at most the first
/// `max_tokens` survivors.
///
/// The result is owned so it does not borrow from the encoding the ids
/// came from.
fn filter_ids(ids: &[u32], unk_id: Option<usize>, max_tokens: usize) -> Vec<u32> {
    ids.iter()
        .copied()
        // With `unk_id == None` nothing compares equal, so everything passes.
        .filter(|&id| unk_id != Some(id as usize))
        .take(max_tokens)
        .collect()
}

/// Derive the two tokenizer-dependent values the model stores:
/// `(median_token_length, unk_token_id)`.
///
/// * The median is taken over the byte lengths of the vocab entries
///   (added tokens excluded), using the upper-middle element for even
///   counts and falling back to `1` for an empty vocab.
/// * The unk id is found by serializing the tokenizer to JSON, reading
///   `model.unk_token`, and resolving that string to an id. It is `None`
///   when the field is missing, not a string, or not in the vocab.
///
/// # Errors
///
/// Fails only if the tokenizer cannot be serialized to JSON.
fn compute_metadata(tokenizer: &Tokenizer) -> Result<(usize, Option<usize>)> {
    let mut byte_lens: Vec<usize> = tokenizer
        .get_vocab(false)
        .into_keys()
        .map(|token| token.len())
        .collect();
    let median_token_length = if byte_lens.is_empty() {
        1
    } else {
        // Partial selection yields the same element a full sort would
        // place at the middle index.
        let mid = byte_lens.len() / 2;
        *byte_lens.select_nth_unstable(mid).1
    };

    let serialized: Value =
        serde_json::to_value(tokenizer).context("tokenizer serialize for unk lookup")?;
    let unk_token_id = serialized
        .get("model")
        .and_then(|model| model.get("unk_token"))
        .and_then(Value::as_str)
        .and_then(|tok| tokenizer.token_to_id(tok))
        .map(|id| id as usize);

    Ok((median_token_length, unk_token_id))
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Build a model around a hand-written embedding table.
    ///
    /// The tokenizer is an empty BPE model; the tests below only call
    /// `pool_ids`, which never touches it.
    fn model_with(embeddings: Array2<f32>, normalize: bool) -> StaticEmbedModel {
        StaticEmbedModel {
            tokenizer: Tokenizer::new(tokenizers::models::bpe::BPE::default()),
            embeddings,
            weights: None,
            token_mapping: None,
            normalize,
            median_token_length: 1,
            unk_token_id: None,
        }
    }

    /// A 3×2 table: rows `[1, 0]`, `[0, 1]`, `[3, 4]`.
    fn small_table() -> Array2<f32> {
        Array2::from_shape_vec((3, 2), vec![1.0, 0.0, 0.0, 1.0, 3.0, 4.0])
            .expect("6 values fill a 3x2 table")
    }

    /// Element-wise comparison with a small absolute tolerance.
    fn assert_close(actual: &[f32], expected: &[f32]) {
        assert_eq!(actual.len(), expected.len(), "length mismatch");
        for (i, (a, e)) in actual.iter().zip(expected).enumerate() {
            assert!((a - e).abs() <= 1e-6, "index {i}: got {a}, expected {e}");
        }
    }

    /// Empty input: the mean divides by `count.max(1)` = 1, and the zero
    /// norm is floored at `1e-12`, so the result stays all zeros.
    #[test]
    fn pool_ids_empty_input() {
        let model = model_with(Array2::zeros((2, 9)), true);
        assert_close(&model.pool_ids(&[]), &[0.0; 9]);
    }

    /// Rows are averaged; ids with no row are skipped and not counted.
    #[test]
    fn pool_ids_means_rows_and_skips_out_of_range() {
        let model = model_with(small_table(), false);
        assert_close(&model.pool_ids(&[0, 1]), &[0.5, 0.5]);
        assert_close(&model.pool_ids(&[0, 99]), &[1.0, 0.0]);
    }

    /// Normalization, row remapping, and per-token weights.
    #[test]
    fn pool_ids_normalizes_and_applies_mapping_and_weights() {
        let mut model = model_with(small_table(), true);
        assert_close(&model.pool_ids(&[2]), &[0.6, 0.8]);

        model.normalize = false;
        model.token_mapping = Some(vec![2, 1, 0]);
        model.weights = Some(vec![2.0, 1.0, 1.0]);
        // Token 0 maps to row 2 (`[3, 4]`) and carries weight 2.0.
        assert_close(&model.pool_ids(&[0]), &[6.0, 8.0]);
    }

    #[test]
    fn truncate_chars_respects_char_boundaries() {
        assert_eq!(truncate_chars("hello world", 2, 3), "hello ");
        assert_eq!(truncate_chars("héllo", 1, 2), "hé");
        assert_eq!(truncate_chars("abc", 1, 3), "abc");
        assert_eq!(truncate_chars("abc", 0, 5), "");
        assert_eq!(truncate_chars("abc", usize::MAX, 2), "abc");
    }

    #[test]
    fn filter_ids_drops_unk_then_caps() {
        assert_eq!(filter_ids(&[1, 2, 3, 2, 4], Some(2), 10), vec![1, 3, 4]);
        assert_eq!(filter_ids(&[1, 2, 3, 2, 4], Some(2), 2), vec![1, 3]);
        assert_eq!(filter_ids(&[1, 2, 3], None, 2), vec![1, 2]);
        assert_eq!(filter_ids(&[], Some(0), 5), Vec::<u32>::new());
    }

    /// Nine elements = one 8-lane block plus a one-element scalar tail,
    /// so both code paths of every kernel run.
    #[test]
    fn simd_kernels_cover_body_and_tail() {
        let row: Vec<f32> = (1..=9u8).map(f32::from).collect();

        let mut acc = vec![1.0_f32; 9];
        accumulate_f32x8(&mut acc, &row);
        let plus_one: Vec<f32> = row.iter().map(|x| x + 1.0).collect();
        assert_close(&acc, &plus_one);

        accumulate_scaled_f32x8(&mut acc, &row, 2.0);
        let tripled_plus_one: Vec<f32> = row.iter().map(|x| 3.0 * x + 1.0).collect();
        assert_close(&acc, &tripled_plus_one);

        scale_in_place_f32x8(&mut acc, 0.5);
        let halved: Vec<f32> = tripled_plus_one.iter().map(|x| x * 0.5).collect();
        assert_close(&acc, &halved);

        // 3 sits in the SIMD block, 4 in the tail: sqrt(9 + 16) = 5.
        let v = [3.0_f32, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.0];
        assert!((l2_norm_f32x8(&v) - 5.0).abs() <= 1e-6);
        assert!(l2_norm_f32x8(&[]).abs() <= 1e-6);
    }
}