// ripvec-core 0.13.12

//! CPU embedding backend using ndarray + system BLAS.
//!
//! Implements BERT inference on CPU using [`ndarray`] with system BLAS for
//! matrix operations. Weights are loaded directly from safetensors files
//! downloaded via `hf-hub`.
//!
//! Supports the `ClassicBert` family (BGE models): learned position embeddings,
//! GELU activation, and QKV projections with bias.

// Ensure the BLAS linker symbols are pulled in.
#[cfg(target_os = "macos")]
extern crate accelerate_src;
extern crate blas_src;
#[cfg(not(target_os = "macos"))]
extern crate openblas_src;

use std::f32::consts::PI;
use std::sync::Arc;

use hf_hub::api::sync::Api;
use ndarray::{Array1, Array2, ArrayView1, Axis, s};
use safetensors::SafeTensors;

use super::{DeviceHint, EmbedBackend, Encoding};

// ---------------------------------------------------------------------------
// Architecture validation
// ---------------------------------------------------------------------------

/// Validate that the loaded weights are a recognized `ClassicBert` model.
///
/// `ClassicBert` has `embeddings.position_embeddings.weight`. Returns an
/// error if the architecture is not recognized.
fn detect_variant(tensors: &SafeTensors<'_>) -> crate::Result<()> {
    if tensors
        .tensor("embeddings.position_embeddings.weight")
        .is_ok()
    {
        Ok(())
    } else {
        Err(crate::Error::Other(anyhow::anyhow!(
            "unrecognized model architecture: no position_embeddings found"
        )))
    }
}

// ---------------------------------------------------------------------------
// BERT model configuration
// ---------------------------------------------------------------------------

/// Configuration for a BERT-style encoder model.
///
/// Holds the subset of the `HuggingFace` `config.json` schema that this
/// backend reads; values are populated by [`BertConfig::from_json`].
#[derive(Debug, Clone)]
struct BertConfig {
    /// Hidden dimension (e.g. 384 for bge-small).
    hidden_size: i32,
    /// Number of transformer encoder layers to load.
    num_hidden_layers: i32,
    /// Number of attention heads; the per-head width is
    /// `hidden_size / num_attention_heads`.
    num_attention_heads: i32,
    /// Maximum sequence length (512 for ClassicBert, which is also the
    /// fallback used when the key cannot be read from the config).
    max_position_embeddings: i32,
    /// Epsilon added to the variance inside every layer norm.
    layer_norm_eps: f32,
}

impl BertConfig {
    /// Parse from a `config.json` value.
    #[expect(
        clippy::cast_possible_truncation,
        reason = "config values are small ints/floats that fit in i32/f32"
    )]
    fn from_json(v: &serde_json::Value) -> crate::Result<Self> {
        let get_i32 = |key: &str| -> crate::Result<i32> {
            v.get(key)
                .and_then(serde_json::Value::as_i64)
                .map(|n| n as i32)
                .ok_or_else(|| crate::Error::Other(anyhow::anyhow!("missing config key: {key}")))
        };
        let get_f64 = |key: &str| -> crate::Result<f64> {
            v.get(key)
                .and_then(serde_json::Value::as_f64)
                .ok_or_else(|| crate::Error::Other(anyhow::anyhow!("missing config key: {key}")))
        };

        let layer_norm_eps =
            get_f64("layer_norm_epsilon").or_else(|_| get_f64("layer_norm_eps"))? as f32;

        Ok(Self {
            hidden_size: get_i32("hidden_size")?,
            num_hidden_layers: get_i32("num_hidden_layers")?,
            num_attention_heads: get_i32("num_attention_heads")?,
            max_position_embeddings: get_i32("max_position_embeddings").unwrap_or(512),
            layer_norm_eps,
        })
    }
}

// ---------------------------------------------------------------------------
// Safetensors -> ndarray helpers
// ---------------------------------------------------------------------------

/// Load a named tensor from safetensors as `Array2<f32>`.
///
/// The tensor must be stored in `f32` (little-endian) format. Returns an
/// error if the tensor is missing or the byte count does not match the shape.
fn load_tensor2(tensors: &SafeTensors<'_>, name: &str) -> crate::Result<Array2<f32>> {
    let tensor = tensors
        .tensor(name)
        .map_err(|_| crate::Error::Other(anyhow::anyhow!("missing weight: {name}")))?;
    let shape = tensor.shape();
    if shape.len() != 2 {
        return Err(crate::Error::Other(anyhow::anyhow!(
            "expected 2D tensor for {name}, got {}D",
            shape.len()
        )));
    }
    let data: Vec<f32> = tensor
        .data()
        .chunks_exact(4)
        .map(|b| f32::from_le_bytes([b[0], b[1], b[2], b[3]]))
        .collect();
    Array2::from_shape_vec((shape[0], shape[1]), data)
        .map_err(|e| crate::Error::Other(anyhow::anyhow!("shape error for {name}: {e}")))
}

/// Load a named tensor from safetensors as `Array1<f32>`.
///
/// The tensor must be stored in `f32` (little-endian) format.
fn load_tensor1(tensors: &SafeTensors<'_>, name: &str) -> crate::Result<Array1<f32>> {
    let tensor = tensors
        .tensor(name)
        .map_err(|_| crate::Error::Other(anyhow::anyhow!("missing weight: {name}")))?;
    let shape = tensor.shape();
    if shape.len() != 1 {
        return Err(crate::Error::Other(anyhow::anyhow!(
            "expected 1D tensor for {name}, got {}D",
            shape.len()
        )));
    }
    let data: Vec<f32> = tensor
        .data()
        .chunks_exact(4)
        .map(|b| f32::from_le_bytes([b[0], b[1], b[2], b[3]]))
        .collect();
    Ok(Array1::from_vec(data))
}

/// Load a 1D tensor if the file contains it.
///
/// Yields `Ok(None)` when the name cannot be looked up; a tensor that is
/// present but fails to load still produces an error.
fn try_load_tensor1(tensors: &SafeTensors<'_>, name: &str) -> crate::Result<Option<Array1<f32>>> {
    match tensors.tensor(name) {
        Ok(_) => load_tensor1(tensors, name).map(Some),
        Err(_) => Ok(None),
    }
}

// ---------------------------------------------------------------------------
// Layer norm helper
// ---------------------------------------------------------------------------

/// Layer-normalize one hidden vector (a single token).
///
/// Evaluates `(x - mean) / sqrt(var + eps) * weight + bias`, where `mean`
/// and `var` are taken over the elements of `x`. Both statistics default to
/// `0.0` when `x` is empty.
fn layer_norm(
    x: &ArrayView1<'_, f32>,
    weight: &Array1<f32>,
    bias: &Array1<f32>,
    eps: f32,
) -> Array1<f32> {
    let mean = x.mean().unwrap_or(0.0);
    // Centre once and reuse the deviations for both variance and output.
    let centered = x.mapv(|v| v - mean);
    let var = centered.mapv(|d| d.powi(2)).mean().unwrap_or(0.0);
    let inv_std = 1.0 / (var + eps).sqrt();
    let scaled = centered.mapv_into(|d| d * inv_std) * weight;
    scaled + bias
}

// ---------------------------------------------------------------------------
// Activation helpers
// ---------------------------------------------------------------------------

/// GELU activation, tanh approximation.
///
/// Evaluates `x * 0.5 * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))`.
fn gelu(x: f32) -> f32 {
    let cubic = x + 0.044_715 * x.powi(3);
    let gate = 1.0 + ((2.0 / PI).sqrt() * cubic).tanh();
    x * 0.5 * gate
}

/// Overwrite `vals` with its softmax.
///
/// The maximum is subtracted before exponentiating, i.e. the numerically
/// stable `exp(x - max) / sum(exp(x - max))` form.
fn softmax_inplace(vals: &mut [f32]) {
    let peak = vals.iter().copied().fold(f32::NEG_INFINITY, f32::max);

    // First pass: exponentiate in place while accumulating the normalizer.
    let mut total = 0.0_f32;
    vals.iter_mut().for_each(|slot| {
        let e = (*slot - peak).exp();
        *slot = e;
        total += e;
    });

    // Second pass: scale by the reciprocal of the normalizer.
    let scale = 1.0 / total;
    vals.iter_mut().for_each(|slot| *slot *= scale);
}

// ---------------------------------------------------------------------------
// BERT embeddings layer
// ---------------------------------------------------------------------------

/// BERT embeddings layer: word + position + `token_type` + `LayerNorm`.
///
/// For every token the word embedding row is looked up by token id, the
/// position and `token_type` rows are added when those tables are present,
/// and the sum is layer-normalized.
#[derive(Debug)]
struct CpuBertEmbeddings {
    /// Word embedding table `[vocab_size, hidden]`, indexed by token id.
    word_embeddings: Array2<f32>,
    /// Learned position embeddings (`ClassicBert` only) `[max_seq, hidden]`,
    /// indexed by the token's position in the sequence.
    position_embeddings: Option<Array2<f32>>,
    /// Token type embeddings (`ClassicBert` only) `[2, hidden]`, indexed by
    /// the encoding's `token_type_ids`.
    token_type_embeddings: Option<Array2<f32>>,
    /// Layer norm weight `[hidden]`.
    layer_norm_weight: Array1<f32>,
    /// Layer norm bias `[hidden]`.
    layer_norm_bias: Array1<f32>,
    /// Layer norm epsilon.
    layer_norm_eps: f32,
}

impl CpuBertEmbeddings {
    /// Forward pass: look up embeddings, sum, and normalize.
    ///
    /// Returns one `[seq, hidden]` matrix per batch item.
    #[expect(
        clippy::cast_sign_loss,
        clippy::cast_possible_truncation,
        reason = "token IDs from tokenizer are always non-negative and fit in usize"
    )]
    fn forward(&self, encodings: &[Encoding]) -> Vec<Array2<f32>> {
        let hidden = self.word_embeddings.shape()[1];

        encodings
            .iter()
            .map(|enc| {
                let seq_len = enc.input_ids.len();
                let mut output = Array2::<f32>::zeros((seq_len, hidden));

                for (t, &id) in enc.input_ids.iter().enumerate() {
                    let word_row = self.word_embeddings.row(id as usize);
                    output.row_mut(t).assign(&word_row);

                    if let Some(ref pos_emb) = self.position_embeddings {
                        let pos_row = pos_emb.row(t);
                        output.row_mut(t).zip_mut_with(&pos_row, |o, &p| *o += p);
                    }

                    if let Some(ref tok_emb) = self.token_type_embeddings {
                        let type_id = enc.token_type_ids[t] as usize;
                        let tok_row = tok_emb.row(type_id);
                        output.row_mut(t).zip_mut_with(&tok_row, |o, &p| *o += p);
                    }

                    let normed = layer_norm(
                        &output.row(t),
                        &self.layer_norm_weight,
                        &self.layer_norm_bias,
                        self.layer_norm_eps,
                    );
                    output.row_mut(t).assign(&normed);
                }

                output
            })
            .collect()
    }
}

// ---------------------------------------------------------------------------
// Self-attention
// ---------------------------------------------------------------------------

/// Self-attention sub-layer within a BERT encoder layer.
///
/// Uses a fused QKV projection: a single `[3*hidden, hidden]` weight matrix
/// produces Q, K, V in one matmul, then splits the result column-wise.
/// Projections add their bias when one is present; no rotary encoding.
#[derive(Debug)]
struct CpuBertSelfAttention {
    /// Fused Q/K/V weight matrix `[3*hidden, hidden]` (Q rows first, then K,
    /// then V).
    qkv_weight: Array2<f32>,
    /// Fused Q/K/V bias `[3*hidden]`; `None` when no fused bias was loaded.
    qkv_bias: Option<Array1<f32>>,
    /// Output projection weight `[hidden, hidden]`.
    output_weight: Array2<f32>,
    /// Output projection bias `[hidden]`; skipped when `None`.
    output_bias: Option<Array1<f32>>,
    /// Post-attention `LayerNorm` weight `[hidden]`.
    output_ln_weight: Array1<f32>,
    /// Post-attention `LayerNorm` bias `[hidden]`.
    output_ln_bias: Array1<f32>,
    /// Number of attention heads.
    num_heads: i32,
    /// Dimension per head (`hidden / num_heads`).
    head_dim: i32,
    /// Layer norm epsilon.
    layer_norm_eps: f32,
}

impl CpuBertSelfAttention {
    /// Multi-head scaled dot-product attention followed by residual add and
    /// `LayerNorm` (post-norm).
    ///
    /// `hidden` is `[seq, hidden]` for a single batch item. `mask` is `[seq]`
    /// and is added to every row of the attention scores, so it should hold
    /// 0.0 for real tokens and a large negative value for padding.
    ///
    /// # Errors
    ///
    /// Returns an error if a row of the score matrix is not contiguous in
    /// memory.
    #[expect(
        clippy::cast_sign_loss,
        clippy::cast_precision_loss,
        reason = "num_heads/head_dim are small positive ints from config"
    )]
    fn forward(&self, hidden: &Array2<f32>, mask: &Array1<f32>) -> crate::Result<Array2<f32>> {
        let seq = hidden.nrows();
        let heads = self.num_heads as usize;
        let head_width = self.head_dim as usize;
        let model_width = heads * head_width;
        // 1/sqrt(head_dim) is identical for every head, so compute it once.
        let scale = 1.0 / (head_width as f32).sqrt();

        // Fused projection: [seq, hidden] @ [hidden, 3*hidden] => [seq, 3*hidden].
        let mut qkv = hidden.dot(&self.qkv_weight.t());
        if let Some(bias) = &self.qkv_bias {
            qkv += bias;
        }

        // Column thirds of the fused result are Q, K and V, each [seq, hidden].
        let q = qkv.slice(s![.., ..model_width]).to_owned();
        let k = qkv.slice(s![.., model_width..2 * model_width]).to_owned();
        let v = qkv.slice(s![.., 2 * model_width..3 * model_width]).to_owned();

        // ndarray has no batched 4D matmul, so heads are handled one by one.
        let mut context = Array2::<f32>::zeros((seq, model_width));
        for head in 0..heads {
            let start = head * head_width;
            let end = start + head_width;

            let q_h = q.slice(s![.., start..end]);
            let k_h = k.slice(s![.., start..end]);
            let v_h = v.slice(s![.., start..end]);

            // Raw scores Q @ K^T => [seq, seq]; each row is then scaled,
            // masked and normalized into attention weights.
            let mut scores = q_h.dot(&k_h.t());
            for mut row in scores.rows_mut() {
                row.mapv_inplace(|score| score * scale);
                row += mask;
                let weights = row.as_slice_mut().ok_or_else(|| {
                    crate::Error::Other(anyhow::anyhow!("attention scores not contiguous"))
                })?;
                softmax_inplace(weights);
            }

            // Weighted sum of values => [seq, head_dim], written into this
            // head's column band of the context matrix.
            context
                .slice_mut(s![.., start..end])
                .assign(&scores.dot(&v_h));
        }

        // Output projection back to [seq, hidden].
        let mut projected = context.dot(&self.output_weight.t());
        if let Some(bias) = &self.output_bias {
            projected += bias;
        }

        // Residual connection, then normalize each token row in place.
        let mut output = hidden + &projected;
        for mut row in output.outer_iter_mut() {
            let normed = layer_norm(
                &row.view(),
                &self.output_ln_weight,
                &self.output_ln_bias,
                self.layer_norm_eps,
            );
            row.assign(&normed);
        }

        Ok(output)
    }
}

// ---------------------------------------------------------------------------
// Feed-forward network
// ---------------------------------------------------------------------------

/// Feed-forward network sub-layer within a BERT encoder layer.
///
/// Linear -> GELU -> Linear, followed by a residual add and `LayerNorm`.
/// Each linear step adds its bias when one is present.
#[derive(Debug)]
struct CpuBertFfn {
    /// Intermediate projection weight `[intermediate, hidden]`.
    intermediate_weight: Array2<f32>,
    /// Intermediate projection bias `[intermediate]`; skipped when `None`.
    intermediate_bias: Option<Array1<f32>>,
    /// Output projection weight `[hidden, intermediate]`.
    output_weight: Array2<f32>,
    /// Output projection bias `[hidden]`; skipped when `None`.
    output_bias: Option<Array1<f32>>,
    /// Post-FFN `LayerNorm` weight `[hidden]`.
    output_ln_weight: Array1<f32>,
    /// Post-FFN `LayerNorm` bias `[hidden]`.
    output_ln_bias: Array1<f32>,
    /// Layer norm epsilon.
    layer_norm_eps: f32,
}

impl CpuBertFfn {
    /// Apply the feed-forward block to `hidden` (`[seq, hidden]`).
    ///
    /// Steps: expand to the intermediate width, apply GELU, project back to
    /// the hidden width, add the input as a residual, and layer-normalize
    /// each token row.
    fn forward(&self, hidden: &Array2<f32>) -> Array2<f32> {
        // [seq, hidden] @ [hidden, intermediate] => [seq, intermediate]
        let mut expanded = hidden.dot(&self.intermediate_weight.t());
        if let Some(bias) = &self.intermediate_bias {
            expanded += bias;
        }
        expanded.mapv_inplace(gelu);

        // [seq, intermediate] @ [intermediate, hidden] => [seq, hidden]
        let mut projected = expanded.dot(&self.output_weight.t());
        if let Some(bias) = &self.output_bias {
            projected += bias;
        }

        // Residual connection, then normalize every row in place.
        let mut result = hidden + &projected;
        for mut row in result.outer_iter_mut() {
            let normed = layer_norm(
                &row.view(),
                &self.output_ln_weight,
                &self.output_ln_bias,
                self.layer_norm_eps,
            );
            row.assign(&normed);
        }

        result
    }
}

// ---------------------------------------------------------------------------
// Encoder layer
// ---------------------------------------------------------------------------

/// A single BERT encoder layer (self-attention + FFN).
///
/// The attention sub-layer runs first and its output feeds the FFN.
#[derive(Debug)]
struct CpuBertLayer {
    /// Self-attention sub-layer.
    attention: CpuBertSelfAttention,
    /// Feed-forward sub-layer.
    ffn: CpuBertFfn,
}

impl CpuBertLayer {
    /// Apply self-attention and then the feed-forward block.
    ///
    /// Returns the updated hidden states `[seq, hidden]`, or the attention
    /// sub-layer's error if it fails.
    fn forward(&self, hidden: &Array2<f32>, mask: &Array1<f32>) -> crate::Result<Array2<f32>> {
        self.attention
            .forward(hidden, mask)
            .map(|attended| self.ffn.forward(&attended))
    }
}

// ---------------------------------------------------------------------------
// Full model
// ---------------------------------------------------------------------------

/// Complete BERT model for embedding extraction.
///
/// Holds the embeddings layer and the stack of encoder layers; the forward
/// pass runs the embeddings first and then each layer in order.
#[derive(Debug)]
struct CpuBertModel {
    /// Embeddings layer (word + position + `token_type` + `LayerNorm`).
    embeddings: CpuBertEmbeddings,
    /// Transformer encoder layers, applied in index order.
    layers: Vec<CpuBertLayer>,
}

impl CpuBertModel {
    /// Run the full BERT forward pass for a single sequence.
    ///
    /// Chains: embeddings -> N attention+FFN layers -> hidden states `[seq, hidden]`.
    fn forward(&self, encoding: &Encoding, mask: &Array1<f32>) -> crate::Result<Array2<f32>> {
        let batched = self.embeddings.forward(std::slice::from_ref(encoding));
        // embeddings.forward returns Vec<Array2> — take the single item
        let mut hidden = batched.into_iter().next().ok_or_else(|| {
            crate::Error::Other(anyhow::anyhow!("embeddings produced empty output"))
        })?;

        for layer in &self.layers {
            hidden = layer.forward(&hidden, mask)?;
        }

        Ok(hidden)
    }
}

/// Load `ClassicBert` encoder layers from safetensors.
fn load_classic_layer(
    tensors: &SafeTensors<'_>,
    i: i32,
    config: &BertConfig,
) -> crate::Result<(CpuBertSelfAttention, CpuBertFfn)> {
    let prefix = format!("encoder.layer.{i}");

    // Load separate Q/K/V weights then fuse via concatenation
    let q_weight = load_tensor2(tensors, &format!("{prefix}.attention.self.query.weight"))?;
    let k_weight = load_tensor2(tensors, &format!("{prefix}.attention.self.key.weight"))?;
    let v_weight = load_tensor2(tensors, &format!("{prefix}.attention.self.value.weight"))?;
    let qkv_weight = ndarray::concatenate(
        Axis(0),
        &[q_weight.view(), k_weight.view(), v_weight.view()],
    )
    .map_err(|e| crate::Error::Other(anyhow::anyhow!("QKV concat error layer {i}: {e}")))?;

    // Fuse biases if present
    let q_bias = try_load_tensor1(tensors, &format!("{prefix}.attention.self.query.bias"))?;
    let k_bias = try_load_tensor1(tensors, &format!("{prefix}.attention.self.key.bias"))?;
    let v_bias = try_load_tensor1(tensors, &format!("{prefix}.attention.self.value.bias"))?;
    let qkv_bias = match (&q_bias, &k_bias, &v_bias) {
        (Some(qb), Some(kb), Some(vb)) => Some(
            ndarray::concatenate(Axis(0), &[qb.view(), kb.view(), vb.view()]).map_err(|e| {
                crate::Error::Other(anyhow::anyhow!("QKV bias concat error layer {i}: {e}"))
            })?,
        ),
        _ => None,
    };

    let attention = CpuBertSelfAttention {
        qkv_weight,
        qkv_bias,
        output_weight: load_tensor2(tensors, &format!("{prefix}.attention.output.dense.weight"))?,
        output_bias: try_load_tensor1(tensors, &format!("{prefix}.attention.output.dense.bias"))?,
        output_ln_weight: load_tensor1(
            tensors,
            &format!("{prefix}.attention.output.LayerNorm.weight"),
        )?,
        output_ln_bias: load_tensor1(
            tensors,
            &format!("{prefix}.attention.output.LayerNorm.bias"),
        )?,
        num_heads: config.num_attention_heads,
        head_dim: config.hidden_size / config.num_attention_heads,
        layer_norm_eps: config.layer_norm_eps,
    };
    let ffn = CpuBertFfn {
        intermediate_weight: load_tensor2(tensors, &format!("{prefix}.intermediate.dense.weight"))?,
        intermediate_bias: try_load_tensor1(tensors, &format!("{prefix}.intermediate.dense.bias"))?,
        output_weight: load_tensor2(tensors, &format!("{prefix}.output.dense.weight"))?,
        output_bias: try_load_tensor1(tensors, &format!("{prefix}.output.dense.bias"))?,
        output_ln_weight: load_tensor1(tensors, &format!("{prefix}.output.LayerNorm.weight"))?,
        output_ln_bias: load_tensor1(tensors, &format!("{prefix}.output.LayerNorm.bias"))?,
        layer_norm_eps: config.layer_norm_eps,
    };
    Ok((attention, ffn))
}

impl CpuBertModel {
    /// Load model weights from a safetensors file.
    ///
    /// Parses all weights into `ndarray` arrays. Fuses separate Q/K/V
    /// weight matrices into a single `[3*hidden, hidden]` matrix for
    /// `ClassicBert` (matching the MLX backend pattern).
    #[expect(
        clippy::cast_possible_truncation,
        clippy::cast_sign_loss,
        clippy::cast_possible_wrap,
        reason = "hidden_size and num_layers are small positive ints from config"
    )]
    fn from_safetensors(tensors: &SafeTensors<'_>, config: &BertConfig) -> crate::Result<Self> {
        let embeddings = CpuBertEmbeddings {
            word_embeddings: load_tensor2(tensors, "embeddings.word_embeddings.weight")?,
            position_embeddings: Some(load_tensor2(
                tensors,
                "embeddings.position_embeddings.weight",
            )?),
            token_type_embeddings: Some(load_tensor2(
                tensors,
                "embeddings.token_type_embeddings.weight",
            )?),
            layer_norm_weight: load_tensor1(tensors, "embeddings.LayerNorm.weight")?,
            layer_norm_bias: load_tensor1(tensors, "embeddings.LayerNorm.bias")?,
            layer_norm_eps: config.layer_norm_eps,
        };

        let emb_dim = embeddings.word_embeddings.shape()[1] as i32;
        if emb_dim != config.hidden_size {
            return Err(crate::Error::Other(anyhow::anyhow!(
                "model hidden_size mismatch: config says {} but word_embeddings has dim {}",
                config.hidden_size,
                emb_dim
            )));
        }

        let mut layers = Vec::with_capacity(config.num_hidden_layers as usize);
        for i in 0..config.num_hidden_layers {
            let (attention, ffn) = load_classic_layer(tensors, i, config)?;
            layers.push(CpuBertLayer { attention, ffn });
        }

        Ok(Self { embeddings, layers })
    }
}

// ---------------------------------------------------------------------------
// Public backend wrapper
// ---------------------------------------------------------------------------

/// CPU-based BERT embedding backend using ndarray + system BLAS.
///
/// Uses [`ndarray`] with system BLAS for matrix operations. All computation
/// runs on the CPU, making this backend portable to any platform with a
/// system BLAS library (`OpenBLAS`, MKL, Accelerate).
///
/// Supports the `ClassicBert` family (BGE models), detected automatically
/// from weight names.
///
/// The inner model is wrapped in [`Arc`] so the backend can be cheaply
/// cloned for per-thread use in rayon.
pub struct CpuBackend {
    /// The BERT model (all weights as ndarray arrays), shared between clones.
    model: Arc<CpuBertModel>,
    /// Hidden dimension as read from the model config.
    hidden_size: i32,
    /// Maximum sequence length as read from the model config; reported by
    /// `max_tokens`.
    max_position_embeddings: i32,
}

impl std::fmt::Debug for CpuBackend {
    /// Print only the scalar configuration; the weights are elided and the
    /// output is marked non-exhaustive.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut repr = f.debug_struct("CpuBackend");
        repr.field("hidden_size", &self.hidden_size);
        repr.field("max_position_embeddings", &self.max_position_embeddings);
        repr.finish_non_exhaustive()
    }
}

impl CpuBackend {
    /// Load a `ClassicBert` (BGE) embedding model from `HuggingFace`.
    ///
    /// Downloads `model.safetensors` and `config.json` on first call;
    /// subsequent calls use the `hf-hub` cache. Returns an error if the
    /// model architecture is not recognized.
    ///
    /// # Errors
    ///
    /// Returns an error if the model cannot be downloaded, the config
    /// cannot be parsed, or the weights fail to load.
    pub fn load(model_repo: &str, _device_hint: &DeviceHint) -> crate::Result<Self> {
        // Report BLAS status and recommend optimal library for this CPU
        let blas = super::blas_info::detect_blas();
        let cpu = super::blas_info::detect_cpu_vendor();
        tracing::info!("CPU backend: {} CPU, {} BLAS", cpu, blas);
        if let Some(tip) = super::blas_info::recommend_blas() {
            eprintln!("[ripvec] {tip}");
        }

        let api = Api::new().map_err(|e| crate::Error::Download(e.to_string()))?;
        let repo = api.model(model_repo.to_string());

        let config_path = repo
            .get("config.json")
            .map_err(|e| crate::Error::Download(e.to_string()))?;
        let weights_path = repo
            .get("model.safetensors")
            .map_err(|e| crate::Error::Download(e.to_string()))?;

        let model_bytes = std::fs::read(&weights_path).map_err(|e| crate::Error::Io {
            path: weights_path.display().to_string(),
            source: e,
        })?;

        let tensors = SafeTensors::deserialize(&model_bytes)
            .map_err(|e| crate::Error::Other(anyhow::anyhow!("safetensors parse error: {e}")))?;
        detect_variant(&tensors)?;

        let config_str = std::fs::read_to_string(&config_path).map_err(|e| crate::Error::Io {
            path: config_path.display().to_string(),
            source: e,
        })?;
        let config_json: serde_json::Value = serde_json::from_str(&config_str)
            .map_err(|e| crate::Error::Other(anyhow::anyhow!("config parse error: {e}")))?;
        let config = BertConfig::from_json(&config_json)?;

        let hidden_size = config.hidden_size;
        let max_position_embeddings = config.max_position_embeddings;
        let model = CpuBertModel::from_safetensors(&tensors, &config)?;

        Ok(Self {
            model: Arc::new(model),
            hidden_size,
            max_position_embeddings,
        })
    }
}

impl EmbedBackend for CpuBackend {
    /// Embed a batch of pre-tokenized inputs using the full BERT forward pass.
    ///
    /// Runs: embeddings -> N attention+FFN layers -> CLS pooling -> L2 normalize.
    /// Each batch item is processed independently (no cross-batch padding needed).
    fn embed_batch(&self, encodings: &[Encoding]) -> crate::Result<Vec<Vec<f32>>> {
        if encodings.is_empty() {
            return Ok(vec![]);
        }

        let mut results = Vec::with_capacity(encodings.len());
        for enc in encodings {
            // Build attention mask: 0.0 for real tokens, -1e9 for padding
            let mask = Array1::from_vec(
                enc.attention_mask
                    .iter()
                    .map(|&m| if m == 1 { 0.0_f32 } else { -1e9_f32 })
                    .collect(),
            );

            // Full forward pass: embeddings -> layers -> hidden [seq, hidden]
            let hidden = self.model.forward(enc, &mask)?;

            // CLS pooling (first token) + L2 normalize
            let cls = hidden.row(0);
            let norm = cls.mapv(|v| v * v).sum().sqrt().max(1e-12);
            let normalized: Vec<f32> = cls.iter().map(|&v| v / norm).collect();
            results.push(normalized);
        }

        Ok(results)
    }

    /// CPU backend supports cheap cloning via `Arc`.
    fn supports_clone(&self) -> bool {
        true
    }

    /// Clone the backend for per-thread use in rayon.
    fn clone_backend(&self) -> Box<dyn EmbedBackend> {
        Box::new(Self {
            model: Arc::clone(&self.model),
            hidden_size: self.hidden_size,
            max_position_embeddings: self.max_position_embeddings,
        })
    }

    /// CPU backend does not use GPU.
    fn is_gpu(&self) -> bool {
        false
    }

    /// Maximum tokens from model config (512 for `ClassicBert`).
    #[expect(
        clippy::cast_sign_loss,
        reason = "max_position_embeddings is always positive from config"
    )]
    fn max_tokens(&self) -> usize {
        self.max_position_embeddings as usize
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    const BGE_SMALL: &str = "BAAI/bge-small-en-v1.5";

    /// Load the reference model on the CPU, panicking if that fails.
    fn bge_small() -> CpuBackend {
        CpuBackend::load(BGE_SMALL, &DeviceHint::Cpu).unwrap()
    }

    /// A complete config parses into the expected integer fields.
    #[test]
    fn config_from_json_classic() {
        let config = BertConfig::from_json(&serde_json::json!({
            "hidden_size": 384,
            "num_hidden_layers": 12,
            "num_attention_heads": 12,
            "max_position_embeddings": 512,
            "layer_norm_eps": 1e-12
        }))
        .unwrap();
        assert_eq!(config.hidden_size, 384);
        assert_eq!(config.num_hidden_layers, 12);
        assert_eq!(config.num_attention_heads, 12);
        assert_eq!(config.max_position_embeddings, 512);
    }

    /// An empty config object is rejected.
    #[test]
    fn config_missing_key_errors() {
        let empty = serde_json::json!({});
        assert!(BertConfig::from_json(&empty).is_err());
    }

    /// Loading the model exposes the expected dimensions and capabilities.
    #[test]
    fn cpu_backend_loads_model() {
        let backend = bge_small();
        assert_eq!(backend.hidden_size, 384);
        assert_eq!(backend.max_position_embeddings, 512);
        assert!(!backend.is_gpu());
        assert!(backend.supports_clone());
        assert_eq!(backend.max_tokens(), 512);
    }

    /// The embeddings layer alone yields a non-zero `[seq, hidden]` matrix.
    #[test]
    fn cpu_backend_embeddings_forward() {
        let backend = bge_small();
        let six_tokens = Encoding {
            input_ids: vec![101, 2023, 2003, 1037, 3231, 102],
            attention_mask: vec![1, 1, 1, 1, 1, 1],
            token_type_ids: vec![0, 0, 0, 0, 0, 0],
        };
        let outputs = backend.model.embeddings.forward(&[six_tokens]);
        assert_eq!(outputs.len(), 1);
        assert_eq!(outputs[0].shape(), &[6, 384]);
        let magnitude: f32 = outputs[0].iter().map(|v| v.abs()).sum();
        assert!(magnitude > 0.0, "embeddings output should not be all zeros");
    }

    /// A cloned backend reports the same capabilities as the original.
    #[test]
    fn cpu_backend_clone() {
        let cloned = bge_small().clone_backend();
        assert!(!cloned.is_gpu());
        assert!(cloned.supports_clone());
        assert_eq!(cloned.max_tokens(), 512);
    }

    /// The full pipeline yields one 384-dimensional vector per input.
    #[test]
    fn cpu_backend_full_forward_output_dim() {
        // "hello world" tokenized: [CLS]=101, hello=7592, world=2088, [SEP]=102
        let hello_world = Encoding {
            input_ids: vec![101, 7592, 2088, 102],
            attention_mask: vec![1, 1, 1, 1],
            token_type_ids: vec![0, 0, 0, 0],
        };
        let result = bge_small().embed_batch(&[hello_world]).unwrap();
        assert_eq!(result.len(), 1);
        assert_eq!(
            result[0].len(),
            384,
            "BGE-small should produce 384-dim embeddings"
        );
    }

    /// Output vectors are L2-normalized.
    #[test]
    fn cpu_backend_full_forward_l2_norm() {
        let hello_world = Encoding {
            input_ids: vec![101, 7592, 2088, 102],
            attention_mask: vec![1, 1, 1, 1],
            token_type_ids: vec![0, 0, 0, 0],
        };
        let result = bge_small().embed_batch(&[hello_world]).unwrap();
        let squared: f32 = result[0].iter().map(|v| v * v).sum();
        let norm = squared.sqrt();
        assert!(
            (norm - 1.0).abs() < 1e-4,
            "L2 norm should be ~1.0, got {norm}"
        );
    }

    /// Unrelated inputs must not collapse onto the same embedding.
    #[test]
    fn cpu_backend_different_inputs_differ() {
        let hello_world = Encoding {
            input_ids: vec![101, 7592, 2088, 102], // "hello world"
            attention_mask: vec![1, 1, 1, 1],
            token_type_ids: vec![0, 0, 0, 0],
        };
        let quantum_physics = Encoding {
            input_ids: vec![101, 19387, 8840, 4313, 102], // "quantum physics"
            attention_mask: vec![1, 1, 1, 1, 1],
            token_type_ids: vec![0, 0, 0, 0, 0],
        };
        let results = bge_small()
            .embed_batch(&[hello_world, quantum_physics])
            .unwrap();
        assert_eq!(results.len(), 2);

        // Cosine similarity (both L2-normalized, so dot product = cosine sim)
        let mut dot = 0.0_f32;
        for (a, b) in results[0].iter().zip(results[1].iter()) {
            dot += a * b;
        }
        assert!(
            dot < 0.99,
            "different inputs should produce different embeddings, cosine sim = {dot}"
        );
    }

    /// An empty batch is answered with an empty result.
    #[test]
    fn cpu_backend_empty_batch() {
        let result = bge_small().embed_batch(&[]).unwrap();
        assert!(result.is_empty(), "empty batch should return empty vec");
    }
}