#[cfg(target_os = "macos")]
extern crate accelerate_src;
extern crate blas_src;
#[cfg(not(target_os = "macos"))]
extern crate openblas_src;
use std::f32::consts::PI;
use std::sync::Arc;
use hf_hub::api::sync::Api;
use ndarray::{Array1, Array2, ArrayView1, Axis, s};
use safetensors::SafeTensors;
use super::{DeviceHint, EmbedBackend, Encoding};
fn detect_variant(tensors: &SafeTensors<'_>) -> crate::Result<()> {
if tensors
.tensor("embeddings.position_embeddings.weight")
.is_ok()
{
Ok(())
} else {
Err(crate::Error::Other(anyhow::anyhow!(
"unrecognized model architecture: no position_embeddings found"
)))
}
}
#[derive(Debug, Clone)]
struct BertConfig {
hidden_size: i32,
num_hidden_layers: i32,
num_attention_heads: i32,
max_position_embeddings: i32,
layer_norm_eps: f32,
}
impl BertConfig {
#[expect(
clippy::cast_possible_truncation,
reason = "config values are small ints/floats that fit in i32/f32"
)]
fn from_json(v: &serde_json::Value) -> crate::Result<Self> {
let get_i32 = |key: &str| -> crate::Result<i32> {
v.get(key)
.and_then(serde_json::Value::as_i64)
.map(|n| n as i32)
.ok_or_else(|| crate::Error::Other(anyhow::anyhow!("missing config key: {key}")))
};
let get_f64 = |key: &str| -> crate::Result<f64> {
v.get(key)
.and_then(serde_json::Value::as_f64)
.ok_or_else(|| crate::Error::Other(anyhow::anyhow!("missing config key: {key}")))
};
let layer_norm_eps =
get_f64("layer_norm_epsilon").or_else(|_| get_f64("layer_norm_eps"))? as f32;
Ok(Self {
hidden_size: get_i32("hidden_size")?,
num_hidden_layers: get_i32("num_hidden_layers")?,
num_attention_heads: get_i32("num_attention_heads")?,
max_position_embeddings: get_i32("max_position_embeddings").unwrap_or(512),
layer_norm_eps,
})
}
}
fn load_tensor2(tensors: &SafeTensors<'_>, name: &str) -> crate::Result<Array2<f32>> {
let tensor = tensors
.tensor(name)
.map_err(|_| crate::Error::Other(anyhow::anyhow!("missing weight: {name}")))?;
let shape = tensor.shape();
if shape.len() != 2 {
return Err(crate::Error::Other(anyhow::anyhow!(
"expected 2D tensor for {name}, got {}D",
shape.len()
)));
}
let data: Vec<f32> = tensor
.data()
.chunks_exact(4)
.map(|b| f32::from_le_bytes([b[0], b[1], b[2], b[3]]))
.collect();
Array2::from_shape_vec((shape[0], shape[1]), data)
.map_err(|e| crate::Error::Other(anyhow::anyhow!("shape error for {name}: {e}")))
}
fn load_tensor1(tensors: &SafeTensors<'_>, name: &str) -> crate::Result<Array1<f32>> {
let tensor = tensors
.tensor(name)
.map_err(|_| crate::Error::Other(anyhow::anyhow!("missing weight: {name}")))?;
let shape = tensor.shape();
if shape.len() != 1 {
return Err(crate::Error::Other(anyhow::anyhow!(
"expected 1D tensor for {name}, got {}D",
shape.len()
)));
}
let data: Vec<f32> = tensor
.data()
.chunks_exact(4)
.map(|b| f32::from_le_bytes([b[0], b[1], b[2], b[3]]))
.collect();
Ok(Array1::from_vec(data))
}
/// Like `load_tensor1`, but a tensor that cannot be looked up is not an error.
///
/// Returns `Ok(None)` when the lookup of `name` fails and `Ok(Some(array))`
/// when it succeeds and the tensor loads.
///
/// # Errors
///
/// Propagates any error from `load_tensor1` for a tensor that does exist.
fn try_load_tensor1(tensors: &SafeTensors<'_>, name: &str) -> crate::Result<Option<Array1<f32>>> {
    match tensors.tensor(name) {
        Ok(_) => load_tensor1(tensors, name).map(Some),
        Err(_) => Ok(None),
    }
}
/// Normalizes one vector and applies an element-wise scale and shift.
///
/// Computes `(x - mean) / sqrt(var + eps) * weight + bias`, where `mean` and
/// `var` are the mean and the mean squared deviation of `x`. For an empty
/// `x` both statistics fall back to `0.0`.
fn layer_norm(
    x: &ArrayView1<'_, f32>,
    weight: &Array1<f32>,
    bias: &Array1<f32>,
    eps: f32,
) -> Array1<f32> {
    let mean = x.mean().unwrap_or(0.0);
    // Deviations from the mean, shared by the variance and the output.
    let centered = x.mapv(|v| v - mean);
    let var = centered.mapv(|d| d.powi(2)).mean().unwrap_or(0.0);
    let inv_std = 1.0 / (var + eps).sqrt();
    let scaled = centered.mapv(|d| d * inv_std) * weight;
    scaled + bias
}
/// GELU activation, computed with the tanh-based formula
/// `0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))`.
fn gelu(x: f32) -> f32 {
    let sqrt_two_over_pi = (2.0 / PI).sqrt();
    let cubic = x + 0.044_715 * x.powi(3);
    let gate = (sqrt_two_over_pi * cubic).tanh();
    x * 0.5 * (1.0 + gate)
}
/// Replaces `vals` with its softmax, in place.
///
/// The maximum is subtracted before exponentiating so the largest exponent
/// is `exp(0)`. An empty slice is left untouched.
fn softmax_inplace(vals: &mut [f32]) {
    let peak = vals.iter().copied().fold(f32::NEG_INFINITY, f32::max);
    // Exponentiate each entry and accumulate the running total in one pass.
    let total = vals.iter_mut().fold(0.0_f32, |acc, v| {
        *v = (*v - peak).exp();
        acc + *v
    });
    let inv_total = 1.0 / total;
    vals.iter_mut().for_each(|v| *v *= inv_total);
}
#[derive(Debug)]
struct CpuBertEmbeddings {
word_embeddings: Array2<f32>,
position_embeddings: Option<Array2<f32>>,
token_type_embeddings: Option<Array2<f32>>,
layer_norm_weight: Array1<f32>,
layer_norm_bias: Array1<f32>,
layer_norm_eps: f32,
}
impl CpuBertEmbeddings {
#[expect(
clippy::cast_sign_loss,
clippy::cast_possible_truncation,
reason = "token IDs from tokenizer are always non-negative and fit in usize"
)]
fn forward(&self, encodings: &[Encoding]) -> Vec<Array2<f32>> {
let hidden = self.word_embeddings.shape()[1];
encodings
.iter()
.map(|enc| {
let seq_len = enc.input_ids.len();
let mut output = Array2::<f32>::zeros((seq_len, hidden));
for (t, &id) in enc.input_ids.iter().enumerate() {
let word_row = self.word_embeddings.row(id as usize);
output.row_mut(t).assign(&word_row);
if let Some(ref pos_emb) = self.position_embeddings {
let pos_row = pos_emb.row(t);
output.row_mut(t).zip_mut_with(&pos_row, |o, &p| *o += p);
}
if let Some(ref tok_emb) = self.token_type_embeddings {
let type_id = enc.token_type_ids[t] as usize;
let tok_row = tok_emb.row(type_id);
output.row_mut(t).zip_mut_with(&tok_row, |o, &p| *o += p);
}
let normed = layer_norm(
&output.row(t),
&self.layer_norm_weight,
&self.layer_norm_bias,
self.layer_norm_eps,
);
output.row_mut(t).assign(&normed);
}
output
})
.collect()
}
}
#[derive(Debug)]
struct CpuBertSelfAttention {
qkv_weight: Array2<f32>,
qkv_bias: Option<Array1<f32>>,
output_weight: Array2<f32>,
output_bias: Option<Array1<f32>>,
output_ln_weight: Array1<f32>,
output_ln_bias: Array1<f32>,
num_heads: i32,
head_dim: i32,
layer_norm_eps: f32,
}
impl CpuBertSelfAttention {
#[expect(
clippy::cast_sign_loss,
clippy::cast_precision_loss,
reason = "num_heads/head_dim are small positive ints from config"
)]
fn forward(&self, hidden: &Array2<f32>, mask: &Array1<f32>) -> crate::Result<Array2<f32>> {
let seq = hidden.shape()[0];
let nh = self.num_heads as usize;
let hd = self.head_dim as usize;
let hidden_dim = nh * hd;
let qkv = hidden.dot(&self.qkv_weight.t());
let qkv = if let Some(ref bias) = self.qkv_bias {
qkv + bias
} else {
qkv
};
let q = qkv.slice(s![.., 0..hidden_dim]).to_owned();
let k = qkv.slice(s![.., hidden_dim..2 * hidden_dim]).to_owned();
let v = qkv.slice(s![.., 2 * hidden_dim..3 * hidden_dim]).to_owned();
let mut context = Array2::<f32>::zeros((seq, hidden_dim));
for h in 0..nh {
let col_start = h * hd;
let col_end = col_start + hd;
let q_h = q.slice(s![.., col_start..col_end]);
let k_h = k.slice(s![.., col_start..col_end]);
let v_h = v.slice(s![.., col_start..col_end]);
let scale = 1.0 / (hd as f32).sqrt();
let mut scores = q_h.dot(&k_h.t());
scores.mapv_inplace(|v| v * scale);
for mut row in scores.rows_mut() {
row.zip_mut_with(mask, |s, &m| *s += m);
}
for mut row in scores.rows_mut() {
softmax_inplace(row.as_slice_mut().ok_or_else(|| {
crate::Error::Other(anyhow::anyhow!("attention scores not contiguous"))
})?);
}
let ctx_h = scores.dot(&v_h);
context.slice_mut(s![.., col_start..col_end]).assign(&ctx_h);
}
let projected = context.dot(&self.output_weight.t());
let projected = if let Some(ref bias) = self.output_bias {
projected + bias
} else {
projected
};
let residual = hidden + &projected;
let mut output = Array2::<f32>::zeros((seq, hidden_dim));
for t in 0..seq {
let normed = layer_norm(
&residual.row(t),
&self.output_ln_weight,
&self.output_ln_bias,
self.layer_norm_eps,
);
output.row_mut(t).assign(&normed);
}
Ok(output)
}
}
#[derive(Debug)]
struct CpuBertFfn {
intermediate_weight: Array2<f32>,
intermediate_bias: Option<Array1<f32>>,
output_weight: Array2<f32>,
output_bias: Option<Array1<f32>>,
output_ln_weight: Array1<f32>,
output_ln_bias: Array1<f32>,
layer_norm_eps: f32,
}
impl CpuBertFfn {
fn forward(&self, hidden: &Array2<f32>) -> Array2<f32> {
let seq = hidden.shape()[0];
let hidden_dim = hidden.shape()[1];
let intermediate = hidden.dot(&self.intermediate_weight.t());
let intermediate = if let Some(ref bias) = self.intermediate_bias {
intermediate + bias
} else {
intermediate
};
let activated = intermediate.mapv(gelu);
let output = activated.dot(&self.output_weight.t());
let output = if let Some(ref bias) = self.output_bias {
output + bias
} else {
output
};
let residual = hidden + &output;
let mut result = Array2::<f32>::zeros((seq, hidden_dim));
for t in 0..seq {
let normed = layer_norm(
&residual.row(t),
&self.output_ln_weight,
&self.output_ln_bias,
self.layer_norm_eps,
);
result.row_mut(t).assign(&normed);
}
result
}
}
/// One encoder layer: self-attention followed by the feed-forward sub-layer.
#[derive(Debug)]
struct CpuBertLayer {
    /// Attention sub-layer, run first.
    attention: CpuBertSelfAttention,
    /// Feed-forward sub-layer, run on the attention output.
    ffn: CpuBertFfn,
}

impl CpuBertLayer {
    /// Runs attention on `hidden` with `mask`, then the feed-forward
    /// sub-layer on the result.
    ///
    /// # Errors
    ///
    /// Propagates any error from the attention sub-layer.
    fn forward(&self, hidden: &Array2<f32>, mask: &Array1<f32>) -> crate::Result<Array2<f32>> {
        self.attention
            .forward(hidden, mask)
            .map(|after_attn| self.ffn.forward(&after_attn))
    }
}
/// Full encoder: the embedding stage followed by a stack of layers.
#[derive(Debug)]
struct CpuBertModel {
    /// Embedding stage that turns an encoding into the initial hidden states.
    embeddings: CpuBertEmbeddings,
    /// Encoder layers, applied in order.
    layers: Vec<CpuBertLayer>,
}

impl CpuBertModel {
    /// Runs one encoding through the embeddings and then every layer,
    /// returning the final hidden states (one row per token).
    ///
    /// # Errors
    ///
    /// Returns `crate::Error::Other` when the embedding stage yields no
    /// output, and propagates any error from a layer.
    fn forward(&self, encoding: &Encoding, mask: &Array1<f32>) -> crate::Result<Array2<f32>> {
        let embedded = self
            .embeddings
            .forward(std::slice::from_ref(encoding))
            .into_iter()
            .next()
            .ok_or_else(|| {
                crate::Error::Other(anyhow::anyhow!("embeddings produced empty output"))
            })?;
        self.layers
            .iter()
            .try_fold(embedded, |hidden, layer| layer.forward(&hidden, mask))
    }
}
fn load_classic_layer(
tensors: &SafeTensors<'_>,
i: i32,
config: &BertConfig,
) -> crate::Result<(CpuBertSelfAttention, CpuBertFfn)> {
let prefix = format!("encoder.layer.{i}");
let q_weight = load_tensor2(tensors, &format!("{prefix}.attention.self.query.weight"))?;
let k_weight = load_tensor2(tensors, &format!("{prefix}.attention.self.key.weight"))?;
let v_weight = load_tensor2(tensors, &format!("{prefix}.attention.self.value.weight"))?;
let qkv_weight = ndarray::concatenate(
Axis(0),
&[q_weight.view(), k_weight.view(), v_weight.view()],
)
.map_err(|e| crate::Error::Other(anyhow::anyhow!("QKV concat error layer {i}: {e}")))?;
let q_bias = try_load_tensor1(tensors, &format!("{prefix}.attention.self.query.bias"))?;
let k_bias = try_load_tensor1(tensors, &format!("{prefix}.attention.self.key.bias"))?;
let v_bias = try_load_tensor1(tensors, &format!("{prefix}.attention.self.value.bias"))?;
let qkv_bias = match (&q_bias, &k_bias, &v_bias) {
(Some(qb), Some(kb), Some(vb)) => Some(
ndarray::concatenate(Axis(0), &[qb.view(), kb.view(), vb.view()]).map_err(|e| {
crate::Error::Other(anyhow::anyhow!("QKV bias concat error layer {i}: {e}"))
})?,
),
_ => None,
};
let attention = CpuBertSelfAttention {
qkv_weight,
qkv_bias,
output_weight: load_tensor2(tensors, &format!("{prefix}.attention.output.dense.weight"))?,
output_bias: try_load_tensor1(tensors, &format!("{prefix}.attention.output.dense.bias"))?,
output_ln_weight: load_tensor1(
tensors,
&format!("{prefix}.attention.output.LayerNorm.weight"),
)?,
output_ln_bias: load_tensor1(
tensors,
&format!("{prefix}.attention.output.LayerNorm.bias"),
)?,
num_heads: config.num_attention_heads,
head_dim: config.hidden_size / config.num_attention_heads,
layer_norm_eps: config.layer_norm_eps,
};
let ffn = CpuBertFfn {
intermediate_weight: load_tensor2(tensors, &format!("{prefix}.intermediate.dense.weight"))?,
intermediate_bias: try_load_tensor1(tensors, &format!("{prefix}.intermediate.dense.bias"))?,
output_weight: load_tensor2(tensors, &format!("{prefix}.output.dense.weight"))?,
output_bias: try_load_tensor1(tensors, &format!("{prefix}.output.dense.bias"))?,
output_ln_weight: load_tensor1(tensors, &format!("{prefix}.output.LayerNorm.weight"))?,
output_ln_bias: load_tensor1(tensors, &format!("{prefix}.output.LayerNorm.bias"))?,
layer_norm_eps: config.layer_norm_eps,
};
Ok((attention, ffn))
}
impl CpuBertModel {
    /// Assembles the model from checkpoint tensors.
    ///
    /// Loads the embedding tables and their layer-norm parameters, checks
    /// that the word-embedding width equals `config.hidden_size`, then loads
    /// `config.num_hidden_layers` encoder layers via `load_classic_layer`.
    ///
    /// # Errors
    ///
    /// Returns `crate::Error::Other` when a tensor fails to load or when the
    /// word-embedding width disagrees with the config.
    #[expect(
        clippy::cast_possible_truncation,
        clippy::cast_sign_loss,
        clippy::cast_possible_wrap,
        reason = "hidden_size and num_layers are small positive ints from config"
    )]
    fn from_safetensors(tensors: &SafeTensors<'_>, config: &BertConfig) -> crate::Result<Self> {
        let word_embeddings = load_tensor2(tensors, "embeddings.word_embeddings.weight")?;
        let position_embeddings =
            load_tensor2(tensors, "embeddings.position_embeddings.weight")?;
        let token_type_embeddings =
            load_tensor2(tensors, "embeddings.token_type_embeddings.weight")?;
        let layer_norm_weight = load_tensor1(tensors, "embeddings.LayerNorm.weight")?;
        let layer_norm_bias = load_tensor1(tensors, "embeddings.LayerNorm.bias")?;

        let emb_dim = word_embeddings.shape()[1] as i32;
        if emb_dim != config.hidden_size {
            return Err(crate::Error::Other(anyhow::anyhow!(
                "model hidden_size mismatch: config says {} but word_embeddings has dim {}",
                config.hidden_size,
                emb_dim
            )));
        }

        let embeddings = CpuBertEmbeddings {
            word_embeddings,
            position_embeddings: Some(position_embeddings),
            token_type_embeddings: Some(token_type_embeddings),
            layer_norm_weight,
            layer_norm_bias,
            layer_norm_eps: config.layer_norm_eps,
        };

        let mut layers = Vec::with_capacity(config.num_hidden_layers as usize);
        for layer_idx in 0..config.num_hidden_layers {
            let layer = load_classic_layer(tensors, layer_idx, config)
                .map(|(attention, ffn)| CpuBertLayer { attention, ffn })?;
            layers.push(layer);
        }

        Ok(Self { embeddings, layers })
    }
}
/// CPU embedding backend wrapping a shared, immutable BERT model.
pub struct CpuBackend {
    /// The loaded model; shared between clones through the `Arc`.
    model: Arc<CpuBertModel>,
    /// `hidden_size` taken from the model config.
    hidden_size: i32,
    /// `max_position_embeddings` taken from the model config.
    max_position_embeddings: i32,
}

impl std::fmt::Debug for CpuBackend {
    /// Prints the two scalar fields only; the model weights are omitted and
    /// the output is marked as non-exhaustive.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut builder = f.debug_struct("CpuBackend");
        builder.field("hidden_size", &self.hidden_size);
        builder.field("max_position_embeddings", &self.max_position_embeddings);
        builder.finish_non_exhaustive()
    }
}
impl CpuBackend {
pub fn load(model_repo: &str, _device_hint: &DeviceHint) -> crate::Result<Self> {
let blas = super::blas_info::detect_blas();
let cpu = super::blas_info::detect_cpu_vendor();
tracing::info!("CPU backend: {} CPU, {} BLAS", cpu, blas);
if let Some(tip) = super::blas_info::recommend_blas() {
eprintln!("[ripvec] {tip}");
}
let api = Api::new().map_err(|e| crate::Error::Download(e.to_string()))?;
let repo = api.model(model_repo.to_string());
let config_path = repo
.get("config.json")
.map_err(|e| crate::Error::Download(e.to_string()))?;
let weights_path = repo
.get("model.safetensors")
.map_err(|e| crate::Error::Download(e.to_string()))?;
let model_bytes = std::fs::read(&weights_path).map_err(|e| crate::Error::Io {
path: weights_path.display().to_string(),
source: e,
})?;
let tensors = SafeTensors::deserialize(&model_bytes)
.map_err(|e| crate::Error::Other(anyhow::anyhow!("safetensors parse error: {e}")))?;
detect_variant(&tensors)?;
let config_str = std::fs::read_to_string(&config_path).map_err(|e| crate::Error::Io {
path: config_path.display().to_string(),
source: e,
})?;
let config_json: serde_json::Value = serde_json::from_str(&config_str)
.map_err(|e| crate::Error::Other(anyhow::anyhow!("config parse error: {e}")))?;
let config = BertConfig::from_json(&config_json)?;
let hidden_size = config.hidden_size;
let max_position_embeddings = config.max_position_embeddings;
let model = CpuBertModel::from_safetensors(&tensors, &config)?;
Ok(Self {
model: Arc::new(model),
hidden_size,
max_position_embeddings,
})
}
}
impl EmbedBackend for CpuBackend {
fn embed_batch(&self, encodings: &[Encoding]) -> crate::Result<Vec<Vec<f32>>> {
if encodings.is_empty() {
return Ok(vec![]);
}
let mut results = Vec::with_capacity(encodings.len());
for enc in encodings {
let mask = Array1::from_vec(
enc.attention_mask
.iter()
.map(|&m| if m == 1 { 0.0_f32 } else { -1e9_f32 })
.collect(),
);
let hidden = self.model.forward(enc, &mask)?;
let cls = hidden.row(0);
let norm = cls.mapv(|v| v * v).sum().sqrt().max(1e-12);
let normalized: Vec<f32> = cls.iter().map(|&v| v / norm).collect();
results.push(normalized);
}
Ok(results)
}
fn supports_clone(&self) -> bool {
true
}
fn clone_backend(&self) -> Box<dyn EmbedBackend> {
Box::new(Self {
model: Arc::clone(&self.model),
hidden_size: self.hidden_size,
max_position_embeddings: self.max_position_embeddings,
})
}
fn is_gpu(&self) -> bool {
false
}
#[expect(
clippy::cast_sign_loss,
reason = "max_position_embeddings is always positive from config"
)]
fn max_tokens(&self) -> usize {
self.max_position_embeddings as usize
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Model repository loaded by the backend tests.
    const BGE_SMALL: &str = "BAAI/bge-small-en-v1.5";

    /// Loads the shared test model through `CpuBackend::load`.
    fn load_backend() -> CpuBackend {
        CpuBackend::load(BGE_SMALL, &DeviceHint::Cpu).unwrap()
    }

    /// Four-token encoding reused by several tests.
    fn four_token_encoding() -> Encoding {
        Encoding {
            input_ids: vec![101, 7592, 2088, 102],
            attention_mask: vec![1; 4],
            token_type_ids: vec![0; 4],
        }
    }

    /// Dot product of two equally long slices.
    fn dot(a: &[f32], b: &[f32]) -> f32 {
        a.iter().zip(b.iter()).map(|(x, y)| x * y).sum()
    }

    #[test]
    fn config_from_json_classic() {
        let json: serde_json::Value = serde_json::json!({
            "hidden_size": 384,
            "num_hidden_layers": 12,
            "num_attention_heads": 12,
            "max_position_embeddings": 512,
            "layer_norm_eps": 1e-12
        });
        let config = BertConfig::from_json(&json).unwrap();
        assert_eq!(config.hidden_size, 384);
        assert_eq!(config.num_hidden_layers, 12);
        assert_eq!(config.num_attention_heads, 12);
        assert_eq!(config.max_position_embeddings, 512);
    }

    #[test]
    fn config_missing_key_errors() {
        let empty: serde_json::Value = serde_json::json!({});
        assert!(BertConfig::from_json(&empty).is_err());
    }

    #[test]
    fn cpu_backend_loads_model() {
        let backend = load_backend();
        assert_eq!(backend.hidden_size, 384);
        assert_eq!(backend.max_position_embeddings, 512);
        assert!(!backend.is_gpu());
        assert!(backend.supports_clone());
        assert_eq!(backend.max_tokens(), 512);
    }

    #[test]
    fn cpu_backend_embeddings_forward() {
        let backend = load_backend();
        let enc = Encoding {
            input_ids: vec![101, 2023, 2003, 1037, 3231, 102],
            attention_mask: vec![1; 6],
            token_type_ids: vec![0; 6],
        };
        let outputs = backend.model.embeddings.forward(&[enc]);
        assert_eq!(outputs.len(), 1);
        assert_eq!(outputs[0].shape(), &[6, 384]);
        let sum: f32 = outputs[0].iter().map(|v| v.abs()).sum();
        assert!(sum > 0.0, "embeddings output should not be all zeros");
    }

    #[test]
    fn cpu_backend_clone() {
        let cloned = load_backend().clone_backend();
        assert!(!cloned.is_gpu());
        assert!(cloned.supports_clone());
        assert_eq!(cloned.max_tokens(), 512);
    }

    #[test]
    fn cpu_backend_full_forward_output_dim() {
        let backend = load_backend();
        let result = backend.embed_batch(&[four_token_encoding()]).unwrap();
        assert_eq!(result.len(), 1);
        assert_eq!(
            result[0].len(),
            384,
            "BGE-small should produce 384-dim embeddings"
        );
    }

    #[test]
    fn cpu_backend_full_forward_l2_norm() {
        let backend = load_backend();
        let result = backend.embed_batch(&[four_token_encoding()]).unwrap();
        let norm: f32 = result[0].iter().map(|v| v * v).sum::<f32>().sqrt();
        assert!(
            (norm - 1.0).abs() < 1e-4,
            "L2 norm should be ~1.0, got {norm}"
        );
    }

    #[test]
    fn cpu_backend_different_inputs_differ() {
        let backend = load_backend();
        let enc1 = four_token_encoding();
        let enc2 = Encoding {
            input_ids: vec![101, 19387, 8840, 4313, 102],
            attention_mask: vec![1; 5],
            token_type_ids: vec![0; 5],
        };
        let results = backend.embed_batch(&[enc1, enc2]).unwrap();
        assert_eq!(results.len(), 2);
        // Outputs are unit vectors, so the dot product is the cosine similarity.
        let dot: f32 = dot(&results[0], &results[1]);
        assert!(
            dot < 0.99,
            "different inputs should produce different embeddings, cosine sim = {dot}"
        );
    }

    #[test]
    fn cpu_backend_empty_batch() {
        let backend = load_backend();
        let result = backend.embed_batch(&[]).unwrap();
        assert!(result.is_empty(), "empty batch should return empty vec");
    }
}