#![cfg(feature = "default-embedder")]
#![allow(
clippy::expect_used,
clippy::unwrap_used,
clippy::missing_panics_doc,
clippy::float_cmp,
clippy::panic,
clippy::print_stderr,
clippy::similar_names
)]
use fathomdb::{BuiltinBgeSmallEmbedder, QueryEmbedder};
/// Checks the identity metadata reported by the built-in embedder: the model
/// name, a 384-wide output dimension, the `"l2"` normalization policy, and a
/// non-empty model version string.
#[test]
fn builtin_embedder_identity_reports_bge_small() {
    let bge = BuiltinBgeSmallEmbedder::new();
    let identity = bge.identity();

    // Exact-match fields.
    assert_eq!(identity.model_identity, "BAAI/bge-small-en-v1.5");
    assert_eq!(identity.dimension, 384);
    assert_eq!(identity.normalization_policy, "l2");

    // The version only has to be present; its value is not pinned here.
    assert!(!identity.model_version.is_empty());
}
/// Embeds one query and checks that the result has 384 components and an
/// L2 norm within `1e-4` of 1.0.
///
/// If the embedder reports `EmbedderError::Unavailable`, the test logs the
/// reason to stderr and returns early instead of failing. Any other error
/// panics.
#[test]
fn builtin_embedder_produces_384_dim_l2_normalized_vector() {
    let bge = BuiltinBgeSmallEmbedder::new();
    let embedding = match bge.embed_query("ship the quarterly docs") {
        Err(fathomdb::EmbedderError::Unavailable(msg)) => {
            eprintln!("skipping: builtin embedder unavailable (likely offline sandbox): {msg}");
            return;
        }
        Err(e) => panic!("unexpected error: {e}"),
        Ok(v) => v,
    };
    assert_eq!(embedding.len(), 384);

    // Euclidean length: square root of the sum of squared components.
    let norm = embedding.iter().map(|x| x * x).sum::<f32>().sqrt();
    assert!(
        (norm - 1.0).abs() < 1e-4,
        "expected L2 norm ~1.0, got {norm}"
    );
}
/// Embeds a sentence, a paraphrase of it, and an unrelated sentence, then
/// checks that the paraphrase pair scores at least 0.1 higher than the
/// unrelated pair according to this file's `cosine` helper, and that the
/// paraphrase score lies within `[-1.0001, 1.0001]`.
///
/// If the first embed reports `EmbedderError::Unavailable`, the test logs the
/// reason to stderr and returns early, so a skipped run is visible in the
/// test output. Once the first embed has succeeded, later embed failures are
/// treated as real failures via `expect`.
///
/// NOTE(review): the test name refers to CLS-vs-mean pooling; presumably the
/// wrong pooling strategy would collapse the similarity margin — confirm
/// against the embedder implementation.
#[test]
fn builtin_embedder_pooling_is_cls_not_mean() {
    let embedder = BuiltinBgeSmallEmbedder::new();
    let a = match embedder.embed_query("The cat sat on the mat.") {
        Ok(v) => v,
        Err(fathomdb::EmbedderError::Unavailable(msg)) => {
            eprintln!("skipping: builtin embedder unavailable (likely offline sandbox): {msg}");
            return;
        }
        Err(e) => panic!("unexpected: {e}"),
    };
    let b = embedder
        .embed_query("A cat was resting on a mat.")
        .expect("paraphrase embed");
    let c = embedder
        .embed_query("Quantum chromodynamics describes the strong force.")
        .expect("unrelated embed");

    let sim_ab = cosine(&a, &b);
    let sim_ac = cosine(&a, &c);
    assert!(
        sim_ab > sim_ac + 0.1,
        "paraphrases should be clearly closer than unrelated; sim_ab={sim_ab}, sim_ac={sim_ac}"
    );
    // Sanity bound on the similarity score, with a small float tolerance.
    assert!((-1.0001..=1.0001).contains(&sim_ab));
}
/// Embeds the same query twice with one embedder instance and requires the
/// two vectors to have equal length and bit-identical components.
///
/// If the first embed reports `EmbedderError::Unavailable`, the test logs the
/// reason to stderr and returns early, so a skipped run is visible in the
/// test output. A failure of the second call is a real failure.
#[test]
fn builtin_embedder_deterministic_across_calls() {
    let embedder = BuiltinBgeSmallEmbedder::new();
    let first = match embedder.embed_query("deterministic test") {
        Ok(v) => v,
        Err(fathomdb::EmbedderError::Unavailable(msg)) => {
            eprintln!("skipping: builtin embedder unavailable (likely offline sandbox): {msg}");
            return;
        }
        Err(e) => panic!("unexpected: {e}"),
    };
    let second = embedder
        .embed_query("deterministic test")
        .expect("second call");

    assert_eq!(first.len(), second.len());
    for (i, (a, b)) in first.iter().zip(second.iter()).enumerate() {
        // Compare raw bit patterns rather than float values: this is stricter
        // than `==` (it distinguishes 0.0 from -0.0) and allows no tolerance.
        assert_eq!(
            a.to_bits(),
            b.to_bits(),
            "component {i} differs: {a} vs {b}"
        );
    }
}
/// Points `HF_HOME` at a fresh temporary directory, sets `HF_HUB_OFFLINE=1`,
/// and then attempts an embed.
///
/// Accepted outcomes:
/// - `EmbedderError::Unavailable`: passes silently.
/// - `Ok(_)`: tolerated, but a warning is printed to stderr.
///
/// Any other error panics. The test is `#[ignore]`d by default; see the
/// ignore reason on the attribute.
#[test]
#[ignore = "requires sandboxed offline network; hf-hub HF_HUB_OFFLINE is best-effort"]
fn builtin_embedder_offline_without_cache_returns_unavailable() {
    let cache_dir = tempfile::tempdir().expect("tempdir");

    // SAFETY: NOTE(review) — `std::env::set_var` requires that no other thread
    // reads or writes the process environment concurrently. Presumably this
    // ignored test is run on its own; confirm. The variables are not restored
    // afterwards, so they stay set for the remainder of the process.
    unsafe {
        std::env::set_var("HF_HOME", cache_dir.path());
        std::env::set_var("HF_HUB_OFFLINE", "1");
    }

    let bge = BuiltinBgeSmallEmbedder::new();
    let outcome = bge.embed_query("will not resolve");
    match outcome {
        Err(fathomdb::EmbedderError::Unavailable(_)) => {}
        Err(other) => panic!("expected Unavailable on offline load failure, got {other:?}"),
        Ok(_) => eprintln!(
            "warning: hf-hub downloaded weights despite HF_HUB_OFFLINE=1; host is not truly offline"
        ),
    }
}
/// Cosine similarity of two equal-length vectors: `dot(a, b) / (|a| * |b|)`.
///
/// The result is normalized by both vector lengths, so it does not depend on
/// the inputs already being unit-norm. If either vector has zero length
/// (including the empty-slice case), the similarity is undefined and `0.0`
/// is returned instead of dividing by zero.
///
/// # Panics
///
/// Panics if `a` and `b` have different lengths.
fn cosine(a: &[f32], b: &[f32]) -> f32 {
    assert_eq!(a.len(), b.len());

    // Single pass accumulating the dot product and both squared norms.
    let (dot, norm_a_sq, norm_b_sq) = a.iter().zip(b.iter()).fold(
        (0.0_f32, 0.0_f32, 0.0_f32),
        |(dot, na, nb), (x, y)| (dot + x * y, na + x * x, nb + y * y),
    );

    // Take the square roots separately so the product of the squared norms
    // cannot overflow or underflow before the root is applied.
    let denom = norm_a_sq.sqrt() * norm_b_sq.sqrt();
    if denom == 0.0 {
        return 0.0;
    }
    dot / denom
}