use crate::embed::{EmbedKind, Embedder};
use fastembed::{EmbeddingModel, InitOptions, TextEmbedding};
/// Instruction text that `FastEmbedder::embed` prepends to each input when a BGE model is
/// embedding `EmbedKind::Query` texts. Every other combination is embedded without it.
const BGE_QUERY_PREFIX: &str = "Represent this sentence for searching relevant passages: ";
/// `Embedder` implementation backed by a `fastembed` `TextEmbedding` model.
pub struct FastEmbedder {
/// Loaded `fastembed` model that produces the vectors.
model: TextEmbedding,
/// Model name this embedder was created for; returned verbatim by `id`.
tag: String,
/// `true` for the BGE models, whose query texts get `BGE_QUERY_PREFIX` prepended.
bge: bool,
}
/// Resolves a model name to its `fastembed` model variant plus a BGE flag.
///
/// The flag is `true` for the two BGE names and `false` for the two MiniLM names. The name
/// must match one of the four listed strings exactly; anything else yields `None`.
fn model_for(model: &str) -> Option<(EmbeddingModel, bool)> {
    let resolved = match model {
        "bge-small-en-v1.5" => (EmbeddingModel::BGESmallENV15, true),
        "bge-base-en-v1.5" => (EmbeddingModel::BGEBaseENV15, true),
        "all-MiniLM-L6-v2-q" => (EmbeddingModel::AllMiniLML6V2Q, false),
        "all-MiniLM-L6-v2" => (EmbeddingModel::AllMiniLML6V2, false),
        // Unknown name: nothing to resolve.
        _ => return None,
    };
    Some(resolved)
}
impl FastEmbedder {
    /// Reports whether `model` is a name that `model_for` can resolve.
    ///
    /// Only the name is inspected; no model is initialised.
    pub fn recognized(model: &str) -> bool {
        model_for(model).is_some()
    }

    /// Builds an embedder for the model called `model`.
    ///
    /// Returns `Ok(None)` when the name is not recognised, without attempting to initialise
    /// anything. Otherwise the model is initialised with its cache directory set to
    /// `crate::paths::model_cache_dir()` and returned as `Ok(Some(_))`.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by `TextEmbedding::try_new`.
    pub fn try_for(model: &str) -> anyhow::Result<Option<Self>> {
        model_for(model)
            .map(|(variant, bge)| -> anyhow::Result<Self> {
                let options =
                    InitOptions::new(variant).with_cache_dir(crate::paths::model_cache_dir());
                let engine = TextEmbedding::try_new(options)?;
                Ok(Self {
                    model: engine,
                    tag: model.to_string(),
                    bge,
                })
            })
            // Option<Result<Self>> -> Result<Option<Self>>
            .transpose()
    }
}
impl Embedder for FastEmbedder {
    /// Returns a copy of the model name this embedder was created with.
    fn id(&self) -> String {
        self.tag.clone()
    }

    /// Embeds `texts` with the wrapped model.
    ///
    /// When this embedder is flagged as BGE and `kind` is `EmbedKind::Query`, every text is
    /// first prefixed with `BGE_QUERY_PREFIX`; in all other cases the texts are passed through
    /// unchanged. The model is invoked with `None` as its second argument.
    ///
    /// # Errors
    ///
    /// Returns whatever error the underlying model's `embed` call produces.
    fn embed(&self, texts: &[String], kind: EmbedKind) -> anyhow::Result<Vec<Vec<f32>>> {
        let needs_prefix = self.bge && kind == EmbedKind::Query;
        let inputs: Vec<String> = if needs_prefix {
            texts
                .iter()
                .map(|text| [BGE_QUERY_PREFIX, text.as_str()].concat())
                .collect()
        } else {
            texts.to_vec()
        };
        self.model.embed(inputs, None)
    }

    /// Returns the fixed value `0.64`, the same for every supported model.
    fn min_similarity(&self) -> f32 {
        0.64
    }

    /// Returns the fixed value `0.12`, the same for every supported model.
    fn score_margin(&self) -> f32 {
        0.12
    }
}