1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214
use kalosm_common::BoxedFuture;
use crate::embedding::{Embedding, VectorSpace};
use crate::UnknownVectorSpace;
/// A text-embedding model.
///
/// The trait is generic over the vector space the model embeds into (see
/// [`Embedder::VectorSpace`]), which helps keep track of which embeddings came from which model.
///
/// Only [`Embedder::embed_for`] has to be implemented; every other method has a default
/// implementation built on top of it.
///
/// # Example
///
/// ```rust, no_run
/// use kalosm_language_model::Embedder;
/// use kalosm::language::*;
///
/// #[tokio::main]
/// async fn main() {
///     // Bert implements Embedder
///     let mut bert = Bert::new().await.unwrap();
///     let sentences = [
///         "Cats are cool",
///         "The geopolitical situation is dire",
///         "Pets are great",
///         "Napoleon was a tyrant",
///         "Napoleon was a great general",
///     ];
///     // Embed a batch of documents into the bert vector space
///     let embeddings = bert.embed_batch(sentences).await.unwrap();
///     println!("embeddings {:?}", embeddings);
/// }
/// ```
pub trait Embedder: Send + Sync + 'static {
    /// The vector space that embeddings produced by this model belong to.
    type VectorSpace: VectorSpace + Send + Sync + 'static;

    /// Embed a single string.
    ///
    /// The default implementation forwards to [`Embedder::embed_for`] with the text tagged as
    /// [`EmbeddingVariant::Document`].
    fn embed_string(
        &self,
        input: String,
    ) -> BoxedFuture<'_, anyhow::Result<Embedding<Self::VectorSpace>>> {
        let document = EmbeddingInput {
            text: input,
            variant: EmbeddingVariant::Document,
        };
        self.embed_for(document)
    }

    /// Embed a list of strings. The returned embeddings are in the same order as the inputs.
    ///
    /// The default implementation awaits [`Embedder::embed_string`] for one input at a time and
    /// returns the first error it encounters.
    fn embed_vec(
        &self,
        inputs: Vec<String>,
    ) -> BoxedFuture<'_, anyhow::Result<Vec<Embedding<Self::VectorSpace>>>> {
        Box::pin(async move {
            let mut results = Vec::with_capacity(inputs.len());
            for text in inputs {
                let embedding = self.embed_string(text).await?;
                results.push(embedding);
            }
            Ok(results)
        })
    }

    /// Embed a single [`EmbeddingInput`] (text plus the requested [`EmbeddingVariant`]).
    ///
    /// This is the only method without a default implementation.
    fn embed_for(
        &self,
        input: EmbeddingInput,
    ) -> BoxedFuture<'_, anyhow::Result<Embedding<Self::VectorSpace>>>;

    /// Embed a list of [`EmbeddingInput`]s. The returned embeddings are in the same order as the
    /// inputs.
    ///
    /// The default implementation awaits [`Embedder::embed_for`] for one input at a time and
    /// returns the first error it encounters.
    fn embed_vec_for(
        &self,
        inputs: Vec<EmbeddingInput>,
    ) -> BoxedFuture<'_, anyhow::Result<Vec<Embedding<Self::VectorSpace>>>> {
        Box::pin(async move {
            let mut results = Vec::with_capacity(inputs.len());
            for request in inputs {
                let embedding = self.embed_for(request).await?;
                results.push(embedding);
            }
            Ok(results)
        })
    }
}
/// The input to an embedding model: the text to embed together with the kind of embedding
/// that is requested for it.
///
/// `Serialize`/`Deserialize` are derived only when the `serde` feature is enabled.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct EmbeddingInput {
/// The text to embed.
pub text: String,
/// Whether the text should be embedded as a query or as a document (see [`EmbeddingVariant`]).
pub variant: EmbeddingVariant,
}
impl EmbeddingInput {
    /// Build an embedding input from anything that implements [`ToString`], tagged with the
    /// given `variant`.
    pub fn new(text: impl ToString, variant: EmbeddingVariant) -> Self {
        let text = text.to_string();
        Self { text, variant }
    }
}
/// The type of embedding the model should output. For models that output different embeddings
/// for queries and documents, this selects which of the two is requested.
///
/// For most models, the variant will not affect the output.
///
/// The default variant is [`EmbeddingVariant::Document`].
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum EmbeddingVariant {
/// The model should output an embedding for a query.
Query,
/// The model should output an embedding for documents.
#[default]
Document,
}
/// An extension trait for [`Embedder`] with helper methods for iterators, and types that can be converted into a string.
///
/// This trait is automatically implemented for any item that implements [`Embedder`].
pub trait EmbedderExt: Embedder {
/// Convert this embedder into an embedder trait object.
fn into_any_embedder(self) -> DynEmbedder
where
Self: Sized,
{
Box::new(AnyEmbedder::<Self>(self))
}
/// Embed some text into a vector space
fn embed(
&self,
input: impl ToString,
) -> BoxedFuture<'_, anyhow::Result<Embedding<Self::VectorSpace>>> {
self.embed_string(input.to_string())
}
/// Embed a query into a vector space
fn embed_query(
&self,
input: impl ToString,
) -> BoxedFuture<'_, anyhow::Result<Embedding<Self::VectorSpace>>> {
self.embed_for(EmbeddingInput {
text: input.to_string(),
variant: EmbeddingVariant::Query,
})
}
/// Embed a batch of text into a vector space. Returns a list of embeddings in the same order as the inputs.
fn embed_batch(
&self,
inputs: impl IntoIterator<Item = impl ToString>,
) -> BoxedFuture<'_, anyhow::Result<Vec<Embedding<Self::VectorSpace>>>> {
let inputs = inputs
.into_iter()
.map(|s| s.to_string())
.collect::<Vec<_>>();
self.embed_vec(inputs)
}
/// Embed a batch of [`EmbeddingInput`] into a vector space. Returns a list of embeddings in the same order as the inputs.
fn embed_batch_for(
&self,
inputs: impl IntoIterator<Item = EmbeddingInput>,
) -> BoxedFuture<'_, anyhow::Result<Vec<Embedding<Self::VectorSpace>>>> {
self.embed_vec_for(inputs.into_iter().collect())
}
}
// Blanket implementation: every `Embedder` gets the `EmbedderExt` helpers, using only the
// trait's default method bodies.
impl<E: Embedder> EmbedderExt for E {}
/// A boxed trait object for an embedder whose vector space is [`UnknownVectorSpace`].
///
/// [`EmbedderExt::into_any_embedder`] returns this type.
pub type DynEmbedder = Box<dyn Embedder<VectorSpace = UnknownVectorSpace>>;
/// Adapter that wraps an embedder `E` and reports its vector space as [`UnknownVectorSpace`].
///
/// The struct itself carries no bounds; the `Embedder` requirement lives on the impl below.
struct AnyEmbedder<E>(E);

// `Send + Sync + 'static` are supertraits of `Embedder`, so `E: Embedder` already implies them.
//
// Every method is forwarded explicitly (rather than relying on the trait defaults) so that the
// wrapped embedder's own implementation of that method is the one that runs. In each method the
// inner future is created before the `async` block, and its output is converted with `cast()`.
impl<E: Embedder> Embedder for AnyEmbedder<E> {
    type VectorSpace = UnknownVectorSpace;

    /// Forward to the wrapped embedder's `embed_string` and cast the resulting embedding.
    fn embed_string(
        &self,
        input: String,
    ) -> BoxedFuture<'_, anyhow::Result<Embedding<UnknownVectorSpace>>> {
        let future = self.0.embed_string(input);
        Box::pin(async move { future.await.map(|embedding| embedding.cast()) })
    }

    /// Forward to the wrapped embedder's `embed_vec` and cast every resulting embedding.
    fn embed_vec(
        &self,
        inputs: Vec<String>,
    ) -> BoxedFuture<'_, anyhow::Result<Vec<Embedding<UnknownVectorSpace>>>> {
        let future = self.0.embed_vec(inputs);
        Box::pin(async move {
            future.await.map(|embeddings| {
                embeddings
                    .into_iter()
                    .map(|embedding| embedding.cast())
                    .collect()
            })
        })
    }

    /// Forward to the wrapped embedder's `embed_for` and cast the resulting embedding.
    fn embed_for(
        &self,
        input: EmbeddingInput,
    ) -> BoxedFuture<'_, anyhow::Result<Embedding<UnknownVectorSpace>>> {
        let future = self.0.embed_for(input);
        Box::pin(async move { future.await.map(|embedding| embedding.cast()) })
    }

    /// Forward to the wrapped embedder's `embed_vec_for` and cast every resulting embedding.
    fn embed_vec_for(
        &self,
        inputs: Vec<EmbeddingInput>,
    ) -> BoxedFuture<'_, anyhow::Result<Vec<Embedding<UnknownVectorSpace>>>> {
        let future = self.0.embed_vec_for(inputs);
        Box::pin(async move {
            future.await.map(|embeddings| {
                embeddings
                    .into_iter()
                    .map(|embedding| embedding.cast())
                    .collect()
            })
        })
    }
}