aleph_alpha_api/embedding.rs

use super::completion::{Hosting, Prompt};
use crate::impl_builder_methods;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;

#[derive(Serialize, Debug, Default)]
pub struct EmbeddingRequest {
    /// Name of the model to use. A model name refers to a model's architecture (number of parameters, among others). The most recent version of the model is always used. The model output contains information as to the model version.
    pub model: String,

    /// Possible values: [aleph-alpha, None]
    /// Optional parameter that specifies which datacenters may process the request. You can either set the
    /// parameter to "aleph-alpha" or omit it (defaulting to null).
    /// Not setting this value, or setting it to None, gives us maximal flexibility in processing your
    /// request in our own datacenters and on servers hosted with other providers. Choose this option for
    /// maximum availability.
    /// Setting it to "aleph-alpha" allows us to only process the request in our own datacenters. Choose this
    /// option for maximal data privacy.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub hosting: Option<Hosting>,

    /// This field is used to send prompts to the model. A prompt can either be a text prompt or a multimodal prompt. A text prompt is a string of text. A multimodal prompt is an array of prompt items. It can be a combination of text, images, and token ID arrays.
    /// In the case of a multimodal prompt, the prompt items will be concatenated and a single prompt will be used for the model.
    /// Tokenization:
    /// - Token ID arrays are used as-is.
    /// - Text prompt items are tokenized using the tokenizers specific to the model.
    /// - Each image is converted into 144 tokens.
    pub prompt: Prompt,

    /// A list of layer indices from which to return embeddings.
    /// - Index 0 corresponds to the word embeddings used as input to the first transformer layer
    /// - Index 1 corresponds to the hidden state as output by the first transformer layer, index 2 to the output of the second layer etc.
    /// - Index -1 corresponds to the last transformer layer (not the language modelling head), index -2 to the second last
    pub layers: Vec<i32>,

    /// Flag indicating whether the tokenized prompt is to be returned (`true`) or not (`false`)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tokens: Option<bool>,

    /// Pooling operation to use. Pooling operations include:
    /// - "mean": Aggregate token embeddings across the sequence dimension using an average.
    /// - "weighted_mean": Position weighted mean across sequence dimension with latter tokens having a higher weight.
    /// - "max": Aggregate token embeddings across the sequence dimension using a maximum.
    /// - "last_token": Use the last token.
    /// - "abs_max": Aggregate token embeddings across the sequence dimension using a maximum of absolute values.
    pub pooling: Vec<String>,

    /// Explicitly set embedding type to be passed to the model. This parameter was created to allow for semantic_embed embeddings and will be deprecated. Please use the semantic_embed-endpoint instead.
    #[serde(rename = "type", skip_serializing_if = "Option::is_none")]
    // type is a reserved word in Rust
    pub embedding_type: Option<String>,

    /// Return normalized embeddings. This can be used to save on additional compute when applying a cosine similarity metric.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub normalize: Option<bool>,

    /// If set to `None`, attention control parameters only apply to those tokens that have explicitly been set
    /// in the request. If set to a non-null value, we apply the control parameters to similar tokens as
    /// well. Controls that have been applied to one token will then be applied to all other tokens that have
    /// at least the similarity score defined by this parameter. The similarity score is the cosine
    /// similarity of token embeddings.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub contextual_control_threshold: Option<f64>,

    /// `true`: apply controls on prompt items by adding the `log(control_factor)` to attention scores.
    /// `false`: apply controls on prompt items by `(attention_scores - -attention_scores.min(-1)) * control_factor`
    #[serde(skip_serializing_if = "Option::is_none")]
    pub control_log_additive: Option<bool>,
}

impl EmbeddingRequest {
    pub fn from_text(
        model: impl Into<String>,
        prompt: impl Into<String>,
        layer: i32,
        pooling: impl Into<String>,
        normalize: bool,
    ) -> Self {
        Self {
            model: model.into(),
            prompt: Prompt::from_text(prompt),
            layers: vec![layer],
            pooling: vec![pooling.into()],
            normalize: Some(normalize),
            ..Self::default()
        }
    }
}

impl_builder_methods!(
    EmbeddingRequest,
    tokens: bool,
    embedding_type: String,
    normalize: bool,
    contextual_control_threshold: f64,
    control_log_additive: bool
);
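
// A minimal usage sketch for `EmbeddingRequest`, assuming `serde_json` is
// available in this crate; the model name, prompt, layer and pooling values
// below are purely illustrative.
#[cfg(test)]
mod embedding_request_example {
    use super::*;

    #[test]
    fn builds_and_serializes_a_text_request() {
        let request =
            EmbeddingRequest::from_text("luminous-base", "An apple a day", 0, "mean", true);

        // Optional fields left as `None` are omitted from the payload because of
        // `skip_serializing_if = "Option::is_none"`.
        let json = serde_json::to_string(&request).expect("request serializes");
        assert!(json.contains("\"layers\":[0]"));
        assert!(!json.contains("\"hosting\""));
    }
}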

type Embedding = Vec<f32>;
type PoolingEmbeddings = HashMap<String, Embedding>;
type LayerEmbeddings = HashMap<String, PoolingEmbeddings>;

#[derive(Deserialize, Debug)]
pub struct EmbeddingResponse {
    /// model name and version (if any) of the used model for inference
    pub model_version: String,

    /// embeddings:
    /// - pooling: a dict with layer names as keys and pooling outputs as values. A pooling output is a dict with the pooling operation as key and a pooled embedding (list of floats) as value.
    pub embeddings: LayerEmbeddings,

    pub tokens: Option<Vec<String>>,
}
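
// A sketch of how the nested layer -> pooling -> vector layout of
// `EmbeddingResponse` can be navigated. The JSON below is hand-written for
// illustration; the key names ("layer_0", "mean") are assumptions about the
// API's naming, and `serde_json` is assumed to be available.
#[cfg(test)]
mod embedding_response_example {
    use super::*;

    #[test]
    fn reads_a_pooled_embedding_for_a_layer() {
        let json = r#"{
            "model_version": "illustrative-version",
            "embeddings": { "layer_0": { "mean": [0.1, 0.2, 0.3] } },
            "tokens": null
        }"#;

        let response: EmbeddingResponse =
            serde_json::from_str(json).expect("response deserializes");
        // Outer map: layer name; inner map: pooling operation; value: the embedding.
        let mean_of_layer_0 = &response.embeddings["layer_0"]["mean"];
        assert_eq!(mean_of_layer_0.len(), 3);
    }
}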

/// Type of embedding representation to embed the prompt with.
///
/// `"symmetric"`: Symmetric embeddings assume that the text to be compared is interchangeable. Usage examples for symmetric embeddings are clustering, classification, anomaly detection or visualisation tasks. "symmetric" embeddings should be compared with other "symmetric" embeddings.
///
/// `"document"` and `"query"`: Asymmetric embeddings assume that there is a difference between queries and documents. They are used together in use cases such as search where you want to compare shorter queries against larger documents.
///
/// `"query"`-embeddings are optimized for shorter texts, such as questions or keywords.
///
/// `"document"`-embeddings are optimized for larger pieces of text to compare queries against.
#[derive(Serialize, Debug)]
#[serde(rename_all = "snake_case")]
pub enum EmbeddingRepresentation {
    Symmetric,
    Document,
    Query,
}

impl Default for EmbeddingRepresentation {
    fn default() -> Self {
        EmbeddingRepresentation::Symmetric
    }
}
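
// Small check that the variants serialize to the lowercase strings implied by
// `rename_all = "snake_case"`; `serde_json` is assumed to be available.
#[cfg(test)]
mod embedding_representation_example {
    use super::*;

    #[test]
    fn serializes_as_snake_case_strings() {
        let as_json = |r: &EmbeddingRepresentation| serde_json::to_string(r).unwrap();
        assert_eq!(as_json(&EmbeddingRepresentation::Symmetric), "\"symmetric\"");
        assert_eq!(as_json(&EmbeddingRepresentation::Document), "\"document\"");
        assert_eq!(as_json(&EmbeddingRepresentation::Query), "\"query\"");
    }
}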

/// Embeds a prompt using a specific model and semantic embedding method. The resulting vectors can be used for downstream tasks (e.g. semantic similarity) and models (e.g. classifiers).
#[derive(Serialize, Debug, Default)]
pub struct SemanticEmbeddingRequest {
    /// Name of the model to use. A model name refers to a model's architecture (number of parameters among others). The most recent version of the model is always used. The model output contains information as to the model version. To create semantic embeddings, please use `luminous-base`.
    pub model: String,

    /// Possible values: [aleph-alpha, None]
    /// Optional parameter that specifies which datacenters may process the request. You can either set the
    /// parameter to "aleph-alpha" or omit it (defaulting to null).
    /// Not setting this value, or setting it to None, gives us maximal flexibility in processing your
    /// request in our own datacenters and on servers hosted with other providers. Choose this option for
    /// maximum availability.
    /// Setting it to "aleph-alpha" allows us to only process the request in our own datacenters. Choose this
    /// option for maximal data privacy.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub hosting: Option<Hosting>,

    /// This field is used to send prompts to the model. A prompt can either be a text prompt or a multimodal prompt. A text prompt is a string of text. A multimodal prompt is an array of prompt items. It can be a combination of text, images, and token ID arrays.
    /// In the case of a multimodal prompt, the prompt items will be concatenated and a single prompt will be used for the model.
    /// Tokenization:
    /// - Token ID arrays are used as-is.
    /// - Text prompt items are tokenized using the tokenizers specific to the model.
    /// - Each image is converted into 144 tokens.
    pub prompt: Prompt,

    /// Type of embedding representation to embed the prompt with.
    pub representation: EmbeddingRepresentation,

    /// The default behavior is to return the full embedding with 5120 dimensions. With this parameter you can compress the returned embedding to 128 dimensions.
    /// The compression is expected to result in a small drop in accuracy performance (4-6%), with the benefit of being much smaller, which makes comparing these embeddings much faster for use cases where speed is critical.
    /// The compressed embeddings can also perform better if you are embedding really short texts or documents.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub compress_to_size: Option<i32>,

    /// Return normalized embeddings. This can be used to save on additional compute when applying a cosine similarity metric.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub normalize: Option<bool>,

    /// If set to `null`, attention control parameters only apply to those tokens that have explicitly been set in the request.
    /// If set to a non-null value, we apply the control parameters to similar tokens as well.
    /// Controls that have been applied to one token will then be applied to all other tokens
    /// that have at least the similarity score defined by this parameter.
    /// The similarity score is the cosine similarity of token embeddings.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub contextual_control_threshold: Option<f64>,

    /// `true`: apply controls on prompt items by adding the `log(control_factor)` to attention scores.
    /// `false`: apply controls on prompt items by `(attention_scores - -attention_scores.min(-1)) * control_factor`
    #[serde(skip_serializing_if = "Option::is_none")]
    pub control_log_additive: Option<bool>,
}

impl_builder_methods!(
    SemanticEmbeddingRequest,
    hosting: Hosting,
    compress_to_size: i32,
    normalize: bool,
    contextual_control_threshold: f64,
    control_log_additive: bool
);
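
// A sketch of the asymmetric use case: the document side and the query side
// are embedded with different representations but the same model and
// compression settings. The model name and prompt texts are illustrative only.
#[cfg(test)]
mod semantic_embedding_request_example {
    use super::*;

    #[test]
    fn builds_document_and_query_requests() {
        let document = SemanticEmbeddingRequest {
            model: "luminous-base".into(),
            prompt: Prompt::from_text("A longer product description to be indexed."),
            representation: EmbeddingRepresentation::Document,
            compress_to_size: Some(128),
            ..Default::default()
        };
        let query = SemanticEmbeddingRequest {
            model: "luminous-base".into(),
            prompt: Prompt::from_text("product description"),
            representation: EmbeddingRepresentation::Query,
            compress_to_size: Some(128),
            ..Default::default()
        };

        // Both sides must agree on the compression setting so the resulting
        // vectors have the same dimensionality and can be compared.
        assert_eq!(document.compress_to_size, query.compress_to_size);
    }
}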

#[derive(Deserialize, Debug)]
pub struct SemanticEmbeddingResponse {
    /// model name and version (if any) of the used model for inference
    pub model_version: String,

    /// A list of floats that can be used to compare against other embeddings.
    pub embedding: Embedding,
}
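
// The documentation above notes that `normalize: Some(true)` yields unit-length
// embeddings, in which case cosine similarity reduces to a plain dot product.
// A minimal sketch of that comparison on toy vectors:
#[cfg(test)]
mod semantic_similarity_example {
    #[test]
    fn dot_product_of_normalized_vectors_is_their_cosine_similarity() {
        // Dot product of two equally sized vectors.
        fn dot(a: &[f32], b: &[f32]) -> f32 {
            a.iter().zip(b).map(|(x, y)| x * y).sum()
        }

        // Two toy vectors that are already unit length.
        let a = [1.0_f32, 0.0];
        let b = [0.6_f32, 0.8];
        assert!((dot(&a, &b) - 0.6).abs() < 1e-6);
    }
}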

#[derive(Serialize, Debug, Default)]
pub struct BatchSemanticEmbeddingRequest {
    /// Name of the model to use. A model name refers to a model's architecture (number of parameters among others). The most recent version of the model is always used. The model output contains information as to the model version. To create semantic embeddings, please use `luminous-base`.
    pub model: String,

    /// Possible values: [aleph-alpha, None]
    /// Optional parameter that specifies which datacenters may process the request. You can either set the
    /// parameter to "aleph-alpha" or omit it (defaulting to null).
    /// Not setting this value, or setting it to None, gives us maximal flexibility in processing your
    /// request in our own datacenters and on servers hosted with other providers. Choose this option for
    /// maximum availability.
    /// Setting it to "aleph-alpha" allows us to only process the request in our own datacenters. Choose this
    /// option for maximal data privacy.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub hosting: Option<Hosting>,

    /// This field is used to send prompts to the model. A prompt can either be a text prompt or a multimodal prompt. A text prompt is a string of text. A multimodal prompt is an array of prompt items. It can be a combination of text, images, and token ID arrays.
    pub prompts: Vec<Prompt>,

    /// Type of embedding representation to embed the prompts with.
    pub representation: EmbeddingRepresentation,

    /// The default behavior is to return the full embedding with 5120 dimensions. With this parameter you can compress the returned embedding to 128 dimensions.
    /// The compression is expected to result in a small drop in accuracy performance (4-6%), with the benefit of being much smaller, which makes comparing these embeddings much faster for use cases where speed is critical.
    /// The compressed embeddings can also perform better if you are embedding really short texts or documents.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub compress_to_size: Option<i32>,

    /// Return normalized embeddings. This can be used to save on additional compute when applying a cosine similarity metric.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub normalize: Option<bool>,

    /// If set to `null`, attention control parameters only apply to those tokens that have explicitly been set in the request.
    /// If set to a non-null value, we apply the control parameters to similar tokens as well.
    /// Controls that have been applied to one token will then be applied to all other tokens
    /// that have at least the similarity score defined by this parameter.
    /// The similarity score is the cosine similarity of token embeddings.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub contextual_control_threshold: Option<f64>,

    /// `true`: apply controls on prompt items by adding the `log(control_factor)` to attention scores.
    /// `false`: apply controls on prompt items by `(attention_scores - -attention_scores.min(-1)) * control_factor`
    #[serde(skip_serializing_if = "Option::is_none")]
    pub control_log_additive: Option<bool>,
}

impl_builder_methods!(
    BatchSemanticEmbeddingRequest,
    hosting: Hosting,
    compress_to_size: i32,
    normalize: bool,
    contextual_control_threshold: f64,
    control_log_additive: bool
);
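
// A sketch of a batch request: one `Prompt` per document, all embedded with the
// same representation. The model name and prompts are illustrative; the response
// below this block carries one embedding per prompt.
#[cfg(test)]
mod batch_semantic_embedding_request_example {
    use super::*;

    #[test]
    fn builds_a_batch_request() {
        let request = BatchSemanticEmbeddingRequest {
            model: "luminous-base".into(),
            prompts: vec![
                Prompt::from_text("First document."),
                Prompt::from_text("Second document."),
            ],
            representation: EmbeddingRepresentation::Document,
            ..Default::default()
        };
        assert_eq!(request.prompts.len(), 2);
    }
}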

#[derive(Deserialize, Debug)]
pub struct BatchSemanticEmbeddingResponse {
    /// model name and version (if any) of the used model for inference
    pub model_version: String,

    /// Vector of embeddings (one for each prompt)
    pub embeddings: Vec<Embedding>,
}