1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
use super::completion::{Hosting, Prompt};
use crate::impl_builder_methods;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
/// Request body for the (raw) embeddings endpoint: embeds a prompt with a
/// given model and returns embeddings from the requested layers/poolings.
#[derive(Serialize, Debug, Default)]
pub struct EmbeddingRequest {
    /// Name of model to use. A model name refers to a model architecture (number of parameters among others). Always the latest version of model is used. The model output contains information as to the model version.
    pub model: String,
    /// Possible values: [aleph-alpha, None]
    /// Optional parameter that specifies which datacenters may process the request. You can either set the
    /// parameter to "aleph-alpha" or omit it (defaulting to null).
    /// Not setting this value, or setting it to None, gives us maximal flexibility in processing your
    /// request in our own datacenters and on servers hosted with other providers. Choose this option for
    /// maximum availability.
    /// Setting it to "aleph-alpha" allows us to only process the request in our own datacenters. Choose this
    /// option for maximal data privacy.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub hosting: Option<Hosting>,
    /// This field is used to send prompts to the model. A prompt can either be a text prompt or a multimodal prompt. A text prompt is a string of text. A multimodal prompt is an array of prompt items. It can be a combination of text, images, and token ID arrays.
    /// In the case of a multimodal prompt, the prompt items will be concatenated and a single prompt will be used for the model.
    /// Tokenization:
    /// Token ID arrays are used as-is.
    /// Text prompt items are tokenized using the tokenizers specific to the model.
    /// Each image is converted into 144 tokens.
    pub prompt: Prompt,
    /// A list of layer indices from which to return embeddings.
    /// - Index 0 corresponds to the word embeddings used as input to the first transformer layer
    /// - Index 1 corresponds to the hidden state as output by the first transformer layer, index 2 to the output of the second layer etc.
    /// - Index -1 corresponds to the last transformer layer (not the language modelling head), index -2 to the second last
    pub layers: Vec<i32>,
    /// Flag indicating whether the tokenized prompt is to be returned (True) or not (False)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tokens: Option<bool>,
    /// Pooling operation to use. Pooling operations include:
    /// - "mean": Aggregate token embeddings across the sequence dimension using an average.
    /// - "weighted_mean": Position weighted mean across sequence dimension with latter tokens having a higher weight.
    /// - "max": Aggregate token embeddings across the sequence dimension using a maximum.
    /// - "last_token": Use the last token.
    /// - "abs_max": Aggregate token embeddings across the sequence dimension using a maximum of absolute values.
    pub pooling: Vec<String>,
    /// Explicitly set embedding type to be passed to the model. This parameter was created to allow for semantic_embed embeddings and will be deprecated. Please use the semantic_embed-endpoint instead.
    // `type` is a reserved word in Rust, so the field is renamed for serialization.
    #[serde(rename = "type", skip_serializing_if = "Option::is_none")]
    pub embedding_type: Option<String>,
    /// Return normalized embeddings. This can be used to save on additional compute when applying a cosine similarity metric.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub normalize: Option<bool>,
    /// If set to `None`, attention control parameters only apply to those tokens that have explicitly been set
    /// in the request. If set to a non-null value, we apply the control parameters to similar tokens as
    /// well. Controls that have been applied to one token will then be applied to all other tokens that have
    /// at least the similarity score defined by this parameter. The similarity score is the cosine
    /// similarity of token embeddings.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub contextual_control_threshold: Option<f64>,
    /// `true`: apply controls on prompt items by adding the `log(control_factor)` to attention scores.
    /// `false`: apply controls on prompt items by `(attention_scores - -attention_scores.min(-1)) * control_factor`
    #[serde(skip_serializing_if = "Option::is_none")]
    pub control_log_additive: Option<bool>,
}
impl EmbeddingRequest {
    /// Convenience constructor for a plain-text embedding request.
    ///
    /// Builds a request for `model` embedding the text `prompt`, returning the
    /// embedding of a single `layer` with a single `pooling` operation and the
    /// given `normalize` flag. All remaining fields keep their `Default`
    /// values (i.e. are omitted from the serialized request where optional).
    pub fn from_text(
        model: impl Into<String>,
        prompt: impl Into<String>,
        layer: i32,
        pooling: impl Into<String>,
        normalize: bool,
    ) -> Self {
        Self {
            model: model.into(),
            prompt: Prompt::from_text(prompt),
            // `layer` is already an i32 — the previous `layer.into()` was a
            // no-op conversion (clippy: useless_conversion).
            layers: vec![layer],
            pooling: vec![pooling.into()],
            normalize: Some(normalize),
            ..Self::default()
        }
    }
}
// Builder-style setters for the optional fields of `EmbeddingRequest`.
// `hosting: Hosting` is included for consistency with
// `SemanticEmbeddingRequest` and `BatchSemanticEmbeddingRequest`, which
// expose a builder method for the same optional field; previously there was
// no way to set `hosting` on an `EmbeddingRequest` via the builder API.
impl_builder_methods!(
    EmbeddingRequest,
    hosting: Hosting,
    tokens: bool,
    embedding_type: String,
    normalize: bool,
    contextual_control_threshold: f64,
    control_log_additive: bool
);
/// A single embedding vector as returned by the API.
type Embedding = Vec<f32>;
/// Maps a pooling-operation name (e.g. "mean", "max") to its pooled embedding.
type PoolingEmbeddings = HashMap<String, Embedding>;
/// Maps a layer name to the pooling results for that layer.
// NOTE(review): "Embedings" is a typo for "Embeddings"; the alias is private,
// so renaming it (and its single use in `EmbeddingResponse`) is a safe follow-up.
type LayerEmbedings = HashMap<String, PoolingEmbeddings>;
/// Response body for the (raw) embeddings endpoint.
#[derive(Deserialize, Debug)]
pub struct EmbeddingResponse {
    /// model name and version (if any) of the used model for inference
    pub model_version: String,
    /// embeddings:
    /// - pooling: a dict with layer names as keys and pooling output as values. A pooling output is a dict with pooling operation as key and a pooled embedding (list of floats) as values
    pub embeddings: LayerEmbedings,
    /// Tokenized prompt; presumably present only when `tokens: true` was
    /// requested — TODO confirm against the API.
    pub tokens: Option<Vec<String>>,
}
/// Type of embedding representation to embed the prompt with.
///
/// `"symmetric"`: Symmetric embeddings assume that the text to be compared is interchangeable. Usage examples for symmetric embeddings are clustering, classification, anomaly detection or visualisation tasks. "symmetric" embeddings should be compared with other "symmetric" embeddings.
///
/// `"document"` and `"query"`: Asymmetric embeddings assume that there is a difference between queries and documents. They are used together in use cases such as search where you want to compare shorter queries against larger documents.
///
/// `"query"`-embeddings are optimized for shorter texts, such as questions or keywords.
///
/// `"document"`-embeddings are optimized for larger pieces of text to compare queries against.
#[derive(Serialize, Debug)]
#[serde(rename_all = "snake_case")]
pub enum EmbeddingRepresentation {
    /// For interchangeable texts (clustering, classification, anomaly detection, visualisation).
    Symmetric,
    /// For larger pieces of text that queries are compared against.
    Document,
    /// For shorter texts such as questions or keywords.
    Query,
}
impl Default for EmbeddingRepresentation {
fn default() -> Self {
EmbeddingRepresentation::Symmetric
}
}
/// Embeds a prompt using a specific model and semantic embedding method. Resulting vectors that can be used for downstream tasks (e.g. semantic similarity) and models (e.g. classifiers).
#[derive(Serialize, Debug, Default)]
pub struct SemanticEmbeddingRequest {
    /// Name of the model to use. A model name refers to a model's architecture (number of parameters among others). The most recent version of the model is always used. The model output contains information as to the model version. To create semantic embeddings, please use `luminous-base`.
    pub model: String,
    /// Possible values: [aleph-alpha, None]
    /// Optional parameter that specifies which datacenters may process the request. You can either set the
    /// parameter to "aleph-alpha" or omit it (defaulting to null).
    /// Not setting this value, or setting it to None, gives us maximal flexibility in processing your
    /// request in our own datacenters and on servers hosted with other providers. Choose this option for
    /// maximum availability.
    /// Setting it to "aleph-alpha" allows us to only process the request in our own datacenters. Choose this
    /// option for maximal data privacy.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub hosting: Option<Hosting>,
    /// This field is used to send prompts to the model. A prompt can either be a text prompt or a multimodal prompt. A text prompt is a string of text. A multimodal prompt is an array of prompt items. It can be a combination of text, images, and token ID arrays.
    /// In the case of a multimodal prompt, the prompt items will be concatenated and a single prompt will be used for the model.
    /// Tokenization:
    /// Token ID arrays are used as-is.
    /// Text prompt items are tokenized using the tokenizers specific to the model.
    /// Each image is converted into 144 tokens.
    pub prompt: Prompt,
    /// Type of embedding representation to embed the prompt with.
    pub representation: EmbeddingRepresentation,
    /// The default behavior is to return the full embedding with 5120 dimensions. With this parameter you can compress the returned embedding to 128 dimensions.
    /// The compression is expected to result in a small drop in accuracy performance (4-6%), with the benefit of being much smaller, which makes comparing these embeddings much faster for use cases where speed is critical.
    /// The compressed embedding can also perform better if you are embedding really short texts or documents.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub compress_to_size: Option<i32>,
    /// Return normalized embeddings. This can be used to save on additional compute when applying a cosine similarity metric.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub normalize: Option<bool>,
    /// If set to `null`, attention control parameters only apply to those tokens that have explicitly been set in the request.
    /// If set to a non-null value, we apply the control parameters to similar tokens as well.
    /// Controls that have been applied to one token will then be applied to all other tokens
    /// that have at least the similarity score defined by this parameter.
    /// The similarity score is the cosine similarity of token embeddings.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub contextual_control_threshold: Option<f64>,
    /// `true`: apply controls on prompt items by adding the `log(control_factor)` to attention scores.
    /// `false`: apply controls on prompt items by `(attention_scores - -attention_scores.min(-1)) * control_factor`
    #[serde(skip_serializing_if = "Option::is_none")]
    pub control_log_additive: Option<bool>,
}
// Builder-style setters for the optional fields of `SemanticEmbeddingRequest`.
impl_builder_methods!(
    SemanticEmbeddingRequest,
    hosting: Hosting,
    compress_to_size: i32,
    normalize: bool,
    contextual_control_threshold: f64,
    control_log_additive: bool
);
/// Response body for the semantic_embed endpoint.
#[derive(Deserialize, Debug)]
pub struct SemanticEmbeddingResponse {
    /// model name and version (if any) of the used model for inference
    pub model_version: String,
    /// A list of floats that can be used to compare against other embeddings.
    pub embedding: Embedding,
}
/// Batch variant of [`SemanticEmbeddingRequest`]: embeds several prompts in a
/// single call using the same model, representation and control parameters.
#[derive(Serialize, Debug, Default)]
pub struct BatchSemanticEmbeddingRequest {
    /// Name of the model to use. A model name refers to a model's architecture (number of parameters among others). The most recent version of the model is always used. The model output contains information as to the model version. To create semantic embeddings, please use `luminous-base`.
    pub model: String,
    /// Possible values: [aleph-alpha, None]
    /// Optional parameter that specifies which datacenters may process the request. You can either set the
    /// parameter to "aleph-alpha" or omit it (defaulting to null).
    /// Not setting this value, or setting it to None, gives us maximal flexibility in processing your
    /// request in our own datacenters and on servers hosted with other providers. Choose this option for
    /// maximum availability.
    /// Setting it to "aleph-alpha" allows us to only process the request in our own datacenters. Choose this
    /// option for maximal data privacy.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub hosting: Option<Hosting>,
    /// This field is used to send prompts to the model. A prompt can either be a text prompt or a multimodal prompt. A text prompt is a string of text. A multimodal prompt is an array of prompt items. It can be a combination of text, images, and token ID arrays.
    pub prompts: Vec<Prompt>,
    /// Type of embedding representation to embed the prompt with.
    pub representation: EmbeddingRepresentation,
    /// The default behavior is to return the full embedding with 5120 dimensions. With this parameter you can compress the returned embedding to 128 dimensions.
    /// The compression is expected to result in a small drop in accuracy performance (4-6%), with the benefit of being much smaller, which makes comparing these embeddings much faster for use cases where speed is critical.
    /// The compressed embedding can also perform better if you are embedding really short texts or documents.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub compress_to_size: Option<i32>,
    /// Return normalized embeddings. This can be used to save on additional compute when applying a cosine similarity metric.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub normalize: Option<bool>,
    /// If set to `null`, attention control parameters only apply to those tokens that have explicitly been set in the request.
    /// If set to a non-null value, we apply the control parameters to similar tokens as well.
    /// Controls that have been applied to one token will then be applied to all other tokens
    /// that have at least the similarity score defined by this parameter.
    /// The similarity score is the cosine similarity of token embeddings.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub contextual_control_threshold: Option<f64>,
    /// `true`: apply controls on prompt items by adding the `log(control_factor)` to attention scores.
    /// `false`: apply controls on prompt items by `(attention_scores - -attention_scores.min(-1)) * control_factor`
    #[serde(skip_serializing_if = "Option::is_none")]
    pub control_log_additive: Option<bool>,
}
// Builder-style setters for the optional fields of `BatchSemanticEmbeddingRequest`.
impl_builder_methods!(
    BatchSemanticEmbeddingRequest,
    hosting: Hosting,
    compress_to_size: i32,
    normalize: bool,
    contextual_control_threshold: f64,
    control_log_additive: bool
);
/// Response body for the batch semantic_embed endpoint.
#[derive(Deserialize, Debug)]
pub struct BatchSemanticEmbeddingResponse {
    /// model name and version (if any) of the used model for inference
    pub model_version: String,
    /// Vector of embeddings (one for each prompt)
    pub embeddings: Vec<Embedding>,
}