aleph_alpha_api/evaluate.rs

use super::completion::{Hosting, Prompt};
use crate::impl_builder_methods;
use serde::{Deserialize, Serialize};

#[derive(Serialize, Debug, Default)]
pub struct EvaluationRequest {
    pub model: String,

    /// Base prompt for the evaluation.
    pub prompt: Prompt,

    /// Possible values: [aleph-alpha, None]
    /// Optional parameter that specifies which datacenters may process the request. You can either set the
    /// parameter to "aleph-alpha" or omit it (defaulting to null).
    /// Not setting this value, or setting it to None, gives us maximal flexibility in processing your
    /// request in our own datacenters and on servers hosted with other providers. Choose this option for
    /// maximum availability.
    /// Setting it to "aleph-alpha" allows us to only process the request in our own datacenters. Choose this
    /// option for maximal data privacy.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub hosting: Option<Hosting>,

    /// The completion that you expect the model to produce. Unconditional completion can be used with an
    /// empty string (default). The prompt may contain a zero shot or few shot task.
    pub completion_expected: String,

    /// If set to `None`, attention control parameters only apply to those tokens that have explicitly been set
    /// in the request. If set to a non-null value, we apply the control parameters to similar tokens as
    /// well. Controls that have been applied to one token will then be applied to all other tokens that have
    /// at least the similarity score defined by this parameter. The similarity score is the cosine
    /// similarity of token embeddings.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub contextual_control_threshold: Option<f64>,

    /// Default value: true
    /// true: apply controls on prompt items by adding `log(control_factor)` to attention scores.
    /// false: apply controls on prompt items by `(attention_scores - -attention_scores.min(-1)) * control_factor`
    #[serde(skip_serializing_if = "Option::is_none")]
    pub control_log_additive: Option<bool>,
}
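
// A minimal serialization sketch (not part of the upstream crate), assuming
// `serde_json` is available as a dev-dependency: the `skip_serializing_if`
// attributes above should drop `hosting`, `contextual_control_threshold` and
// `control_log_additive` from the payload whenever they are left unset.
#[cfg(test)]
mod unset_optionals_sketch {
    use super::*;

    #[test]
    fn unset_optional_fields_are_omitted_from_json() {
        let request = EvaluationRequest {
            model: "example-model".into(), // placeholder model name
            ..EvaluationRequest::default()
        };
        let value = serde_json::to_value(&request).expect("request should serialize");
        let object = value.as_object().expect("request serializes to a JSON object");
        assert!(!object.contains_key("hosting"));
        assert!(!object.contains_key("contextual_control_threshold"));
        assert!(!object.contains_key("control_log_additive"));
    }
}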

impl EvaluationRequest {
    pub fn from_text(
        model: impl Into<String>,
        prompt: impl Into<String>,
        completion_expected: impl Into<String>,
    ) -> Self {
        Self {
            model: model.into(),
            prompt: Prompt::from_text(prompt),
            completion_expected: completion_expected.into(),
            ..Self::default()
        }
    }
}

impl_builder_methods!(
    EvaluationRequest,
    hosting: Hosting,
    contextual_control_threshold: f64,
    control_log_additive: bool
);
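
// A usage sketch (not part of the upstream crate): `from_text` covers the common
// case, and `impl_builder_methods!` is assumed to generate chainable setters named
// after the fields listed above (`hosting`, `contextual_control_threshold`,
// `control_log_additive`). The model name, prompt and values are placeholders.
#[cfg(test)]
mod request_builder_sketch {
    use super::*;

    #[test]
    fn from_text_with_optional_controls() {
        let request = EvaluationRequest::from_text(
            "example-model",          // placeholder model name
            "An apple a day",         // prompt text
            " keeps the doctor away", // expected completion
        )
        // Setters below are assumed to be generated by `impl_builder_methods!`.
        .contextual_control_threshold(0.5)
        .control_log_additive(true);

        assert_eq!(request.model, "example-model");
        assert_eq!(request.completion_expected, " keeps the doctor away");
        assert_eq!(request.contextual_control_threshold, Some(0.5));
        assert_eq!(request.control_log_additive, Some(true));
    }
}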

#[derive(Deserialize, Debug)]
pub struct EvaluationResponse {
    /// Name and version (if any) of the model used for inference.
    pub model_version: String,

    /// Result metrics of the evaluation.
    pub result: EvaluationResult,
}

#[derive(Deserialize, Debug)]
pub struct EvaluationResult {
    /// Log probability of producing the expected completion given the prompt. This metric refers to all tokens and is therefore dependent on the tokenizer used. It cannot be directly compared among models with different tokenizers.
    pub log_probability: Option<f64>,

    /// Log perplexity associated with the expected completion given the prompt. This metric refers to all tokens and is therefore dependent on the tokenizer used. It cannot be directly compared among models with different tokenizers.
    pub log_perplexity: Option<f64>,

    /// Log perplexity associated with the expected completion given the prompt, normalized for the number of tokens. This metric computes an average per token and is therefore dependent on the tokenizer used. It cannot be directly compared among models with different tokenizers.
    pub log_perplexity_per_token: Option<f64>,

    /// Log perplexity associated with the expected completion given the prompt, normalized for the number of characters. This metric is independent of any tokenizer. It can be directly compared among models with different tokenizers.
    pub log_perplexity_per_character: Option<f64>,

    /// Flag indicating whether a greedy completion would have produced the expected completion.
    pub correct_greedy: Option<bool>,

    /// Number of tokens in the expected completion.
    pub token_count: Option<i32>,

    /// Number of characters in the expected completion.
    pub character_count: Option<i32>,

    /// Argmax completion given the input consisting of prompt and expected completion. This may be used as an indicator of what the model would have produced. Since only a single forward pass is performed, incoherent text could be produced, especially for long expected completions.
    pub completion: Option<String>,
}
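
// A deserialization sketch (not part of the upstream crate): parsing an
// `EvaluationResponse` from a hand-written JSON payload shaped like the fields
// documented above. The numbers are illustrative only, not real API output, and
// `serde_json` is assumed to be available as a dev-dependency.
#[cfg(test)]
mod response_deserialization_sketch {
    use super::*;

    #[test]
    fn parses_evaluation_response_payload() {
        let payload = r#"{
            "model_version": "example-version",
            "result": {
                "log_probability": -2.5,
                "log_perplexity": 2.5,
                "log_perplexity_per_token": 0.5,
                "log_perplexity_per_character": 0.125,
                "correct_greedy": true,
                "token_count": 5,
                "character_count": 22,
                "completion": " keeps the doctor away"
            }
        }"#;

        let response: EvaluationResponse =
            serde_json::from_str(payload).expect("payload should deserialize");
        assert_eq!(response.model_version, "example-version");
        assert_eq!(response.result.token_count, Some(5));
        assert_eq!(response.result.correct_greedy, Some(true));
    }
}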