alith_interface/requests/res_components.rs

use super::completion::request::CompletionRequest;
use crate::llms::api::{
    anthropic::completion::AnthropicCompletionResponse,
    openai::completion::OpenAICompletionResponse,
};

/// The log probability of the completion.
#[derive(Debug)]
pub struct InferenceProbabilities {
    /// The token selected by the model.
    pub content: Option<String>,
    /// An array of length `n_probs`.
    pub top_probs: Vec<TopProbabilities>,
}

#[derive(Debug)]
pub struct TopProbabilities {
    /// The token.
    pub token: String,
    /// The log probability of this token.
    pub prob: f32,
}
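
// Illustrative sketch (not in the original file): since `prob` holds a log
// probability, the linear probability can be recovered with `exp`.
impl TopProbabilities {
    /// Converts the stored log probability to a linear probability in [0, 1].
    pub fn linear_prob(&self) -> f32 {
        self.prob.exp()
    }
}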

/// The settings used to generate the completion.
pub struct GenerationSettings {
    /// The model used.
    pub model: String,
    // pub prompt: String, // Need to think about how to handle tokens vs. text
    pub frequency_penalty: Option<f32>,
    pub presence_penalty: f32,
    pub temperature: f32,
    pub top_p: Option<f32>,
    /// The number of choices to generate.
    pub n_choices: u8,
    /// The number of tokens to predict; same as max_tokens.
    pub n_predict: Option<i32>,
    /// The maximum context size of the model or server setting.
    pub n_ctx: u64,
    pub logit_bias: Option<Vec<Vec<serde_json::Value>>>,
    pub grammar: Option<String>,
    /// The stop sequences that end generation.
    pub stop_sequences: Vec<String>,
}

impl GenerationSettings {
    pub fn new_from_openai(req: &CompletionRequest, res: &OpenAICompletionResponse) -> Self {
        Self {
            model: res.model.to_owned(),
            frequency_penalty: req.config.frequency_penalty,
            presence_penalty: req.config.presence_penalty,
            temperature: req.config.temperature,
            top_p: req.config.top_p,
            n_choices: 1,
            n_predict: req.config.actual_request_tokens.map(|x| x as i32),
            n_ctx: req.config.inference_ctx_size,
            logit_bias: None,
            grammar: None,
            stop_sequences: req
                .stop_sequences
                .sequences
                .iter()
                .map(|x| x.as_str().to_owned())
                .collect(),
        }
    }

    pub fn new_from_anthropic(req: &CompletionRequest, res: &AnthropicCompletionResponse) -> Self {
        Self {
            model: res.model.to_string(),
            frequency_penalty: req.config.frequency_penalty,
            presence_penalty: req.config.presence_penalty,
            temperature: req.config.temperature,
            top_p: req.config.top_p,
            n_choices: 1,
            n_predict: req.config.actual_request_tokens.map(|x| x as i32),
            n_ctx: req.config.inference_ctx_size,
            logit_bias: None,
            grammar: None,
            stop_sequences: req
                .stop_sequences
                .sequences
                .iter()
                .map(|x| x.as_str().to_owned())
                .collect(),
        }
    }
}
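
// Usage sketch (hypothetical values; `req` and `res` would come from this
// crate's request/response plumbing, which is assumed here):
//
//     let settings = GenerationSettings::new_from_openai(&req, &res);
//     println!("{settings}");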

impl std::fmt::Display for GenerationSettings {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        writeln!(f)?;
        writeln!(f, "    model: {:?}", self.model)?;
        writeln!(f, "    frequency_penalty: {:?}", self.frequency_penalty)?;
        writeln!(f, "    presence_penalty: {:?}", self.presence_penalty)?;
        writeln!(f, "    temperature: {:?}", self.temperature)?;
        writeln!(f, "    top_p: {:?}", self.top_p)?;
        writeln!(f, "    n_choices: {:?}", self.n_choices)?;
        writeln!(f, "    n_predict: {:?}", self.n_predict)?;
        writeln!(f, "    n_ctx: {:?}", self.n_ctx)?;
        writeln!(f, "    logit_bias: {:?}", self.logit_bias)?;
        writeln!(f, "    grammar: {:?}", self.grammar)?;
        writeln!(f, "    stop_sequences: {:?}", self.stop_sequences)
    }
}

/// Timing statistics for the completion request.
pub struct TimingUsage {
    /// Timestamp of when the request was created.
    pub start_time: std::time::Instant,
    /// Timestamp of when the request was completed.
    pub end_time: std::time::Instant,
    /// Total time taken to process the request.
    pub total_time: std::time::Duration,
    /// Time taken to process the prompt.
    pub prompt_processing_t: Option<std::time::Duration>,
    /// Time taken to generate the completion.
    pub generation_t: Option<std::time::Duration>,
    /// Number of prompt tokens processed per millisecond.
    pub prompt_tok_per_ms: Option<f32>,
    /// Number of prompt tokens processed per second.
    pub prompt_tok_per_sec: Option<f32>,
    /// Number of tokens generated per millisecond.
    pub generation_tok_per_ms: Option<f32>,
    /// Number of tokens generated per second.
    pub generation_tok_per_sec: Option<f32>,
}

impl TimingUsage {
    pub fn new_from_generic(start_time: std::time::Instant) -> Self {
        // Capture the end time once so `end_time` and `total_time` agree.
        let end_time = std::time::Instant::now();
        Self {
            start_time,
            end_time,
            total_time: end_time - start_time,
            prompt_processing_t: None,
            generation_t: None,
            prompt_tok_per_ms: None,
            prompt_tok_per_sec: None,
            generation_tok_per_ms: None,
            generation_tok_per_sec: None,
        }
    }
}
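
// Illustrative sketch (assumption, not original API): once a generation
// duration and a completion token count are known, the throughput fields
// follow from simple division. `completion_tokens` is an assumed input,
// e.g. taken from `TokenUsage`.
impl TimingUsage {
    pub fn set_generation_rates(&mut self, completion_tokens: u32) {
        if let Some(t) = self.generation_t {
            let secs = t.as_secs_f32();
            if secs > 0.0 {
                self.generation_tok_per_sec = Some(completion_tokens as f32 / secs);
                self.generation_tok_per_ms = Some(completion_tokens as f32 / (secs * 1000.0));
            }
        }
    }
}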

impl std::fmt::Display for TimingUsage {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        writeln!(f)?;
        writeln!(f, "    total_time: {:?}", self.total_time)?;
        writeln!(f, "    prompt_processing_t: {:?}", self.prompt_processing_t)?;
        writeln!(f, "    generation_t: {:?}", self.generation_t)?;
        writeln!(f, "    prompt_tok_per_ms: {:?}", self.prompt_tok_per_ms)?;
        writeln!(f, "    prompt_tok_per_sec: {:?}", self.prompt_tok_per_sec)?;
        writeln!(
            f,
            "    generation_tok_per_ms: {:?}",
            self.generation_tok_per_ms
        )?;
        writeln!(
            f,
            "    generation_tok_per_sec: {:?}",
            self.generation_tok_per_sec
        )
    }
}

/// Token statistics for the completion request.
pub struct TokenUsage {
    /// Number of prompt tokens that could be reused from a previous completion (n_past).
    pub tokens_cached: Option<u32>,
    /// Number of tokens evaluated in total from the prompt; same as tokens_evaluated.
    pub prompt_tokens: u32,
    /// Number of tokens in the generated completion; same as predicted_n.
    pub completion_tokens: u32,
    /// Total number of tokens used in the request (prompt + completion).
    pub total_tokens: u32,
    /// Dollar cost of the request.
    pub dollar_cost: Option<f32>,
    /// Cents cost of the request.
    pub cents_cost: Option<f32>,
}

impl TokenUsage {
    pub fn new_from_generic(res: &OpenAICompletionResponse) -> Self {
        // Fall back to zeroed counts when the response carries no usage data.
        let (prompt_tokens, completion_tokens, total_tokens) = match &res.usage {
            Some(usage) => (usage.prompt_tokens, usage.completion_tokens, usage.total_tokens),
            None => (0, 0, 0),
        };
        Self {
            tokens_cached: None,
            prompt_tokens,
            completion_tokens,
            total_tokens,
            dollar_cost: None,
            cents_cost: None,
        }
    }

    pub fn new_from_anthropic(res: &AnthropicCompletionResponse) -> Self {
        Self {
            tokens_cached: None,
            prompt_tokens: res.usage.input_tokens,
            completion_tokens: res.usage.output_tokens,
            total_tokens: res.usage.input_tokens + res.usage.output_tokens,
            dollar_cost: None,
            cents_cost: None,
        }
    }
}
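
// Illustrative sketch (assumption, not original API): the cost fields could
// be derived from per-million-token prices. The price parameters here are
// hypothetical inputs, not values defined elsewhere in this crate.
impl TokenUsage {
    pub fn set_cost(&mut self, prompt_price_per_m: f32, completion_price_per_m: f32) {
        let dollars = (self.prompt_tokens as f32 / 1_000_000.0) * prompt_price_per_m
            + (self.completion_tokens as f32 / 1_000_000.0) * completion_price_per_m;
        self.dollar_cost = Some(dollars);
        self.cents_cost = Some(dollars * 100.0);
    }
}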

impl std::fmt::Display for TokenUsage {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        writeln!(f)?;
        writeln!(f, "    tokens_cached: {:?}", self.tokens_cached)?;
        writeln!(f, "    prompt_tokens: {:?}", self.prompt_tokens)?;
        writeln!(f, "    completion_tokens: {:?}", self.completion_tokens)?;
        writeln!(f, "    total_tokens: {:?}", self.total_tokens)?;
        writeln!(f, "    dollar_cost: {:?}", self.dollar_cost)?;
        writeln!(f, "    cents_cost: {:?}", self.cents_cost)
    }
}