oxibonsai_runtime/
token_budget.rs1use std::sync::atomic::{AtomicU64, Ordering};
4use std::sync::Arc;
5
6use thiserror::Error;
7
8#[derive(Debug, Error)]
12pub enum BudgetError {
13 #[error("prompt tokens {prompt} exceeds max_prompt_tokens {max}")]
14 PromptTooLong { prompt: usize, max: usize },
15 #[error("completion token budget exhausted (limit = {limit})")]
16 CompletionBudgetExhausted { limit: usize },
17 #[error("total token budget exhausted (limit = {limit}, used = {used})")]
18 TotalBudgetExhausted { limit: usize, used: usize },
19}
20
/// What should happen once a request runs out of token budget.
///
/// This file only stores the policy in [`BudgetConfig`] and hands it back via
/// `RequestBudget::policy`; acting on it is left to the caller.
///
/// The enum is a plain tag, so it is `Copy`, fully comparable (`Eq`) and
/// usable as a map/set key (`Hash`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum BudgetPolicy {
    /// Presumably: stop producing tokens when the budget is hit — confirm
    /// against the generation loop. This is the policy `BudgetConfig::new` picks.
    StopGeneration,
    /// Presumably: shrink the context to make room instead of stopping —
    /// confirm against the caller.
    TruncateContext,
    /// Presumably: surface the budget violation to the client as an error —
    /// confirm against the caller.
    ReturnError,
}
33
34#[derive(Debug, Clone)]
38pub struct BudgetConfig {
39 pub max_prompt_tokens: Option<usize>,
41 pub max_completion_tokens: Option<usize>,
43 pub max_total_tokens: Option<usize>,
45 pub policy: BudgetPolicy,
47}
48
49impl BudgetConfig {
50 pub fn new() -> Self {
52 Self {
53 max_prompt_tokens: None,
54 max_completion_tokens: None,
55 max_total_tokens: None,
56 policy: BudgetPolicy::StopGeneration,
57 }
58 }
59
60 pub fn with_max_completion(mut self, n: usize) -> Self {
62 self.max_completion_tokens = Some(n);
63 self
64 }
65
66 pub fn with_max_total(mut self, n: usize) -> Self {
68 self.max_total_tokens = Some(n);
69 self
70 }
71
72 pub fn with_policy(mut self, policy: BudgetPolicy) -> Self {
74 self.policy = policy;
75 self
76 }
77
78 pub fn unlimited() -> Self {
80 Self::new()
81 }
82}
83
84impl Default for BudgetConfig {
85 fn default() -> Self {
86 Self::new()
87 }
88}
89
90#[derive(Debug)]
94pub struct RequestBudget {
95 config: BudgetConfig,
96 prompt_tokens: usize,
97 completion_tokens: usize,
98}
99
100impl RequestBudget {
101 pub fn new(config: BudgetConfig, prompt_tokens: usize) -> Result<Self, BudgetError> {
106 if let Some(max) = config.max_prompt_tokens {
107 if prompt_tokens > max {
108 return Err(BudgetError::PromptTooLong {
109 prompt: prompt_tokens,
110 max,
111 });
112 }
113 }
114 Ok(Self {
115 config,
116 prompt_tokens,
117 completion_tokens: 0,
118 })
119 }
120
121 pub fn record_token(&mut self) -> Result<(), BudgetError> {
126 self.record_tokens(1)
127 }
128
129 pub fn record_tokens(&mut self, n: usize) -> Result<(), BudgetError> {
133 self.completion_tokens = self.completion_tokens.saturating_add(n);
134
135 if let Some(limit) = self.config.max_completion_tokens {
137 if self.completion_tokens > limit {
138 return Err(BudgetError::CompletionBudgetExhausted { limit });
139 }
140 }
141
142 if let Some(limit) = self.config.max_total_tokens {
144 let used = self.total_tokens();
145 if used > limit {
146 return Err(BudgetError::TotalBudgetExhausted { limit, used });
147 }
148 }
149
150 Ok(())
151 }
152
153 pub fn prompt_tokens(&self) -> usize {
155 self.prompt_tokens
156 }
157
158 pub fn completion_tokens(&self) -> usize {
160 self.completion_tokens
161 }
162
163 pub fn total_tokens(&self) -> usize {
165 self.prompt_tokens.saturating_add(self.completion_tokens)
166 }
167
168 pub fn remaining_completion_tokens(&self) -> Option<usize> {
170 self.config
171 .max_completion_tokens
172 .map(|limit| limit.saturating_sub(self.completion_tokens))
173 }
174
175 pub fn is_exhausted(&self) -> bool {
177 if let Some(limit) = self.config.max_completion_tokens {
178 if self.completion_tokens >= limit {
179 return true;
180 }
181 }
182 if let Some(limit) = self.config.max_total_tokens {
183 if self.total_tokens() >= limit {
184 return true;
185 }
186 }
187 false
188 }
189
190 pub fn policy(&self) -> BudgetPolicy {
192 self.config.policy
193 }
194}
195
/// Token counter with an optional cap that can be updated through `&self`.
///
/// The running total lives in an `Arc<AtomicU64>`, so cloning the budget
/// yields another handle onto the *same* counter rather than an independent
/// copy.
#[derive(Debug, Clone)]
pub struct GlobalTokenBudget {
    /// Tokens recorded so far, shared between all clones of this budget.
    total_tokens_used: Arc<AtomicU64>,
    /// Cap on the total, or `None` for no cap.
    max_tokens: Option<u64>,
}

impl GlobalTokenBudget {
    /// Creates a budget starting at zero with the given cap (`None` = no cap).
    pub fn new(max_tokens: Option<u64>) -> Self {
        Self {
            total_tokens_used: Arc::new(AtomicU64::new(0)),
            max_tokens,
        }
    }

    /// Creates a budget without a cap.
    pub fn unlimited() -> Self {
        Self::new(None)
    }

    /// Adds `tokens` to the running total, saturating at `u64::MAX`.
    ///
    /// A plain `fetch_add` wraps on overflow, which would reset the total and
    /// make an exhausted budget look fresh again; the saturating update keeps
    /// the total monotonic.
    pub fn record(&self, tokens: u64) {
        // Relaxed suffices: the counter is a standalone statistic and is not
        // used to publish other memory. The closure always returns `Some`,
        // so `fetch_update` cannot fail and its result carries no information.
        let _ = self.total_tokens_used.fetch_update(
            Ordering::Relaxed,
            Ordering::Relaxed,
            |used| Some(used.saturating_add(tokens)),
        );
    }

    /// Total number of tokens recorded so far.
    pub fn total_used(&self) -> u64 {
        self.total_tokens_used.load(Ordering::Relaxed)
    }

    /// Tokens left before the cap is reached.
    ///
    /// `None` when there is no cap; `Some(0)` once the cap has been reached
    /// or passed.
    pub fn remaining(&self) -> Option<u64> {
        self.max_tokens
            .map(|cap| cap.saturating_sub(self.total_used()))
    }

    /// `true` when a cap is set and the total has reached or passed it.
    /// An uncapped budget is never exhausted.
    pub fn is_exhausted(&self) -> bool {
        match self.max_tokens {
            None => false,
            Some(cap) => self.total_used() >= cap,
        }
    }

    /// Fraction of the cap consumed so far, or `None` when there is no cap.
    ///
    /// A cap of zero reports `1.0`. The value is not clamped, so it exceeds
    /// `1.0` once more tokens than the cap have been recorded.
    pub fn utilization(&self) -> Option<f32> {
        self.max_tokens.map(|cap| {
            if cap == 0 {
                1.0
            } else {
                // Divide in f64 and narrow once: converting each operand to
                // f32 first discards precision for counts above 2^24.
                (self.total_used() as f64 / cap as f64) as f32
            }
        })
    }
}
254
/// Token counts for a request together with the cost derived from them.
#[derive(Debug, Clone)]
pub struct TokenCostEstimate {
    /// Number of prompt tokens the estimate was computed for.
    pub prompt_tokens: usize,
    /// Number of completion tokens the estimate was computed for.
    pub completion_tokens: usize,
    /// Cost attributed to the prompt tokens.
    pub prompt_cost: f64,
    /// Cost attributed to the completion tokens.
    pub completion_cost: f64,
    /// `prompt_cost + completion_cost`.
    pub total_cost: f64,
}

impl TokenCostEstimate {
    /// Builds an estimate from token counts and per-1000-token rates.
    ///
    /// Each cost is `tokens / 1000 * rate`; the total is the sum of the two.
    /// The rates are used as given — no validation or rounding is applied.
    pub fn compute(
        prompt_tokens: usize,
        completion_tokens: usize,
        prompt_cost_per_1k: f64,
        completion_cost_per_1k: f64,
    ) -> Self {
        // Price of `count` tokens at `rate_per_1k` per thousand tokens.
        let price = |count: usize, rate_per_1k: f64| count as f64 / 1_000.0 * rate_per_1k;

        let prompt_cost = price(prompt_tokens, prompt_cost_per_1k);
        let completion_cost = price(completion_tokens, completion_cost_per_1k);

        Self {
            prompt_tokens,
            completion_tokens,
            prompt_cost,
            completion_cost,
            total_cost: prompt_cost + completion_cost,
        }
    }

    /// Renders the estimate as a single line: both token counts followed by
    /// the three costs, each prefixed with `$` and shown with six decimals.
    pub fn summary(&self) -> String {
        let Self {
            prompt_tokens,
            completion_tokens,
            prompt_cost,
            completion_cost,
            total_cost,
        } = self;
        format!(
            "tokens: prompt={} completion={} | cost: prompt=${:.6} completion=${:.6} total=${:.6}",
            prompt_tokens, completion_tokens, prompt_cost, completion_cost, total_cost,
        )
    }
}