/// Per-token latency budget.
///
/// The constructors in this file keep `us_per_token` and `tokens_per_sec`
/// reciprocal (`tokens_per_sec == 1_000_000.0 / us_per_token`). All fields are
/// public, so that relationship is not enforced once a value has been built.
#[derive(Debug, Clone, Copy)]
pub struct TokenBudget {
    /// Allowed latency per token, in microseconds.
    pub us_per_token: f64,
    /// The same budget expressed as throughput, in tokens per second.
    pub tokens_per_sec: f64,
    /// Batch size attached to the budget. The constructors in this file set
    /// it to 1 and `with_batch_size` never stores a value below 1.
    pub batch_size: usize,
}
14
15#[derive(Debug, Clone, Copy, serde::Serialize, serde::Deserialize)]
20pub struct ByteBudget {
21 pub us_per_page: f64,
23 pub gb_per_sec: f64,
25 pub page_size: usize,
27}
28
29impl Default for ByteBudget {
30 fn default() -> Self {
31 Self::from_throughput(25.0)
33 }
34}
35
36impl ByteBudget {
37 pub fn from_throughput(gb_per_sec: f64) -> Self {
40 debug_assert!(
41 gb_per_sec > 0.0 && gb_per_sec.is_finite(),
42 "CB-BUDGET: throughput must be positive and finite, got {}",
43 gb_per_sec
44 );
45 let bytes_per_sec = gb_per_sec * 1e9;
46 let pages_per_sec = bytes_per_sec / 4096.0;
47 Self { us_per_page: 1_000_000.0 / pages_per_sec, gb_per_sec, page_size: 4096 }
48 }
49
50 pub fn from_latency(us_per_page: f64) -> Self {
52 let pages_per_sec = 1_000_000.0 / us_per_page;
53 let bytes_per_sec = pages_per_sec * 4096.0;
54 Self { us_per_page, gb_per_sec: bytes_per_sec / 1e9, page_size: 4096 }
55 }
56
57 #[must_use]
59 pub fn with_page_size(mut self, page_size: usize) -> Self {
60 let bytes_per_sec = self.gb_per_sec * 1e9;
62 let pages_per_sec = bytes_per_sec / page_size as f64;
63 self.us_per_page = 1_000_000.0 / pages_per_sec;
64 self.page_size = page_size;
65 self
66 }
67
68 pub fn to_token_budget(&self) -> TokenBudget {
71 TokenBudget {
72 us_per_token: self.us_per_page,
73 tokens_per_sec: 1_000_000.0 / self.us_per_page,
74 batch_size: 1,
75 }
76 }
77
78 pub fn is_met(&self, actual_us_per_page: f64) -> bool {
80 actual_us_per_page <= self.us_per_page
81 }
82
83 pub fn utilization(&self, actual_us_per_page: f64) -> f64 {
85 actual_us_per_page / self.us_per_page
86 }
87
88 pub fn throughput_from_latency(us_per_page: f64, page_size: usize) -> f64 {
90 let pages_per_sec = 1_000_000.0 / us_per_page;
91 pages_per_sec * page_size as f64 / 1e9
92 }
93}
94
95impl Default for TokenBudget {
96 fn default() -> Self {
97 Self::from_latency(50.0)
99 }
100}
101
102impl TokenBudget {
103 pub fn from_latency(us_per_token: f64) -> Self {
106 Self { us_per_token, tokens_per_sec: 1_000_000.0 / us_per_token, batch_size: 1 }
107 }
108
109 pub fn from_throughput(tokens_per_sec: f64) -> Self {
112 Self { us_per_token: 1_000_000.0 / tokens_per_sec, tokens_per_sec, batch_size: 1 }
113 }
114
115 #[must_use]
117 pub fn with_batch_size(mut self, batch_size: usize) -> Self {
118 self.batch_size = batch_size.max(1);
119 self
120 }
121
122 pub fn is_met(&self, actual_us_per_token: f64) -> bool {
124 actual_us_per_token <= self.us_per_token
125 }
126
127 pub fn utilization(&self, actual_us_per_token: f64) -> f64 {
129 actual_us_per_token / self.us_per_token
130 }
131}
132
/// A value of type `T` bundled with per-token timing and budget metrics.
///
/// Nothing in this part of the file fills in the metric fields, so the notes
/// on them are inferred from their names and from how `TokenBudget` uses the
/// same terms. NOTE(review): confirm against the code that builds a
/// `TokenResult`.
#[derive(Debug, Clone)]
pub struct TokenResult<T> {
    /// The wrapped value.
    pub output: T,
    /// Number of tokens processed.
    pub tokens_processed: usize,
    /// Latency per token; presumably microseconds, as in `TokenBudget`.
    pub us_per_token: f64,
    /// Throughput in tokens per second.
    pub tokens_per_sec: f64,
    /// Whether the budget was met; presumably the result of
    /// `TokenBudget::is_met`.
    pub budget_met: bool,
    /// Budget utilization; presumably the result of
    /// `TokenBudget::utilization`.
    pub budget_utilization: f64,
}
149
150impl<T> TokenResult<T> {
151 pub fn map<U, F: FnOnce(T) -> U>(self, f: F) -> TokenResult<U> {
153 TokenResult {
154 output: f(self.output),
155 tokens_processed: self.tokens_processed,
156 us_per_token: self.us_per_token,
157 tokens_per_sec: self.tokens_per_sec,
158 budget_met: self.budget_met,
159 budget_utilization: self.budget_utilization,
160 }
161 }
162}