//! Performance Optimizations with Caching and Prefetching
//!
//! Implements semantic caching with a sub-500ms response target for cached
//! operations, plus context prefetching for likely next operations.
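//!
//! A minimal usage sketch (marked `ignore` since the exact crate path for
//! `LatencyOptimizer` depends on where this module is mounted; assumes it is
//! called from async code running on a Tokio runtime):
//!
//! ```ignore
//! let optimizer = LatencyOptimizer::new(1_000);
//! // The first call goes through the (simulated) model; an identical prompt
//! // afterwards is served from the semantic cache.
//! let first = optimizer.complete_with_cache("summarize this diff").await?;
//! let second = optimizer.complete_with_cache("summarize this diff").await?;
//! assert_eq!(first, second);
//! ```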
use anyhow::Result;
use dashmap::DashMap;
use serde::{Deserialize, Serialize};
use std::sync::Arc;
use tokio::time::{sleep, Duration};

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CacheEntry {
    pub value: String,
    pub timestamp: std::time::SystemTime,
    pub access_count: u32,
    pub estimated_tokens: usize,
}

pub struct LatencyOptimizer {
    /// Cache of model outputs keyed by semantic hash (approximate deduplication)
    cache: Arc<DashMap<String, CacheEntry>>,
    /// Prefetches context for likely next operations
    prefetcher: Arc<ContextPrefetcher>,
    /// Maximum number of cached entries
    max_cache_size: usize,
}

impl LatencyOptimizer {
    pub fn new(max_cache_size: usize) -> Self {
        Self {
            cache: Arc::new(DashMap::new()),
            prefetcher: Arc::new(ContextPrefetcher::new()),
            max_cache_size,
        }
    }

    pub async fn complete_with_cache(&self, prompt: &str) -> Result<String> {
        let hash = self.semantic_hash(prompt);
        // Serve from cache if present, bumping the access count in place.
        // (`get_mut` avoids holding a read guard across a write to the same map.)
        if let Some(mut entry) = self.cache.get_mut(&hash) {
            entry.access_count += 1;
            return Ok(entry.value.clone());
        }
        // Kick off prefetching in the background while we generate
        let prefetch_handle = self.prefetcher.prefetch_next_context(prompt);
        // Simulate model completion (a real implementation would call the AI provider here)
        let result = self.simulate_model_completion(prompt).await?;
        // Store the fresh result in the cache
        let cache_entry = CacheEntry {
            value: result.clone(),
            timestamp: std::time::SystemTime::now(),
            access_count: 1,
            estimated_tokens: prompt.len() / 4, // Rough token estimation
        };
        self.cache.insert(hash, cache_entry);
        // Wait for the prefetch task; a real caller could let it run in the background instead
        let _ = prefetch_handle.await;
        // Enforce the cache size limit
        self.maintain_cache_size();
        Ok(result)
    }

    fn semantic_hash(&self, prompt: &str) -> String {
        // Create a hash that captures semantic meaning rather than exact text.
        // A real implementation would use embeddings or other similarity techniques.
        use std::collections::hash_map::DefaultHasher;
        use std::hash::{Hash, Hasher};

        let mut hasher = DefaultHasher::new();
        // Normalize the prompt (lowercase, collapse whitespace) before hashing
        let normalized = prompt
            .to_lowercase()
            .split_whitespace()
            .collect::<Vec<_>>()
            .join(" ");
        normalized.hash(&mut hasher);
        format!("{:x}", hasher.finish())
    }

    async fn simulate_model_completion(&self, prompt: &str) -> Result<String> {
        // Simulate the time it takes to get a response from a model.
        // A real implementation would call the actual AI provider.
        sleep(Duration::from_millis(100)).await; // Simulated API delay
        Ok(format!("Response to: {}", prompt))
    }

    fn maintain_cache_size(&self) {
        // Evict the least-used entries once the cache exceeds its limit
        if self.cache.len() > self.max_cache_size {
            // Snapshot the entries so we can sort them
            let mut entries: Vec<_> = self
                .cache
                .iter()
                .map(|entry| (entry.key().clone(), entry.value().clone()))
                .collect();
            // Sort by access count (ascending), then by timestamp (oldest first)
            entries.sort_by(|a, b| {
                a.1.access_count.cmp(&b.1.access_count).then(
                    a.1.timestamp
                        .duration_since(std::time::UNIX_EPOCH)
                        .unwrap_or_default()
                        .cmp(
                            &b.1.timestamp
                                .duration_since(std::time::UNIX_EPOCH)
                                .unwrap_or_default(),
                        ),
                )
            });
            // Remove the least-accessed/oldest entries, plus 10 extra as headroom
            let to_remove = entries.len() - self.max_cache_size + 10;
            for (key, _) in entries.iter().take(to_remove.min(entries.len())) {
                self.cache.remove(key);
            }
        }
    }

    pub fn get_cache_stats(&self) -> (usize, usize) {
        (self.cache.len(), self.max_cache_size)
    }
}

pub struct ContextPrefetcher {
    /// Simulated prefetched context, keyed by a prompt-derived "next operation" key
    prefetch_tasks: Arc<DashMap<String, String>>,
}

impl ContextPrefetcher {
    pub fn new() -> Self {
        Self {
            prefetch_tasks: Arc::new(DashMap::new()),
        }
    }

    pub fn prefetch_next_context(
        &self,
        current_prompt: &str,
    ) -> tokio::task::JoinHandle<Result<()>> {
        let prefetch_tasks = Arc::clone(&self.prefetch_tasks);
        let current_prompt = current_prompt.to_string();
        // Spawn a background task to prefetch likely next context
        tokio::spawn(async move {
            // A real implementation would analyze the current prompt to predict
            // what context will be needed next and prefetch it
            let next_context_key = format!("next_{}", current_prompt);
            // Simulate prefetching work
            sleep(Duration::from_millis(50)).await;
            // Store the prefetched context
            prefetch_tasks.insert(next_context_key, "prefetched_context".to_string());
            Ok(())
        })
    }

    pub async fn get_prefetched_context(&self, key: &str) -> Option<String> {
        self.prefetch_tasks.get(key).map(|v| v.value().clone())
    }
}

pub struct TokenEstimator;

impl TokenEstimator {
    pub fn estimate_tokens(&self, text: &str) -> usize {
        // Rough estimation: 1 token ≈ 4 characters for English text.
        // A real implementation would use the tokenizer for the specific model.
        (text.len() / 4).max(1)
    }

    pub fn estimate_tokens_exact(&self, text: &str) -> usize {
        // Finer-grained estimation that counts word, punctuation, and other character tokens
        let mut tokens = 0;
        let mut chars = text.chars().peekable();
        while let Some(ch) = chars.next() {
            match ch {
                c if c.is_alphanumeric() => {
                    // Count each alphanumeric run as one token
                    while chars.peek().map_or(false, |&c| c.is_alphanumeric()) {
                        chars.next();
                    }
                    tokens += 1;
                }
                c if c.is_whitespace() => {
                    // Skip whitespace
                }
                c if c.is_ascii_punctuation() => {
                    // Punctuation is typically a separate token
                    tokens += 1;
                }
                _ => {
                    // Other characters count as one token each
                    tokens += 1;
                }
            }
        }
        tokens.max(1)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[tokio::test]
    async fn test_latency_optimizer_cache() {
        let optimizer = LatencyOptimizer::new(100);
        // First call goes through the simulated model call
        let result1 = optimizer.complete_with_cache("test prompt").await.unwrap();
        assert!(result1.contains("test prompt"));
        // Second call should be served from the cache
        let result2 = optimizer.complete_with_cache("test prompt").await.unwrap();
        assert_eq!(result1, result2);
        // Check cache stats
        let (current_size, max_size) = optimizer.get_cache_stats();
        assert_eq!(current_size, 1);
        assert_eq!(max_size, 100);
    }

    #[test]
    fn test_semantic_hash() {
        let optimizer = LatencyOptimizer::new(100);
        // Prompts that normalize to the same text should share a hash
        let hash1 = optimizer.semantic_hash("Hello world");
        let hash2 = optimizer.semantic_hash("  hello   WORLD "); // extra whitespace, different case
        let hash3 = optimizer.semantic_hash("Goodbye world");
        assert_eq!(hash1, hash2); // Same after normalization
        assert_ne!(hash1, hash3); // Different content
    }

    #[test]
    fn test_token_estimator() {
        let estimator = TokenEstimator;
        let tokens = estimator.estimate_tokens("Hello world!");
        assert!(tokens > 0);
        let exact_tokens = estimator.estimate_tokens_exact("Hello world!");
        assert!(exact_tokens > 0);
        assert!(exact_tokens >= tokens);
    }
}