llm_edge_cache/
lib.rs

//! Multi-Tier Caching System for LLM Edge Agent
//!
//! This module implements a high-performance multi-tier caching system with:
//! - L1: In-memory cache (Moka) - <1ms latency, TinyLFU eviction
//! - L2: Distributed cache (Redis) - 1-2ms latency, persistent across instances
//!
//! # Architecture
//!
//! ```text
//! Request → L1 Lookup (in-memory)
//!            ├─ HIT → Return (0.1ms)
//!            └─ MISS
//!                ↓
//!           L2 Lookup (Redis)
//!            ├─ HIT → Populate L1 + Return (2ms)
//!            └─ MISS
//!                ↓
//!           Provider Execution
//!                ↓
//!           Async Write → L1 + L2 (non-blocking)
//! ```
//!
//! # Performance Targets
//! - L1 Latency: <1ms (typically <100μs)
//! - L2 Latency: 1-2ms
//! - Overall Hit Rate: >50% (MVP), >70% (Beta)
//! - L1 TTL: 5 minutes (default)
//! - L2 TTL: 1 hour (default)
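//!
//! # Example
//!
//! A minimal L1-only usage sketch (it assumes the crate is published as
//! `llm_edge_cache`, per the directory name; see [`CacheManager::with_l2`]
//! for enabling the Redis tier):
//!
//! ```rust,no_run
//! use llm_edge_cache::key::CacheableRequest;
//! use llm_edge_cache::{CacheLookupResult, CacheManager};
//!
//! # async fn example() {
//! let cache = CacheManager::new();
//! let request = CacheableRequest::new("gpt-4", "Hello, world!")
//!     .with_temperature(0.7)
//!     .with_max_tokens(100);
//!
//! match cache.lookup(&request).await {
//!     CacheLookupResult::L1Hit(resp) | CacheLookupResult::L2Hit(resp) => {
//!         // Serve the cached response without calling the provider.
//!         let _ = resp;
//!     }
//!     CacheLookupResult::Miss => {
//!         // Call the provider, build a `CachedResponse`, then write it back
//!         // with `cache.store(&request, response).await` for future requests.
//!     }
//! }
//! # }
//! ```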

pub mod key;
pub mod l1;
pub mod l2;
pub mod metrics;

use self::key::{generate_cache_key, CacheableRequest};
use self::l1::{CachedResponse, L1Cache};
use self::l2::{create_l2_cache_optional, L2Cache, L2Config};
use self::metrics::{CacheMetrics, MetricsSnapshot};
use std::sync::Arc;
use tracing::{debug, info, warn};

/// Result of a cache lookup operation
#[derive(Debug, Clone)]
pub enum CacheLookupResult {
    /// Cache hit from L1 (in-memory)
    L1Hit(Arc<CachedResponse>),
    /// Cache hit from L2 (Redis)
    L2Hit(Arc<CachedResponse>),
    /// Cache miss (need to fetch from provider)
    Miss,
}

impl CacheLookupResult {
    pub fn is_hit(&self) -> bool {
        matches!(self, Self::L1Hit(_) | Self::L2Hit(_))
    }

    pub fn response(&self) -> Option<Arc<CachedResponse>> {
        match self {
            Self::L1Hit(resp) | Self::L2Hit(resp) => Some(Arc::clone(resp)),
            Self::Miss => None,
        }
    }
}

/// Multi-tier cache orchestrator
///
/// This is the main interface for cache operations. It coordinates
/// lookups and writes across the L1 and L2 cache tiers.
pub struct CacheManager {
    l1: L1Cache,
    l2: Option<L2Cache>,
    metrics: CacheMetrics,
}

impl CacheManager {
    /// Create a new cache manager with a default L1 and no L2
    pub fn new() -> Self {
        let metrics = CacheMetrics::new();
        let l1 = L1Cache::new(metrics.clone());

        Self {
            l1,
            l2: None,
            metrics,
        }
    }

    /// Create a new cache manager with L1 and L2
    ///
    /// L2 creation goes through `create_l2_cache_optional`, so if the Redis
    /// connection cannot be established the manager degrades to L1-only
    /// operation instead of failing.
    pub async fn with_l2(l2_config: L2Config) -> Self {
        let metrics = CacheMetrics::new();
        let l1 = L1Cache::new(metrics.clone());
        let l2 = create_l2_cache_optional(l2_config, metrics.clone()).await;

        Self { l1, l2, metrics }
    }

    /// Look up a request in the cache
    ///
    /// # Flow
    /// 1. Check L1 (in-memory)
    /// 2. If miss, check L2 (Redis)
    /// 3. If L2 hit, populate L1
    /// 4. Return result
    ///
    /// # Performance
    /// - L1 hit: <1ms
    /// - L2 hit: 1-2ms
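    ///
    /// # Example
    ///
    /// A sketch using the [`CacheLookupResult`] helpers rather than matching
    /// on the variants directly (crate name assumed from the directory):
    ///
    /// ```rust,no_run
    /// # use llm_edge_cache::{key::CacheableRequest, CacheManager};
    /// # async fn example() {
    /// let cache = CacheManager::new();
    /// let request = CacheableRequest::new("gpt-4", "ping");
    ///
    /// let result = cache.lookup(&request).await;
    /// if let Some(response) = result.response() {
    ///     // Cache hit (L1 or L2): reuse the stored response.
    ///     let _ = response;
    /// } else {
    ///     // Miss: fall through to the provider and call `store` afterwards.
    /// }
    /// # }
    /// ```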
    pub async fn lookup(&self, request: &CacheableRequest) -> CacheLookupResult {
        let cache_key = generate_cache_key(request);

        // L1 lookup
        if let Some(response) = self.l1.get(&cache_key).await {
            debug!("Cache HIT: L1");
            return CacheLookupResult::L1Hit(response);
        }

        // L2 lookup (if available)
        if let Some(ref l2) = self.l2 {
            match l2.get(&cache_key).await {
                Ok(Some(response)) => {
                    debug!("Cache HIT: L2");

                    // Populate L1 asynchronously (fire-and-forget)
                    let l1_clone = self.l1.clone();
                    let key_clone = cache_key.clone();
                    let response_clone = response.clone();
                    tokio::spawn(async move {
                        l1_clone.set(key_clone, response_clone).await;
                    });

                    return CacheLookupResult::L2Hit(Arc::new(response));
                }
                Ok(None) => {
                    debug!("Cache MISS: L2");
                }
                Err(e) => {
                    warn!("L2 cache error during lookup: {}", e);
                }
            }
        }

        debug!("Cache MISS: all tiers");
        CacheLookupResult::Miss
    }

    /// Store a response in the cache
    ///
    /// Writes to L1 in-line (an in-memory operation) and to L2 in a background
    /// task, so the caller is never blocked on Redis. This should be called
    /// after receiving a response from the LLM provider.
    ///
    /// # Performance
    /// The L1 write completes in-line (<1ms); the L2 write is spawned onto a
    /// background task and does not block the caller.
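    ///
    /// # Example
    ///
    /// A sketch of writing a provider response back to the cache. The
    /// `CachedResponse` literal mirrors the fields used in this crate's tests;
    /// adapt it to however you construct responses in practice:
    ///
    /// ```rust,no_run
    /// # use llm_edge_cache::{key::CacheableRequest, l1::CachedResponse, CacheManager};
    /// # async fn example() {
    /// let cache = CacheManager::new();
    /// let request = CacheableRequest::new("gpt-4", "Hello, world!");
    ///
    /// let response = CachedResponse {
    ///     content: "Hi there!".to_string(),
    ///     tokens: None,
    ///     model: "gpt-4".to_string(),
    ///     cached_at: 0, // unix seconds; populate from your clock source
    /// };
    ///
    /// // Returns once the in-memory L1 write completes; the L2 write (if
    /// // configured) continues in a background task.
    /// cache.store(&request, response).await;
    /// # }
    /// ```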
    pub async fn store(&self, request: &CacheableRequest, response: CachedResponse) {
        let cache_key = generate_cache_key(request);

        // Write to L1 (fast, in-memory)
        self.l1.set(cache_key.clone(), response.clone()).await;

        // Write to L2 asynchronously (fire-and-forget)
        if let Some(ref l2) = self.l2 {
            let l2_clone = l2.clone();
            let key_clone = cache_key.clone();
            let response_clone = response.clone();

            tokio::spawn(async move {
                if let Err(e) = l2_clone.set(key_clone, response_clone).await {
                    warn!("L2 cache write error: {}", e);
                }
            });
        }
    }

    /// Store a response, overriding the default TTL for the L2 (Redis) entry
    ///
    /// The L1 write behaves exactly as in `store`; only the L2 entry uses
    /// `l2_ttl_seconds`.
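    ///
    /// A short sketch, pinning a response in L2 for 24 hours (the values are
    /// illustrative only):
    ///
    /// ```rust,no_run
    /// # use llm_edge_cache::{key::CacheableRequest, l1::CachedResponse, CacheManager};
    /// # async fn example(cache: &CacheManager, request: &CacheableRequest, response: CachedResponse) {
    /// cache.store_with_ttl(request, response, 24 * 60 * 60).await;
    /// # }
    /// ```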
    pub async fn store_with_ttl(
        &self,
        request: &CacheableRequest,
        response: CachedResponse,
        l2_ttl_seconds: u64,
    ) {
        let cache_key = generate_cache_key(request);

        // Write to L1
        self.l1.set(cache_key.clone(), response.clone()).await;

        // Write to L2 with custom TTL
        if let Some(ref l2) = self.l2 {
            let l2_clone = l2.clone();
            let key_clone = cache_key.clone();
            let response_clone = response.clone();

            tokio::spawn(async move {
                if let Err(e) = l2_clone
                    .set_with_ttl(key_clone, response_clone, l2_ttl_seconds)
                    .await
                {
                    warn!("L2 cache write with TTL error: {}", e);
                }
            });
        }
    }

    /// Invalidate a cache entry across all tiers
    pub async fn invalidate(&self, request: &CacheableRequest) {
        let cache_key = generate_cache_key(request);

        // Remove from L1
        self.l1.remove(&cache_key).await;

        // Remove from L2
        if let Some(ref l2) = self.l2 {
            if let Err(e) = l2.remove(&cache_key).await {
                warn!("L2 cache delete error: {}", e);
            }
        }
    }

    /// Clear all cache entries (use with caution!)
    pub async fn clear_all(&self) {
        info!("Clearing all cache tiers");

        self.l1.clear().await;

        if let Some(ref l2) = self.l2 {
            if let Err(e) = l2.clear().await {
                warn!("L2 cache clear error: {}", e);
            }
        }
    }

    /// Check health of cache tiers
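    ///
    /// # Example
    ///
    /// A sketch of a readiness probe built on this call (the probe function
    /// itself is hypothetical):
    ///
    /// ```rust,no_run
    /// # use llm_edge_cache::CacheManager;
    /// # async fn cache_is_ready(cache: &CacheManager) -> bool {
    /// let status = cache.health_check().await;
    /// // L1 is always available in-process; a configured but unreachable L2
    /// // marks the cache as degraded.
    /// status.is_fully_healthy()
    /// # }
    /// ```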
    pub async fn health_check(&self) -> CacheHealthStatus {
        let l1_healthy = true; // L1 is always healthy (in-memory)
        let l2_healthy = if let Some(ref l2) = self.l2 {
            l2.health_check().await
        } else {
            false // L2 not configured
        };

        CacheHealthStatus {
            l1_healthy,
            l2_healthy,
            l2_configured: self.l2.is_some(),
        }
    }

    /// Get metrics snapshot
    pub fn metrics_snapshot(&self) -> MetricsSnapshot {
        self.metrics.snapshot()
    }

    /// Get L1 cache entry count
    pub fn l1_entry_count(&self) -> u64 {
        self.l1.entry_count()
    }

    /// Get L2 cache approximate size
    pub async fn l2_approximate_size(&self) -> Option<usize> {
        if let Some(ref l2) = self.l2 {
            l2.approximate_size().await.ok()
        } else {
            None
        }
    }

    /// Check if L2 is configured and available
    pub fn has_l2(&self) -> bool {
        self.l2.is_some()
    }

    /// Get shared metrics instance
    pub fn metrics(&self) -> &CacheMetrics {
        &self.metrics
    }
}

impl Clone for CacheManager {
    fn clone(&self) -> Self {
        // L1Cache (Moka) and L2Cache (Redis ConnectionManager) are cheap,
        // shareable handles, so clones operate on the same underlying caches
        // and metrics rather than starting with an empty L1 and no L2.
        Self {
            l1: self.l1.clone(),
            l2: self.l2.clone(),
            metrics: self.metrics.clone(),
        }
    }
}

/// Cache health status
#[derive(Debug, Clone)]
pub struct CacheHealthStatus {
    pub l1_healthy: bool,
    pub l2_healthy: bool,
    pub l2_configured: bool,
}

impl CacheHealthStatus {
    pub fn is_fully_healthy(&self) -> bool {
        if self.l2_configured {
            self.l1_healthy && self.l2_healthy
        } else {
            self.l1_healthy
        }
    }
}

impl Default for CacheManager {
    fn default() -> Self {
        Self::new()
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::l1::TokenUsage;
    use chrono::Utc;

    fn create_test_request() -> CacheableRequest {
        CacheableRequest::new("gpt-4", "Hello, world!")
            .with_temperature(0.7)
            .with_max_tokens(100)
    }

    fn create_test_response(content: &str) -> CachedResponse {
        CachedResponse {
            content: content.to_string(),
            tokens: Some(TokenUsage {
                prompt_tokens: 10,
                completion_tokens: 20,
                total_tokens: 30,
            }),
            model: "gpt-4".to_string(),
            cached_at: Utc::now().timestamp(),
        }
    }

    #[tokio::test]
    async fn test_cache_manager_l1_only() {
        let cache = CacheManager::new();
        let request = create_test_request();

        // Initial lookup should miss
        let result = cache.lookup(&request).await;
        assert!(matches!(result, CacheLookupResult::Miss));

        // Store value
        cache
            .store(&request, create_test_response("Test response"))
            .await;

        // Should hit L1 now
        let result = cache.lookup(&request).await;
        assert!(result.is_hit());
        if let CacheLookupResult::L1Hit(response) = result {
            assert_eq!(response.content, "Test response");
        } else {
            panic!("Expected L1 hit");
        }
    }
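
    // A companion test sketch exercising the invalidation path on the same
    // L1-only setup as above (no helpers beyond those defined in this module).
    #[tokio::test]
    async fn test_cache_manager_invalidate() {
        let cache = CacheManager::new();
        let request = create_test_request();

        cache
            .store(&request, create_test_response("Stale response"))
            .await;
        assert!(cache.lookup(&request).await.is_hit());

        // Invalidation removes the entry from every configured tier (only L1 here).
        cache.invalidate(&request).await;
        assert!(matches!(
            cache.lookup(&request).await,
            CacheLookupResult::Miss
        ));
    }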
}