Skip to main content

oxibonsai_runtime/
semantic_cache.rs

1//! Semantic caching layer for LLM inference.
2//!
3//! Returns cached responses for semantically similar queries (above a cosine
4//! similarity threshold), avoiding redundant model inference.  The cache uses
5//! TF-IDF embeddings and cosine similarity for semantic matching, with LRU-style
6//! eviction and TTL-based expiry.
7//!
8//! # Example
9//!
10//! ```rust
11//! use oxibonsai_runtime::semantic_cache::{CachedInference, SemanticCacheConfig};
12//!
13//! let config = SemanticCacheConfig::default();
14//! let ci = CachedInference::new(config);
15//!
16//! let (response, was_hit) = ci.run_or_cache(
17//!     "What is Rust programming language?",
18//!     || "Rust is a systems programming language focused on safety.".to_string(),
19//! );
20//! assert!(!was_hit);
21//!
22//! let (response2, was_hit2) = ci.run_or_cache(
23//!     "Tell me about the Rust language",
24//!     || "Rust is a memory-safe systems language.".to_string(),
25//! );
26//! // May or may not be a hit depending on similarity
27//! let _ = (response2, was_hit2);
28//! ```
29
30use std::sync::Mutex;
31use std::time::{Duration, Instant};
32
33use oxibonsai_rag::embedding::{Embedder, TfIdfEmbedder};
34use oxibonsai_rag::vector_store::cosine_similarity;
35
36// ─────────────────────────────────────────────────────────────────────────────
37// SemanticCacheConfig
38// ─────────────────────────────────────────────────────────────────────────────
39
/// Tunable parameters for the semantic cache.
#[derive(Debug, Clone)]
pub struct SemanticCacheConfig {
    /// Cosine-similarity floor for a cache hit: a stored entry matches only
    /// when its score is at or above this value (default: 0.92).
    pub similarity_threshold: f32,
    /// Capacity of the cache.  Once it is reached, inserting a new entry
    /// evicts the least-recently-used one (default: 1000).
    pub max_entries: usize,
    /// How long an entry stays valid, measured from its creation
    /// (default: 1 hour).
    pub ttl: Duration,
    /// Whether streaming responses should be cached (default: false).
    pub cache_streaming: bool,
    /// Shortest prompt, in characters, that is eligible for caching; very
    /// short prompts vary too much to benefit from semantic matching
    /// (default: 20).
    pub min_prompt_chars: usize,
}

impl Default for SemanticCacheConfig {
    /// Builds the configuration with the defaults documented on each field.
    fn default() -> Self {
        let one_hour = Duration::from_secs(60 * 60);

        SemanticCacheConfig {
            min_prompt_chars: 20,
            cache_streaming: false,
            ttl: one_hour,
            max_entries: 1000,
            similarity_threshold: 0.92,
        }
    }
}
67
68// ─────────────────────────────────────────────────────────────────────────────
69// CachedResponse
70// ─────────────────────────────────────────────────────────────────────────────
71
/// The payload handed back to the caller when a lookup hits the cache.
#[derive(Debug, Clone)]
pub struct CachedResponse {
    /// Response text that was stored for the matched prompt.
    pub response: String,
    /// The stored prompt that this response was originally cached under.
    pub prompt: String,
    /// Cosine similarity between the lookup query and the stored prompt.
    pub similarity: f32,
    /// Creation time of the underlying cache entry.
    pub created_at: Instant,
    /// Number of times the entry has been served, including this hit.
    pub hit_count: u64,
}

impl CachedResponse {
    /// Reports whether the entry has outlived `ttl`, i.e. whether its age is
    /// strictly greater than `ttl`.
    pub fn is_expired(&self, ttl: Duration) -> bool {
        let age = self.age();
        age > ttl
    }

    /// How long ago the entry was created.
    pub fn age(&self) -> Duration {
        let born = self.created_at;
        born.elapsed()
    }
}
98
99// ─────────────────────────────────────────────────────────────────────────────
100// CacheEntry (internal)
101// ─────────────────────────────────────────────────────────────────────────────
102
/// One stored prompt→response pair plus the bookkeeping the cache needs for
/// matching, expiry and eviction.  Private to this module.
struct CacheEntry {
    /// Prompt text the entry was inserted under.
    prompt: String,
    /// Response text returned when the entry is hit.
    response: String,
    /// TF-IDF embedding of `prompt`, as produced by the embedder that was
    /// current when the vector was computed (presumably L2-normalised by the
    /// embedder; not verified in this file).
    vector: Vec<f32>,
    /// Creation time; compared against the configured TTL.
    created_at: Instant,
    /// Value of the global access clock at the most recent insert or hit.
    /// The entry with the smallest value is the LRU eviction victim.
    last_accessed: u64,
    /// Number of lookups this entry has satisfied.
    hit_count: u64,
}
114
115// ─────────────────────────────────────────────────────────────────────────────
116// SemanticCacheStats
117// ─────────────────────────────────────────────────────────────────────────────
118
119/// Statistics about the cache, suitable for monitoring and dashboards.
120#[derive(Debug, Clone, serde::Serialize)]
121pub struct SemanticCacheStats {
122    /// Total number of lookup attempts (hits + misses).
123    pub total_requests: u64,
124    /// Number of lookups that returned a cached response.
125    pub cache_hits: u64,
126    /// Number of lookups that did not find a matching entry.
127    pub cache_misses: u64,
128    /// Cache hit rate in `[0.0, 1.0]`.
129    pub hit_rate: f32,
130    /// Current number of entries in the cache.
131    pub entries: usize,
132    /// Number of LRU-based evictions (capacity exceeded).
133    pub evictions: u64,
134    /// Number of TTL-based evictions.
135    pub expired_evictions: u64,
136    /// Mean cosine similarity score across all cache hits.
137    pub avg_similarity_on_hit: f32,
138}
139
140impl Default for SemanticCacheStats {
141    fn default() -> Self {
142        Self {
143            total_requests: 0,
144            cache_hits: 0,
145            cache_misses: 0,
146            hit_rate: 0.0,
147            entries: 0,
148            evictions: 0,
149            expired_evictions: 0,
150            avg_similarity_on_hit: 0.0,
151        }
152    }
153}
154
155// ─────────────────────────────────────────────────────────────────────────────
156// SemanticCache
157// ─────────────────────────────────────────────────────────────────────────────
158
159/// Semantic cache using TF-IDF embeddings and cosine similarity.
160///
161/// The cache embeds every incoming prompt with a refittable TF-IDF model and
162/// performs a brute-force cosine search over stored entries.  When a result
163/// above [`SemanticCacheConfig::similarity_threshold`] is found and has not
164/// expired, the stored response is returned without running inference.
165///
166/// Thread-safety: all fields are guarded by `Mutex`.  The cache is `Send +
167/// Sync` and can be shared across threads via `Arc<SemanticCache>`.
168pub struct SemanticCache {
169    config: SemanticCacheConfig,
170    entries: Mutex<Vec<CacheEntry>>,
171    embedder: Mutex<TfIdfEmbedder>,
172    stats: Mutex<SemanticCacheStats>,
173    /// All prompts ever inserted — used to refit the TF-IDF embedder.
174    all_prompts: Mutex<Vec<String>>,
175    /// Global access clock for LRU ordering.
176    access_clock: Mutex<u64>,
177    /// Sum of similarity scores across all hits (for computing the mean).
178    similarity_sum: Mutex<f64>,
179}
180
/// Feature budget passed to `TfIdfEmbedder::fit` for the bootstrap model that
/// exists before any prompt has been inserted.  It also serves as the lower
/// bound on the feature budget for every later refit.  Keeping it positive
/// avoids a zero-dimension embedder.
const BOOTSTRAP_DIM: usize = 64;

/// Refit cadence: after the very first prompt, the embedder is rebuilt each
/// time the number of recorded prompts reaches a multiple of this value.
/// Refitting is expensive, so updates are batched.
const REFIT_BATCH_SIZE: usize = 16;
188
189impl SemanticCache {
190    /// Create a new [`SemanticCache`] with the given configuration.
191    ///
192    /// The TF-IDF embedder is bootstrapped with synthetic vocabulary so that
193    /// `lookup` calls before any `insert` return gracefully.
194    pub fn new(config: SemanticCacheConfig) -> Self {
195        // Bootstrap embedder: fit on a tiny synthetic corpus so that dim > 0.
196        let bootstrap_docs = [
197            "hello world query prompt response cache",
198            "semantic similarity cosine embedding language model",
199            "retrieval augmented generation inference rust",
200        ];
201        let embedder = TfIdfEmbedder::fit(&bootstrap_docs, BOOTSTRAP_DIM);
202
203        Self {
204            config,
205            entries: Mutex::new(Vec::new()),
206            embedder: Mutex::new(embedder),
207            stats: Mutex::new(SemanticCacheStats::default()),
208            all_prompts: Mutex::new(Vec::new()),
209            access_clock: Mutex::new(0),
210            similarity_sum: Mutex::new(0.0),
211        }
212    }
213
214    // ── Public API ────────────────────────────────────────────────────────────
215
216    /// Check whether a semantically similar response is cached.
217    ///
218    /// Returns `None` on a miss, or when the best-matching entry has expired.
219    /// On a hit, the entry's `hit_count` and the global access clock are updated.
220    pub fn lookup(&self, prompt: &str) -> Option<CachedResponse> {
221        if !self.is_cacheable(prompt) {
222            let mut stats = self.stats.lock().expect("stats lock poisoned");
223            stats.total_requests += 1;
224            stats.cache_misses += 1;
225            self.update_hit_rate(&mut stats);
226            return None;
227        }
228
229        // Embed the query using the current embedder.
230        let query_vec = {
231            let embedder = self.embedder.lock().expect("embedder lock poisoned");
232            match embedder.embed(prompt) {
233                Ok(v) => v,
234                Err(_) => {
235                    let mut stats = self.stats.lock().expect("stats lock poisoned");
236                    stats.total_requests += 1;
237                    stats.cache_misses += 1;
238                    self.update_hit_rate(&mut stats);
239                    return None;
240                }
241            }
242        };
243
244        let mut entries = self.entries.lock().expect("entries lock poisoned");
245        let ttl = self.config.ttl;
246        let threshold = self.config.similarity_threshold;
247
248        // Find the best non-expired match above the threshold.
249        let mut best_score = f32::NEG_INFINITY;
250        let mut best_idx: Option<usize> = None;
251
252        for (idx, entry) in entries.iter().enumerate() {
253            if entry.created_at.elapsed() > ttl {
254                continue; // skip expired
255            }
256            if entry.vector.len() != query_vec.len() {
257                continue; // dimension mismatch after a refit
258            }
259            let score = cosine_similarity(&query_vec, &entry.vector);
260            if score >= threshold && score > best_score {
261                best_score = score;
262                best_idx = Some(idx);
263            }
264        }
265
266        let mut stats = self.stats.lock().expect("stats lock poisoned");
267        stats.total_requests += 1;
268
269        match best_idx {
270            Some(idx) => {
271                // Advance access clock for LRU tracking.
272                let clock = {
273                    let mut c = self.access_clock.lock().expect("clock lock poisoned");
274                    *c += 1;
275                    *c
276                };
277                let entry = &mut entries[idx];
278                entry.hit_count += 1;
279                entry.last_accessed = clock;
280
281                let response = CachedResponse {
282                    response: entry.response.clone(),
283                    prompt: entry.prompt.clone(),
284                    similarity: best_score,
285                    created_at: entry.created_at,
286                    hit_count: entry.hit_count,
287                };
288
289                stats.cache_hits += 1;
290                self.update_hit_rate(&mut stats);
291
292                // Update rolling average similarity.
293                {
294                    let mut sim_sum = self
295                        .similarity_sum
296                        .lock()
297                        .expect("similarity_sum lock poisoned");
298                    *sim_sum += best_score as f64;
299                    stats.avg_similarity_on_hit = (*sim_sum / stats.cache_hits as f64) as f32;
300                }
301
302                Some(response)
303            }
304            None => {
305                stats.cache_misses += 1;
306                self.update_hit_rate(&mut stats);
307                None
308            }
309        }
310    }
311
312    /// Store a new `prompt`→`response` mapping in the cache.
313    ///
314    /// If the cache is at capacity, the least-recently-used entry is evicted.
315    /// The TF-IDF embedder is refitted periodically as new prompts accumulate.
316    pub fn insert(&self, prompt: &str, response: &str) {
317        if !self.is_cacheable(prompt) {
318            return;
319        }
320
321        // Add to the all_prompts list; refit if we've accumulated enough new ones.
322        {
323            let mut all_prompts = self.all_prompts.lock().expect("all_prompts lock poisoned");
324            all_prompts.push(prompt.to_string());
325
326            // Refit when: first insertion, or every REFIT_BATCH_SIZE new prompts.
327            let should_refit = all_prompts.len() == 1 || all_prompts.len() % REFIT_BATCH_SIZE == 0;
328            drop(all_prompts); // release before calling refit_embedder
329
330            if should_refit {
331                self.refit_embedder();
332            }
333        }
334
335        // Embed with the (possibly just refitted) embedder.
336        let vector = {
337            let embedder = self.embedder.lock().expect("embedder lock poisoned");
338            match embedder.embed(prompt) {
339                Ok(v) => v,
340                Err(_) => return, // silently skip unembed-able prompts
341            }
342        };
343
344        let clock = {
345            let mut c = self.access_clock.lock().expect("clock lock poisoned");
346            *c += 1;
347            *c
348        };
349
350        let mut entries = self.entries.lock().expect("entries lock poisoned");
351
352        // Evict LRU entry if at capacity.
353        if entries.len() >= self.config.max_entries {
354            let lru_idx = entries
355                .iter()
356                .enumerate()
357                .min_by_key(|(_, e)| e.last_accessed)
358                .map(|(i, _)| i)
359                .expect("entries is non-empty");
360            entries.swap_remove(lru_idx);
361
362            let mut stats = self.stats.lock().expect("stats lock poisoned");
363            stats.evictions += 1;
364        }
365
366        entries.push(CacheEntry {
367            prompt: prompt.to_string(),
368            response: response.to_string(),
369            vector,
370            created_at: Instant::now(),
371            last_accessed: clock,
372            hit_count: 0,
373        });
374
375        let mut stats = self.stats.lock().expect("stats lock poisoned");
376        stats.entries = entries.len();
377    }
378
379    /// Remove all expired entries from the cache.
380    ///
381    /// Returns the number of entries that were removed.
382    pub fn evict_expired(&self) -> usize {
383        let ttl = self.config.ttl;
384        let mut entries = self.entries.lock().expect("entries lock poisoned");
385        let before = entries.len();
386        entries.retain(|e| e.created_at.elapsed() <= ttl);
387        let removed = before - entries.len();
388
389        let mut stats = self.stats.lock().expect("stats lock poisoned");
390        stats.expired_evictions += removed as u64;
391        stats.entries = entries.len();
392
393        removed
394    }
395
396    /// Remove all entries and reset statistics.
397    pub fn clear(&self) {
398        self.entries.lock().expect("entries lock poisoned").clear();
399        self.all_prompts
400            .lock()
401            .expect("all_prompts lock poisoned")
402            .clear();
403        *self
404            .similarity_sum
405            .lock()
406            .expect("similarity_sum lock poisoned") = 0.0;
407        *self.stats.lock().expect("stats lock poisoned") = SemanticCacheStats::default();
408    }
409
410    /// Current number of entries in the cache.
411    pub fn len(&self) -> usize {
412        self.entries.lock().expect("entries lock poisoned").len()
413    }
414
415    /// Returns `true` if the cache contains no entries.
416    pub fn is_empty(&self) -> bool {
417        self.len() == 0
418    }
419
420    /// Snapshot of current cache statistics.
421    pub fn stats(&self) -> SemanticCacheStats {
422        self.stats.lock().expect("stats lock poisoned").clone()
423    }
424
425    // ── Private helpers ───────────────────────────────────────────────────────
426
427    /// Returns `true` if `prompt` is long enough to benefit from caching.
428    fn is_cacheable(&self, prompt: &str) -> bool {
429        prompt.len() >= self.config.min_prompt_chars
430    }
431
432    /// Refit the TF-IDF embedder using all prompts accumulated so far.
433    ///
434    /// After refitting, the dimension may change.  Existing entries whose
435    /// vector dimension no longer matches are implicitly skipped at lookup time
436    /// and will be replaced naturally as new entries arrive.
437    fn refit_embedder(&self) {
438        let all_prompts = self.all_prompts.lock().expect("all_prompts lock poisoned");
439        if all_prompts.is_empty() {
440            return;
441        }
442
443        // Determine a reasonable max_features: at least BOOTSTRAP_DIM, at most
444        // 4× the number of prompts to avoid a huge sparse vocabulary.
445        let max_features = BOOTSTRAP_DIM.max(all_prompts.len() * 4).min(4096);
446
447        let doc_refs: Vec<&str> = all_prompts.iter().map(|s| s.as_str()).collect();
448        let new_embedder = TfIdfEmbedder::fit(&doc_refs, max_features);
449        drop(all_prompts);
450
451        let mut embedder = self.embedder.lock().expect("embedder lock poisoned");
452        *embedder = new_embedder;
453    }
454
455    /// Update the `hit_rate` field of `stats` from its raw counters.
456    fn update_hit_rate(&self, stats: &mut SemanticCacheStats) {
457        stats.hit_rate = if stats.total_requests == 0 {
458            0.0
459        } else {
460            stats.cache_hits as f32 / stats.total_requests as f32
461        };
462    }
463}
464
465// ─────────────────────────────────────────────────────────────────────────────
466// CachedInference
467// ─────────────────────────────────────────────────────────────────────────────
468
469/// Middleware wrapper that checks the semantic cache before running inference.
470///
471/// ```rust
472/// use oxibonsai_runtime::semantic_cache::{CachedInference, SemanticCacheConfig};
473///
474/// let ci = CachedInference::new(SemanticCacheConfig::default());
475///
476/// // First call: cache miss — closure runs.
477/// let (resp, hit) = ci.run_or_cache(
478///     "What is the capital of France?",
479///     || "Paris is the capital of France.".to_string(),
480/// );
481/// assert!(!hit);
482/// assert_eq!(resp, "Paris is the capital of France.");
483/// ```
484pub struct CachedInference {
485    /// The underlying semantic cache.  Exposed so callers can inspect stats.
486    pub cache: SemanticCache,
487}
488
489impl CachedInference {
490    /// Create a new [`CachedInference`] backed by a freshly initialised cache.
491    pub fn new(config: SemanticCacheConfig) -> Self {
492        Self {
493            cache: SemanticCache::new(config),
494        }
495    }
496
497    /// Return a cached response if one exists, otherwise invoke `run_inference`
498    /// and store its result.
499    ///
500    /// # Returns
501    ///
502    /// `(response, was_cache_hit)` — the response string and whether it came
503    /// from the cache.
504    pub fn run_or_cache<F>(&self, prompt: &str, run_inference: F) -> (String, bool)
505    where
506        F: FnOnce() -> String,
507    {
508        // Check cache first.
509        if let Some(cached) = self.cache.lookup(prompt) {
510            return (cached.response, true);
511        }
512
513        // Cache miss: run inference and store the result.
514        let response = run_inference();
515        self.cache.insert(prompt, &response);
516        (response, false)
517    }
518}
519
520// ─────────────────────────────────────────────────────────────────────────────
521// Tests
522// ─────────────────────────────────────────────────────────────────────────────
523
#[cfg(test)]
mod tests {
    use super::*;

    /// Default configuration with the TTL shortened to 50 ms so expiry can be
    /// observed inside a test.
    fn short_ttl_config() -> SemanticCacheConfig {
        SemanticCacheConfig {
            ttl: Duration::from_millis(50),
            ..SemanticCacheConfig::default()
        }
    }

    /// Default configuration with the hit threshold lowered to 0.1.
    fn low_threshold_config() -> SemanticCacheConfig {
        SemanticCacheConfig {
            similarity_threshold: 0.1,
            ..SemanticCacheConfig::default()
        }
    }

    // ── Basic miss / hit ──────────────────────────────────────────────────────

    /// A freshly built cache has nothing to return.
    #[test]
    fn test_semantic_cache_miss_on_empty() {
        let empty = SemanticCache::new(SemanticCacheConfig::default());
        let outcome = empty.lookup("What is the meaning of life?");
        assert!(outcome.is_none());
    }

    /// Looking up the exact prompt that was inserted must hit with a
    /// similarity close to 1.
    #[test]
    fn test_semantic_cache_exact_match() {
        let question = "What is the capital of France and why is it important?";
        let cache = SemanticCache::new(low_threshold_config());
        cache.insert(question, "Paris is the capital of France.");

        let result = cache.lookup(question);
        assert!(result.is_some(), "exact prompt should hit the cache");
        let cached = result.expect("just asserted Some");

        // Exact match should yield similarity ≈ 1.0
        assert!(cached.similarity > 0.9, "similarity={}", cached.similarity);
        assert_eq!(cached.response, "Paris is the capital of France.");
    }

    /// One insert produces one entry that can be found again.
    #[test]
    fn test_semantic_cache_insert_and_lookup() {
        let question = "Explain the concept of machine learning in detail";
        let cache = SemanticCache::new(SemanticCacheConfig {
            similarity_threshold: 0.5,
            ..Default::default()
        });

        cache.insert(question, "Machine learning is a branch of AI.");
        assert_eq!(cache.len(), 1);

        let found = cache.lookup(question);
        assert!(found.is_some());
    }

    // ── TTL expiry ────────────────────────────────────────────────────────────

    /// An entry is served before its TTL elapses and ignored afterwards.
    #[test]
    fn test_semantic_cache_ttl_expiry() {
        let question = "Tell me everything about neural networks and deep learning";
        let cache = SemanticCache::new(short_ttl_config());
        cache.insert(question, "Neural networks are computational graphs.");

        // Immediately after insertion the entry is still fresh.
        let early = cache.lookup(question);
        assert!(early.is_some(), "should hit before TTL expires");

        // Sleep past the 50 ms TTL.
        std::thread::sleep(Duration::from_millis(100));

        let late = cache.lookup(question);
        assert!(late.is_none(), "should miss after TTL expires");
    }

    // ── Min prompt length ─────────────────────────────────────────────────────

    /// Prompts below the default 20-character floor are neither stored nor
    /// matched.
    #[test]
    fn test_semantic_cache_min_prompt_length() {
        let tiny = "Hi";
        let cache = SemanticCache::new(SemanticCacheConfig::default());

        cache.insert(tiny, "Hello!");

        assert_eq!(cache.len(), 0, "short prompt should not be cached");
        assert!(cache.lookup(tiny).is_none());
    }

    // ── Evict expired ─────────────────────────────────────────────────────────

    /// `evict_expired` removes every entry past its TTL and reports the count.
    #[test]
    fn test_semantic_cache_evict_expired() {
        let cache = SemanticCache::new(short_ttl_config());

        (0..5)
            .map(|i| {
                format!(
                    "This is a sufficiently long prompt number {} for caching purposes",
                    i
                )
            })
            .for_each(|prompt| cache.insert(&prompt, "response"));
        assert_eq!(cache.len(), 5);

        std::thread::sleep(Duration::from_millis(100));

        let removed = cache.evict_expired();
        assert_eq!(removed, 5, "all entries should have expired");
        assert_eq!(cache.len(), 0);
        assert_eq!(cache.stats().expired_evictions, 5);
    }

    // ── Statistics ────────────────────────────────────────────────────────────

    /// One hit plus one miss gives a 50 % hit rate.
    #[test]
    fn test_semantic_cache_stats_hit_rate() {
        let question = "Describe the architecture of transformer neural networks in depth";
        let cache = SemanticCache::new(low_threshold_config());
        cache.insert(question, "Transformers use attention mechanisms.");

        // First lookup hits; the second shares nothing with the stored prompt.
        let _ = cache.lookup(question);
        let _ = cache.lookup("Completely unrelated gibberish zzzzzzzz that matches nothing");

        let snapshot = cache.stats();
        assert_eq!(snapshot.total_requests, 2);
        assert_eq!(snapshot.cache_hits, 1);
        assert_eq!(snapshot.cache_misses, 1);
        assert!(
            (snapshot.hit_rate - 0.5).abs() < 1e-5,
            "hit_rate={}",
            snapshot.hit_rate
        );
    }

    // ── Clear ─────────────────────────────────────────────────────────────────

    /// `clear` empties the cache and resets the counters.
    #[test]
    fn test_semantic_cache_clear() {
        let cache = SemanticCache::new(low_threshold_config());

        for n in 0..10 {
            let text = format!(
                "This is prompt number {} that is long enough to be cached by the system",
                n
            );
            cache.insert(&text, "some response");
        }
        assert!(!cache.is_empty());

        cache.clear();

        assert!(cache.is_empty());
        assert_eq!(cache.stats().total_requests, 0);
    }

    // ── CachedInference ───────────────────────────────────────────────────────

    /// The second identical call is served from the cache without running the
    /// closure.
    #[test]
    fn test_cached_inference_returns_cached() {
        let question = "What is Rust and why is it used for systems programming?";
        let ci = CachedInference::new(low_threshold_config());

        let (first, first_hit) =
            ci.run_or_cache(question, || "Rust is a systems language.".to_string());
        assert!(!first_hit, "first call must be a miss");
        assert_eq!(first, "Rust is a systems language.");

        let (second, second_hit) = ci.run_or_cache(question, || panic!("should not be called"));
        assert!(second_hit, "second identical call must be a hit");
        assert_eq!(second, "Rust is a systems language.");
    }

    /// On a miss the closure runs and its output is returned.
    #[test]
    fn test_cached_inference_calls_fn_on_miss() {
        let ci = CachedInference::new(SemanticCacheConfig::default());
        let mut called = false;

        let (resp, hit) = ci.run_or_cache(
            "Explain quantum entanglement in detail for a physics student",
            || {
                called = true;
                "Quantum entanglement is a phenomenon…".to_string()
            },
        );

        assert!(called);
        assert!(!hit);
        assert!(!resp.is_empty());
    }

    // ── Config defaults ───────────────────────────────────────────────────────

    /// The documented defaults are what `Default` actually produces.
    #[test]
    fn test_cache_config_defaults() {
        let defaults = SemanticCacheConfig::default();
        assert_eq!(defaults.min_prompt_chars, 20);
        assert!(!defaults.cache_streaming);
        assert_eq!(defaults.ttl, Duration::from_secs(3600));
        assert_eq!(defaults.max_entries, 1000);
        assert!((defaults.similarity_threshold - 0.92).abs() < 1e-6);
    }

    // ── CachedResponse helpers ────────────────────────────────────────────────

    /// A just-created response is fresh under a 60 s TTL and expired under a
    /// zero TTL once any time has passed.
    #[test]
    fn test_cached_response_is_expired() {
        let snapshot = CachedResponse {
            response: "answer".to_string(),
            prompt: "question".to_string(),
            similarity: 0.95,
            created_at: Instant::now(),
            hit_count: 1,
        };
        assert!(!snapshot.is_expired(Duration::from_secs(60)));

        // Let a measurable amount of time pass so that elapsed > 0, which
        // makes even a zero TTL count as expired.
        std::thread::sleep(Duration::from_millis(1));
        assert!(snapshot.is_expired(Duration::ZERO));
    }
}