1use std::collections::HashMap;
33use std::hash::{DefaultHasher, Hash, Hasher};
34use std::sync::atomic::{AtomicU64, Ordering};
35use std::sync::RwLock;
36
/// A single token as stored in the cache.
///
/// Derives `PartialEq`/`Eq` so cached results can be compared directly
/// (for example with `assert_eq!`) instead of field by field.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CachedToken {
    /// Text of the token.
    pub surface: String,
    /// Tag string attached to the token (the tests in this file use values such as `"NNG"`).
    pub pos: String,
    /// Start offset of the token, in bytes.
    pub start_byte: usize,
    /// End offset of the token, in bytes.
    pub end_byte: usize,
}
49
/// Key under which token lists are stored in the cache: a 64-bit hash value
/// (see `TokenCache::make_key`, which derives it from the text).
pub type CacheKey = u64;
52
/// Limits and switches used when building a token cache.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CacheConfig {
    /// Number of entries at which an insert starts evicting the least recently used entry.
    pub max_entries: usize,
    /// Longest text, in bytes, that is cached when looking up by text; longer texts bypass the cache.
    pub max_key_length: usize,
    /// Whether hits, misses and evictions are counted.
    pub track_stats: bool,
}

impl Default for CacheConfig {
    /// Returns the same configuration as [`CacheConfig::new`].
    fn default() -> Self {
        // Delegate so the default values are written down in exactly one place.
        Self::new()
    }
}

impl CacheConfig {
    /// Creates the default configuration: 10 000 entries, 1024-byte keys, statistics enabled.
    #[must_use]
    pub const fn new() -> Self {
        Self {
            max_entries: 10_000,
            max_key_length: 1024,
            track_stats: true,
        }
    }

    /// Returns the configuration with `max_entries` replaced by `max`.
    #[must_use]
    pub const fn with_max_entries(mut self, max: usize) -> Self {
        self.max_entries = max;
        self
    }

    /// Returns the configuration with `max_key_length` replaced by `max` (a length in bytes).
    #[must_use]
    pub const fn with_max_key_length(mut self, max: usize) -> Self {
        self.max_key_length = max;
        self
    }

    /// Returns the configuration with statistics tracking switched on or off.
    #[must_use]
    pub const fn with_track_stats(mut self, track: bool) -> Self {
        self.track_stats = track;
        self
    }
}
106
/// Hit, miss and eviction counters for a token cache.
///
/// Each counter is an [`AtomicU64`] accessed with `Ordering::Relaxed`, so the
/// counters can be read and updated through a shared reference.
#[derive(Debug, Default)]
pub struct CacheStats {
    /// Number of recorded hits.
    hits: AtomicU64,
    /// Number of recorded misses.
    misses: AtomicU64,
    /// Number of recorded evictions.
    evictions: AtomicU64,
}

impl CacheStats {
    /// Returns the number of recorded hits.
    #[must_use]
    pub fn hits(&self) -> u64 {
        self.hits.load(Ordering::Relaxed)
    }

    /// Returns the number of recorded misses.
    #[must_use]
    pub fn misses(&self) -> u64 {
        self.misses.load(Ordering::Relaxed)
    }

    /// Returns hits plus misses, saturating at `u64::MAX` instead of overflowing.
    #[must_use]
    pub fn total_requests(&self) -> u64 {
        self.hits().saturating_add(self.misses())
    }

    /// Returns the fraction of requests that were hits, or `0.0` when nothing
    /// has been recorded yet.
    #[must_use]
    // Converting the counters to f64 may round very large counts; that is
    // acceptable for a ratio.
    #[allow(clippy::cast_precision_loss)]
    pub fn hit_rate(&self) -> f64 {
        // Read each counter exactly once so numerator and denominator come
        // from the same snapshot; re-reading `hits` could otherwise produce a
        // ratio above 1.0 when another thread records a hit in between.
        let hits = self.hits();
        let total = hits.saturating_add(self.misses());
        if total == 0 {
            0.0
        } else {
            hits as f64 / total as f64
        }
    }

    /// Returns the number of recorded evictions.
    #[must_use]
    pub fn evictions(&self) -> u64 {
        self.evictions.load(Ordering::Relaxed)
    }

    /// Adds one to the hit counter.
    fn record_hit(&self) {
        self.hits.fetch_add(1, Ordering::Relaxed);
    }

    /// Adds one to the miss counter.
    fn record_miss(&self) {
        self.misses.fetch_add(1, Ordering::Relaxed);
    }

    /// Adds one to the eviction counter.
    fn record_eviction(&self) {
        self.evictions.fetch_add(1, Ordering::Relaxed);
    }

    /// Sets all three counters back to zero.
    ///
    /// The counters are stored one after another, so a concurrent reader may
    /// observe a partially reset state.
    pub fn reset(&self) {
        self.hits.store(0, Ordering::Relaxed);
        self.misses.store(0, Ordering::Relaxed);
        self.evictions.store(0, Ordering::Relaxed);
    }
}
174
175struct CacheEntry {
177 tokens: Vec<CachedToken>,
179 last_access: u64,
181}
182
183pub struct TokenCache {
185 config: CacheConfig,
186 entries: RwLock<HashMap<CacheKey, CacheEntry>>,
187 stats: CacheStats,
188 access_counter: AtomicU64,
189}
190
191impl TokenCache {
192 #[must_use]
194 pub fn new(config: CacheConfig) -> Self {
195 Self {
196 config,
197 entries: RwLock::new(HashMap::new()),
198 stats: CacheStats::default(),
199 access_counter: AtomicU64::new(0),
200 }
201 }
202
203 #[must_use]
205 pub fn with_defaults() -> Self {
206 Self::new(CacheConfig::default())
207 }
208
209 #[must_use]
211 pub fn make_key(&self, text: &str) -> CacheKey {
212 let mut hasher = DefaultHasher::new();
213 text.hash(&mut hasher);
214 hasher.finish()
215 }
216
217 #[must_use]
219 pub fn get(&self, key: CacheKey) -> Option<Vec<CachedToken>> {
220 let mut entries = self.entries.write().ok()?;
221
222 if let Some(entry) = entries.get_mut(&key) {
223 entry.last_access = self.access_counter.fetch_add(1, Ordering::Relaxed);
224 if self.config.track_stats {
225 self.stats.record_hit();
226 }
227 Some(entry.tokens.clone())
228 } else {
229 if self.config.track_stats {
230 self.stats.record_miss();
231 }
232 None
233 }
234 }
235
236 pub fn insert(&self, key: CacheKey, tokens: Vec<CachedToken>) {
238 let Ok(mut entries) = self.entries.write() else {
239 return;
240 };
241
242 while entries.len() >= self.config.max_entries {
244 self.evict_lru(&mut entries);
245 }
246
247 let access = self.access_counter.fetch_add(1, Ordering::Relaxed);
248 entries.insert(key, CacheEntry {
249 tokens,
250 last_access: access,
251 });
252 }
253
254 pub fn get_or_insert<F>(&self, key: CacheKey, compute: F) -> Vec<CachedToken>
256 where
257 F: FnOnce() -> Vec<CachedToken>,
258 {
259 if let Some(tokens) = self.get(key) {
261 return tokens;
262 }
263
264 let tokens = compute();
266 self.insert(key, tokens.clone());
267 tokens
268 }
269
270 pub fn get_or_insert_with_text<F>(&self, text: &str, compute: F) -> Vec<CachedToken>
272 where
273 F: FnOnce() -> Vec<CachedToken>,
274 {
275 if text.len() > self.config.max_key_length {
277 return compute();
278 }
279
280 let key = self.make_key(text);
281 self.get_or_insert(key, compute)
282 }
283
284 fn evict_lru(&self, entries: &mut HashMap<CacheKey, CacheEntry>) {
286 if entries.is_empty() {
287 return;
288 }
289
290 let oldest_key = entries
292 .iter()
293 .min_by_key(|(_, entry)| entry.last_access)
294 .map(|(key, _)| *key);
295
296 if let Some(key) = oldest_key {
297 entries.remove(&key);
298 if self.config.track_stats {
299 self.stats.record_eviction();
300 }
301 }
302 }
303
304 pub fn clear(&self) {
306 if let Ok(mut entries) = self.entries.write() {
307 entries.clear();
308 }
309 }
310
311 #[must_use]
313 pub fn len(&self) -> usize {
314 self.entries.read().map(|e| e.len()).unwrap_or(0)
315 }
316
317 #[must_use]
319 pub fn is_empty(&self) -> bool {
320 self.len() == 0
321 }
322
323 #[must_use]
325 pub const fn stats(&self) -> &CacheStats {
326 &self.stats
327 }
328
329 #[must_use]
331 pub const fn config(&self) -> &CacheConfig {
332 &self.config
333 }
334}
335
336impl Default for TokenCache {
337 fn default() -> Self {
338 Self::with_defaults()
339 }
340}
341
342pub struct CachingTokenizer<T> {
344 inner: T,
345 cache: TokenCache,
346}
347
348impl<T> CachingTokenizer<T> {
349 pub fn new(inner: T, config: CacheConfig) -> Self {
351 Self {
352 inner,
353 cache: TokenCache::new(config),
354 }
355 }
356
357 pub fn with_defaults(inner: T) -> Self {
359 Self::new(inner, CacheConfig::default())
360 }
361
362 #[must_use]
364 pub const fn inner(&self) -> &T {
365 &self.inner
366 }
367
368 pub fn inner_mut(&mut self) -> &mut T {
370 &mut self.inner
371 }
372
373 #[must_use]
375 pub const fn cache(&self) -> &TokenCache {
376 &self.cache
377 }
378
379 #[must_use]
381 pub const fn stats(&self) -> &CacheStats {
382 self.cache.stats()
383 }
384
385 pub fn clear_cache(&self) {
387 self.cache.clear();
388 }
389}
390
#[cfg(test)]
mod tests {
    use super::*;
    use std::cell::Cell;

    /// Builds a token that starts at byte 0 and spans the whole of `surface`.
    fn token(surface: &str, pos: &str) -> CachedToken {
        CachedToken {
            surface: surface.to_string(),
            pos: pos.to_string(),
            start_byte: 0,
            end_byte: surface.len(),
        }
    }

    /// Inserts empty token lists under the keys of `text0`, `text1`, … in order.
    fn fill(cache: &TokenCache, count: usize) {
        for i in 0..count {
            let key = cache.make_key(&format!("text{i}"));
            cache.insert(key, Vec::new());
        }
    }

    /// Float comparison used for hit-rate checks.
    fn close_to(actual: f64, expected: f64) -> bool {
        (actual - expected).abs() < f64::EPSILON
    }

    #[test]
    fn test_cache_config_default() {
        let CacheConfig {
            max_entries,
            max_key_length,
            track_stats,
        } = CacheConfig::default();

        assert_eq!(max_entries, 10_000);
        assert_eq!(max_key_length, 1024);
        assert!(track_stats);
    }

    #[test]
    fn test_cache_config_builder() {
        let CacheConfig {
            max_entries,
            max_key_length,
            track_stats,
        } = CacheConfig::new()
            .with_max_entries(1000)
            .with_max_key_length(512)
            .with_track_stats(false);

        assert_eq!(max_entries, 1000);
        assert_eq!(max_key_length, 512);
        assert!(!track_stats);
    }

    #[test]
    fn test_cache_basic_operations() {
        let cache = TokenCache::with_defaults();
        let key = cache.make_key("테스트");

        // Nothing stored yet: the lookup is a miss.
        assert!(cache.get(key).is_none());
        assert_eq!(cache.stats().misses(), 1);

        cache.insert(key, vec![token("테스트", "NNG")]);

        // Now the lookup is a hit and returns what was stored.
        let cached = cache.get(key).unwrap();
        assert_eq!(cached.len(), 1);
        assert_eq!(cached[0].surface, "테스트");
        assert_eq!(cache.stats().hits(), 1);
    }

    #[test]
    fn test_cache_get_or_insert() {
        let cache = TokenCache::with_defaults();
        let key = cache.make_key("안녕");
        let calls = Cell::new(0_u32);

        // First call misses and runs the closure.
        let first = cache.get_or_insert(key, || {
            calls.set(calls.get() + 1);
            vec![token("안녕", "IC")]
        });
        assert_eq!(calls.get(), 1);
        assert_eq!(first.len(), 1);

        // Second call is served from the cache; the closure must not run.
        let second = cache.get_or_insert(key, || {
            calls.set(calls.get() + 1);
            vec![]
        });
        assert_eq!(calls.get(), 1);
        assert_eq!(second.len(), 1);
    }

    #[test]
    fn test_cache_lru_eviction() {
        let cache = TokenCache::new(CacheConfig::new().with_max_entries(3));

        fill(&cache, 3);
        assert_eq!(cache.len(), 3);

        // Touch text0 so that text1 becomes the least recently used entry.
        let key0 = cache.make_key("text0");
        let _ = cache.get(key0);

        // A fourth key forces exactly one eviction.
        cache.insert(cache.make_key("text3"), vec![]);
        assert_eq!(cache.len(), 3);
        assert_eq!(cache.stats().evictions(), 1);

        assert!(cache.get(key0).is_some());
        assert!(cache.get(cache.make_key("text1")).is_none());
    }

    #[test]
    fn test_cache_stats() {
        let cache = TokenCache::with_defaults();
        let key = cache.make_key("test");
        let stats = cache.stats();

        // One miss.
        let _ = cache.get(key);
        assert_eq!(stats.misses(), 1);
        assert_eq!(stats.hits(), 0);
        assert!(close_to(stats.hit_rate(), 0.0));

        // One miss plus one hit.
        cache.insert(key, vec![]);
        let _ = cache.get(key);
        assert_eq!(stats.hits(), 1);
        assert!(close_to(stats.hit_rate(), 0.5));

        stats.reset();
        assert_eq!(stats.total_requests(), 0);
    }

    #[test]
    fn test_cache_clear() {
        let cache = TokenCache::with_defaults();

        fill(&cache, 10);
        assert_eq!(cache.len(), 10);

        cache.clear();
        assert_eq!(cache.len(), 0);
        assert!(cache.is_empty());
    }

    #[test]
    fn test_cache_skip_long_text() {
        let cache = TokenCache::new(CacheConfig::new().with_max_key_length(10));
        let calls = Cell::new(0_u32);
        let compute = || -> Vec<CachedToken> {
            calls.set(calls.get() + 1);
            Vec::new()
        };

        // Short text fits the key limit: computed once, then cached.
        let short = "짧은";
        cache.get_or_insert_with_text(short, compute);
        cache.get_or_insert_with_text(short, compute);
        assert_eq!(calls.get(), 1);

        // Long text exceeds the key limit: computed on every call.
        let long = "이것은 아주 긴 텍스트입니다";
        cache.get_or_insert_with_text(long, compute);
        cache.get_or_insert_with_text(long, compute);
        assert_eq!(calls.get(), 3);
    }

    #[test]
    fn test_caching_tokenizer() {
        struct DummyTokenizer;

        let caching = CachingTokenizer::with_defaults(DummyTokenizer);
        let cache = caching.cache();

        assert!(cache.is_empty());
        assert_eq!(caching.stats().total_requests(), 0);

        cache.insert(cache.make_key("test"), vec![]);
        assert_eq!(cache.len(), 1);

        caching.clear_cache();
        assert!(cache.is_empty());
    }
}