1use std::collections::HashMap;
33use std::hash::{DefaultHasher, Hash, Hasher};
34use std::sync::atomic::{AtomicU64, Ordering};
35use std::sync::RwLock;
36
/// A single token as stored in the cache.
///
/// Equality is field-wise, so cached token lists can be compared directly
/// with `==` / `assert_eq!` instead of checking every field by hand.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CachedToken {
    /// Text of the token.
    pub surface: String,
    /// Tag attached to the token. The tests in this file use values such as
    /// `"NNG"` and `"IC"`, so this is presumably a part-of-speech tag —
    /// confirm with the code that produces the tokens.
    pub pos: String,
    /// Start offset of the token; presumably a byte offset into the analysed
    /// text (TODO confirm with the producer).
    pub start_byte: usize,
    /// End offset of the token; presumably an exclusive byte offset (the tests
    /// use the UTF-8 byte length of the surface) — TODO confirm.
    pub end_byte: usize,
}
49
/// Key under which a token list is stored in the cache.
///
/// Keys are produced by `TokenCache::make_key`, which hashes the input text
/// down to a `u64`. The cache entries keep only this hash, not the text, so
/// two different texts that hash to the same value would share one entry.
pub type CacheKey = u64;
52
/// Settings that control how a token cache behaves.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CacheConfig {
    /// Upper bound on the number of entries kept at once. When a new entry
    /// would exceed it, the least recently used entry is evicted first.
    pub max_entries: usize,
    /// Longest text, measured in bytes (`str::len`), that
    /// `get_or_insert_with_text` will cache. Longer texts are computed
    /// every time and never stored.
    pub max_key_length: usize,
    /// Whether hit, miss and eviction counters are updated.
    pub track_stats: bool,
}

impl Default for CacheConfig {
    /// Returns the same settings as [`CacheConfig::new`].
    fn default() -> Self {
        // Delegate so the default values are written down in exactly one place.
        Self::new()
    }
}

impl CacheConfig {
    /// Creates the default configuration: 10 000 entries, texts of up to
    /// 1024 bytes, statistics enabled.
    ///
    /// Unlike `Default::default`, this is usable in `const` contexts.
    #[must_use]
    pub const fn new() -> Self {
        Self {
            max_entries: 10_000,
            max_key_length: 1024,
            track_stats: true,
        }
    }

    /// Returns the configuration with `max_entries` replaced by `max`.
    #[must_use]
    pub const fn with_max_entries(mut self, max: usize) -> Self {
        self.max_entries = max;
        self
    }

    /// Returns the configuration with `max_key_length` replaced by `max`.
    #[must_use]
    pub const fn with_max_key_length(mut self, max: usize) -> Self {
        self.max_key_length = max;
        self
    }

    /// Returns the configuration with statistics tracking switched on or off.
    #[must_use]
    pub const fn with_track_stats(mut self, track: bool) -> Self {
        self.track_stats = track;
        self
    }
}
106
/// Hit, miss and eviction counters for a cache.
///
/// Each counter is an independent atomic updated with relaxed ordering, so
/// the counters can be bumped through a shared reference. Values read while
/// other threads are updating them are approximate snapshots.
#[derive(Debug, Default)]
pub struct CacheStats {
    hits: AtomicU64,
    misses: AtomicU64,
    evictions: AtomicU64,
}

impl CacheStats {
    /// Number of recorded hits.
    #[must_use]
    pub fn hits(&self) -> u64 {
        self.hits.load(Ordering::Relaxed)
    }

    /// Number of recorded misses.
    #[must_use]
    pub fn misses(&self) -> u64 {
        self.misses.load(Ordering::Relaxed)
    }

    /// Sum of recorded hits and misses.
    #[must_use]
    pub fn total_requests(&self) -> u64 {
        self.hits() + self.misses()
    }

    /// Fraction of requests that were hits, in `0.0..=1.0`.
    ///
    /// Returns `0.0` when nothing has been recorded yet.
    #[must_use]
    // A ratio of counters does not need more precision than `f64` offers.
    #[allow(clippy::cast_precision_loss)]
    pub fn hit_rate(&self) -> f64 {
        // Load each counter exactly once. Reading `hits` a second time after
        // the total has been computed could observe newer hits and yield a
        // ratio above 1.0 while other threads are recording.
        let hits = self.hits();
        let misses = self.misses();
        let total = hits + misses;
        if total == 0 {
            0.0
        } else {
            hits as f64 / total as f64
        }
    }

    /// Number of recorded evictions.
    #[must_use]
    pub fn evictions(&self) -> u64 {
        self.evictions.load(Ordering::Relaxed)
    }

    /// Adds one to the hit counter.
    fn record_hit(&self) {
        self.hits.fetch_add(1, Ordering::Relaxed);
    }

    /// Adds one to the miss counter.
    fn record_miss(&self) {
        self.misses.fetch_add(1, Ordering::Relaxed);
    }

    /// Adds one to the eviction counter.
    fn record_eviction(&self) {
        self.evictions.fetch_add(1, Ordering::Relaxed);
    }

    /// Sets all three counters back to zero.
    ///
    /// The counters are cleared one after another, not as a single atomic step.
    pub fn reset(&self) {
        self.hits.store(0, Ordering::Relaxed);
        self.misses.store(0, Ordering::Relaxed);
        self.evictions.store(0, Ordering::Relaxed);
    }
}
174
175struct CacheEntry {
177 tokens: Vec<CachedToken>,
179 last_access: u64,
181}
182
183pub struct TokenCache {
185 config: CacheConfig,
186 entries: RwLock<HashMap<CacheKey, CacheEntry>>,
187 stats: CacheStats,
188 access_counter: AtomicU64,
189}
190
191impl TokenCache {
192 #[must_use]
194 pub fn new(config: CacheConfig) -> Self {
195 Self {
196 config,
197 entries: RwLock::new(HashMap::new()),
198 stats: CacheStats::default(),
199 access_counter: AtomicU64::new(0),
200 }
201 }
202
203 #[must_use]
205 pub fn with_defaults() -> Self {
206 Self::new(CacheConfig::default())
207 }
208
209 #[must_use]
211 pub fn make_key(&self, text: &str) -> CacheKey {
212 let mut hasher = DefaultHasher::new();
213 text.hash(&mut hasher);
214 hasher.finish()
215 }
216
217 #[must_use]
219 pub fn get(&self, key: CacheKey) -> Option<Vec<CachedToken>> {
220 let mut entries = self.entries.write().ok()?;
221
222 if let Some(entry) = entries.get_mut(&key) {
223 entry.last_access = self.access_counter.fetch_add(1, Ordering::Relaxed);
224 if self.config.track_stats {
225 self.stats.record_hit();
226 }
227 Some(entry.tokens.clone())
228 } else {
229 if self.config.track_stats {
230 self.stats.record_miss();
231 }
232 None
233 }
234 }
235
236 pub fn insert(&self, key: CacheKey, tokens: Vec<CachedToken>) {
238 let Ok(mut entries) = self.entries.write() else {
239 return;
240 };
241
242 while entries.len() >= self.config.max_entries {
244 self.evict_lru(&mut entries);
245 }
246
247 let access = self.access_counter.fetch_add(1, Ordering::Relaxed);
248 entries.insert(
249 key,
250 CacheEntry {
251 tokens,
252 last_access: access,
253 },
254 );
255 }
256
257 pub fn get_or_insert<F>(&self, key: CacheKey, compute: F) -> Vec<CachedToken>
259 where
260 F: FnOnce() -> Vec<CachedToken>,
261 {
262 if let Some(tokens) = self.get(key) {
264 return tokens;
265 }
266
267 let tokens = compute();
269 self.insert(key, tokens.clone());
270 tokens
271 }
272
273 pub fn get_or_insert_with_text<F>(&self, text: &str, compute: F) -> Vec<CachedToken>
275 where
276 F: FnOnce() -> Vec<CachedToken>,
277 {
278 if text.len() > self.config.max_key_length {
280 return compute();
281 }
282
283 let key = self.make_key(text);
284 self.get_or_insert(key, compute)
285 }
286
287 fn evict_lru(&self, entries: &mut HashMap<CacheKey, CacheEntry>) {
289 if entries.is_empty() {
290 return;
291 }
292
293 let oldest_key = entries
295 .iter()
296 .min_by_key(|(_, entry)| entry.last_access)
297 .map(|(key, _)| *key);
298
299 if let Some(key) = oldest_key {
300 entries.remove(&key);
301 if self.config.track_stats {
302 self.stats.record_eviction();
303 }
304 }
305 }
306
307 pub fn clear(&self) {
309 if let Ok(mut entries) = self.entries.write() {
310 entries.clear();
311 }
312 }
313
314 #[must_use]
316 pub fn len(&self) -> usize {
317 self.entries.read().map_or(0, |e| e.len())
318 }
319
320 #[must_use]
322 pub fn is_empty(&self) -> bool {
323 self.len() == 0
324 }
325
326 #[must_use]
328 pub const fn stats(&self) -> &CacheStats {
329 &self.stats
330 }
331
332 #[must_use]
334 pub const fn config(&self) -> &CacheConfig {
335 &self.config
336 }
337}
338
339impl Default for TokenCache {
340 fn default() -> Self {
341 Self::with_defaults()
342 }
343}
344
345pub struct CachingTokenizer<T> {
347 inner: T,
348 cache: TokenCache,
349}
350
351impl<T> CachingTokenizer<T> {
352 pub fn new(inner: T, config: CacheConfig) -> Self {
354 Self {
355 inner,
356 cache: TokenCache::new(config),
357 }
358 }
359
360 pub fn with_defaults(inner: T) -> Self {
362 Self::new(inner, CacheConfig::default())
363 }
364
365 #[must_use]
367 pub const fn inner(&self) -> &T {
368 &self.inner
369 }
370
371 pub fn inner_mut(&mut self) -> &mut T {
373 &mut self.inner
374 }
375
376 #[must_use]
378 pub const fn cache(&self) -> &TokenCache {
379 &self.cache
380 }
381
382 #[must_use]
384 pub const fn stats(&self) -> &CacheStats {
385 self.cache.stats()
386 }
387
388 pub fn clear_cache(&self) {
390 self.cache.clear();
391 }
392}
393
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a token that starts at byte 0 and ends at `end_byte`.
    fn token(surface: &str, pos: &str, end_byte: usize) -> CachedToken {
        CachedToken {
            surface: surface.to_string(),
            pos: pos.to_string(),
            start_byte: 0,
            end_byte,
        }
    }

    /// The default configuration carries the documented default values.
    #[test]
    fn test_cache_config_default() {
        let CacheConfig {
            max_entries,
            max_key_length,
            track_stats,
        } = CacheConfig::default();

        assert_eq!(max_entries, 10_000);
        assert_eq!(max_key_length, 1024);
        assert!(track_stats);
    }

    /// Every builder method overrides exactly its own field.
    #[test]
    fn test_cache_config_builder() {
        let built = CacheConfig::new()
            .with_max_entries(1000)
            .with_max_key_length(512)
            .with_track_stats(false);

        assert_eq!(built.max_entries, 1000);
        assert_eq!(built.max_key_length, 512);
        assert!(!built.track_stats);
    }

    /// A miss followed by an insert and a hit, with the counters checked.
    #[test]
    fn test_cache_basic_operations() {
        let cache = TokenCache::with_defaults();
        let key = cache.make_key("테스트");

        // Nothing stored yet: the lookup is a miss.
        assert!(cache.get(key).is_none());
        assert_eq!(cache.stats().misses(), 1);

        cache.insert(key, vec![token("테스트", "NNG", 9)]);

        // The stored token list comes back and counts as a hit.
        let cached = cache.get(key).unwrap();
        assert_eq!(cached.len(), 1);
        assert_eq!(cached[0].surface, "테스트");
        assert_eq!(cache.stats().hits(), 1);
    }

    /// The compute closure runs only on the first request for a key.
    #[test]
    fn test_cache_get_or_insert() {
        let cache = TokenCache::with_defaults();
        let key = cache.make_key("안녕");
        let mut call_count = 0;

        // First request: the closure is invoked and its result is stored.
        let first = cache.get_or_insert(key, || {
            call_count += 1;
            vec![token("안녕", "IC", 6)]
        });
        assert_eq!(call_count, 1);
        assert_eq!(first.len(), 1);

        // Second request: served from the cache, the closure is not invoked.
        let second = cache.get_or_insert(key, || {
            call_count += 1;
            Vec::new()
        });
        assert_eq!(call_count, 1);
        assert_eq!(second.len(), 1);
    }

    /// With the cache full, a new key pushes out the least recently used one.
    #[test]
    fn test_cache_lru_eviction() {
        let cache = TokenCache::new(CacheConfig::new().with_max_entries(3));
        let keys: Vec<CacheKey> = (0..4)
            .map(|i| cache.make_key(&format!("text{i}")))
            .collect();

        // Fill the cache to capacity.
        for &key in &keys[..3] {
            cache.insert(key, Vec::new());
        }
        assert_eq!(cache.len(), 3);

        // Touch text0 so that text1 becomes the oldest entry.
        let _ = cache.get(keys[0]);

        // A fourth key forces exactly one eviction.
        cache.insert(keys[3], Vec::new());
        assert_eq!(cache.len(), 3);
        assert_eq!(cache.stats().evictions(), 1);

        // text0 survived because it was touched ...
        assert!(cache.get(keys[0]).is_some());

        // ... and text1 is the entry that was removed.
        assert!(cache.get(keys[1]).is_none());
    }

    /// Counters, hit rate and reset behave as expected.
    #[test]
    fn test_cache_stats() {
        let cache = TokenCache::with_defaults();
        let stats = cache.stats();
        let key = cache.make_key("test");

        // One miss.
        let _ = cache.get(key);
        assert_eq!(stats.misses(), 1);
        assert_eq!(stats.hits(), 0);
        assert!((stats.hit_rate() - 0.0).abs() < f64::EPSILON);

        // One hit after inserting: one hit out of two requests.
        cache.insert(key, Vec::new());
        let _ = cache.get(key);
        assert_eq!(stats.hits(), 1);
        assert!((stats.hit_rate() - 0.5).abs() < f64::EPSILON);

        // Reset clears the request counters.
        stats.reset();
        assert_eq!(stats.total_requests(), 0);
    }

    /// `clear` drops every stored entry.
    #[test]
    fn test_cache_clear() {
        let cache = TokenCache::with_defaults();

        (0..10)
            .map(|i| cache.make_key(&format!("text{i}")))
            .for_each(|key| cache.insert(key, Vec::new()));
        assert_eq!(cache.len(), 10);

        cache.clear();
        assert_eq!(cache.len(), 0);
        assert!(cache.is_empty());
    }

    /// Texts longer than `max_key_length` are recomputed on every request.
    #[test]
    fn test_cache_skip_long_text() {
        let cache = TokenCache::new(CacheConfig::new().with_max_key_length(10));
        let mut call_count = 0;

        // Short text: computed once, then served from the cache.
        let short = "짧은";
        for _ in 0..2 {
            cache.get_or_insert_with_text(short, || {
                call_count += 1;
                Vec::new()
            });
        }
        assert_eq!(call_count, 1);

        // Long text: bypasses the cache, so it is computed both times.
        let long = "이것은 아주 긴 텍스트입니다";
        for _ in 0..2 {
            cache.get_or_insert_with_text(long, || {
                call_count += 1;
                Vec::new()
            });
        }
        assert_eq!(call_count, 3);
    }

    /// The wrapper exposes its cache and can clear it.
    #[test]
    fn test_caching_tokenizer() {
        struct DummyTokenizer;

        let caching = CachingTokenizer::with_defaults(DummyTokenizer);
        let cache = caching.cache();

        assert!(cache.is_empty());
        assert_eq!(caching.stats().total_requests(), 0);

        // Insert through the exposed cache.
        let key = cache.make_key("test");
        cache.insert(key, Vec::new());
        assert_eq!(cache.len(), 1);

        // Clearing through the wrapper empties that same cache.
        caching.clear_cache();
        assert!(cache.is_empty());
    }
}