1use std::collections::HashMap;
25use std::sync::atomic::{AtomicUsize, Ordering};
26
27use parking_lot::RwLock;
28
29#[derive(Debug)]
34pub struct PosTagInterner {
35 tags: RwLock<HashMap<String, u16>>,
37 reverse: RwLock<Vec<String>>,
39 intern_count: AtomicUsize,
41 hit_count: AtomicUsize,
43}
44
45impl PosTagInterner {
46 #[must_use]
50 pub fn new() -> Self {
51 let interner = Self {
52 tags: RwLock::new(HashMap::with_capacity(64)),
53 reverse: RwLock::new(Vec::with_capacity(64)),
54 intern_count: AtomicUsize::new(0),
55 hit_count: AtomicUsize::new(0),
56 };
57
58 for tag in COMMON_POS_TAGS {
60 interner.intern(tag);
61 }
62
63 interner
64 }
65
66 #[allow(clippy::significant_drop_tightening)]
70 pub fn intern(&self, tag: &str) -> u16 {
71 self.intern_count.fetch_add(1, Ordering::Relaxed);
72
73 {
75 let tags = self.tags.read();
76 if let Some(&idx) = tags.get(tag) {
77 self.hit_count.fetch_add(1, Ordering::Relaxed);
78 return idx;
79 }
80 }
81
82 let mut tags = self.tags.write();
84 let mut reverse = self.reverse.write();
85
86 if let Some(&idx) = tags.get(tag) {
88 self.hit_count.fetch_add(1, Ordering::Relaxed);
89 return idx;
90 }
91
92 let idx = u16::try_from(reverse.len()).unwrap_or(u16::MAX);
93 tags.insert(tag.to_string(), idx);
94 reverse.push(tag.to_string());
95 idx
96 }
97
98 #[must_use]
100 pub fn resolve(&self, idx: u16) -> Option<String> {
101 let reverse = self.reverse.read();
102 reverse.get(idx as usize).cloned()
103 }
104
105 pub fn resolve_ref<F, R>(&self, idx: u16, f: F) -> Option<R>
107 where
108 F: FnOnce(&str) -> R,
109 {
110 let reverse = self.reverse.read();
111 reverse.get(idx as usize).map(|s| f(s.as_str()))
112 }
113
114 #[must_use]
116 pub fn len(&self) -> usize {
117 self.reverse.read().len()
118 }
119
120 #[must_use]
122 pub fn is_empty(&self) -> bool {
123 self.reverse.read().is_empty()
124 }
125
126 #[must_use]
128 #[allow(clippy::cast_precision_loss)]
129 pub fn stats(&self) -> InternerStats {
130 let intern_count = self.intern_count.load(Ordering::Relaxed);
131 let hit_count = self.hit_count.load(Ordering::Relaxed);
132 InternerStats {
133 unique_tags: self.len(),
134 intern_calls: intern_count,
135 cache_hits: hit_count,
136 hit_rate: if intern_count > 0 {
137 hit_count as f64 / intern_count as f64
138 } else {
139 0.0
140 },
141 }
142 }
143
144 #[must_use]
146 #[allow(clippy::significant_drop_tightening)]
147 pub fn memory_usage(&self) -> usize {
148 let reverse = self.reverse.read();
149 let tags = self.tags.read();
150
151 let vec_overhead = reverse.capacity() * std::mem::size_of::<String>();
153 let string_bytes: usize = reverse.iter().map(String::len).sum();
155 let map_overhead = tags.capacity() * (std::mem::size_of::<String>() + 2);
157
158 vec_overhead + string_bytes + map_overhead
159 }
160}
161
162impl Default for PosTagInterner {
163 fn default() -> Self {
164 Self::new()
165 }
166}
167
/// Tags that `PosTagInterner::new` interns up front, in exactly this order.
///
/// Ids are assigned in insertion order, so reordering or removing entries
/// changes the id each preloaded tag receives.
///
/// NOTE(review): the group comments below are inferred from the tag names
/// (they look like the tag set used by Korean morphological dictionaries);
/// confirm against the dictionary documentation.
const COMMON_POS_TAGS: &[&str] = &[
    // N*: presumably noun-like tags.
    "NNG", "NNP", "NNB", "NR", "NP",
    // V*: presumably verb / adjective / copula tags.
    "VV", "VA", "VX", "VCP", "VCN",
    // M*: presumably modifier tags.
    "MM", "MAG", "MAJ",
    // Presumably interjection.
    "IC",
    // J*: presumably particle tags.
    "JKS", "JKC", "JKG", "JKO", "JKB", "JKV", "JKQ", "JX", "JC",
    // E* / X*: presumably endings, affixes and roots.
    "EP", "EF", "EC", "ETN", "ETM", "XPN", "XSN", "XSV", "XSA", "XR",
    // S*: presumably symbol / foreign-script / number tags.
    "SF", "SE", "SS", "SP", "SO", "SL", "SH", "SN", "SW",
    // Remaining entries: meaning not evident from this file.
    "NA",
    "UNK", "UNKNOWN",
    "*", "NNBC",
];
191
/// Point-in-time usage counters of a tag interner.
#[derive(Debug, Clone, Copy)]
pub struct InternerStats {
    /// Number of distinct tags stored.
    pub unique_tags: usize,
    /// Total number of interning calls.
    pub intern_calls: usize,
    /// Number of interning calls that found the tag already stored.
    pub cache_hits: usize,
    /// Fraction of calls that were hits; formatted as a percentage by `format`.
    pub hit_rate: f64,
}
204
205impl InternerStats {
206 #[must_use]
208 pub fn format(&self) -> String {
209 format!(
210 "POS Interner: {} unique tags, {} calls, {:.1}% hit rate",
211 self.unique_tags,
212 self.intern_calls,
213 self.hit_rate * 100.0
214 )
215 }
216}
217
/// Per-component memory figures, each expressed in bytes.
///
/// `Default` yields all zeros.
#[derive(Debug, Clone, Default)]
pub struct MemoryStats {
    /// Bytes attributed to the dictionary.
    pub dictionary_bytes: usize,
    /// Bytes attributed to the lattice.
    pub lattice_bytes: usize,
    /// Bytes attributed to the pool.
    pub pool_bytes: usize,
    /// Bytes attributed to the cache.
    pub cache_bytes: usize,
    /// Bytes attributed to the interner.
    pub interner_bytes: usize,
    /// Bytes attributed to tokens.
    pub token_bytes: usize,
}
234
235impl MemoryStats {
236 #[must_use]
238 pub const fn estimate_total(&self) -> usize {
239 self.dictionary_bytes
240 + self.lattice_bytes
241 + self.pool_bytes
242 + self.cache_bytes
243 + self.interner_bytes
244 + self.token_bytes
245 }
246
247 #[must_use]
249 pub fn format_human_readable(&self) -> String {
250 format!(
251 "Memory Usage:\n\
252 - Dictionary: {} KB\n\
253 - Lattice: {} KB\n\
254 - Pool: {} KB\n\
255 - Cache: {} KB\n\
256 - Interner: {} KB\n\
257 - Tokens: {} KB\n\
258 - Total: {} KB",
259 self.dictionary_bytes / 1024,
260 self.lattice_bytes / 1024,
261 self.pool_bytes / 1024,
262 self.cache_bytes / 1024,
263 self.interner_bytes / 1024,
264 self.token_bytes / 1024,
265 self.estimate_total() / 1024
266 )
267 }
268}
269
270#[derive(Debug)]
275pub struct FeatureCache {
276 features: RwLock<HashMap<String, u32>>,
278 reverse: RwLock<Vec<String>>,
280 max_size: usize,
282}
283
284impl FeatureCache {
285 #[must_use]
287 pub fn new(max_size: usize) -> Self {
288 Self {
289 features: RwLock::new(HashMap::with_capacity(max_size.min(10000))),
290 reverse: RwLock::new(Vec::with_capacity(max_size.min(10000))),
291 max_size,
292 }
293 }
294
295 #[allow(clippy::significant_drop_tightening)]
299 pub fn intern(&self, feature: &str) -> Option<u32> {
300 {
302 let features = self.features.read();
303 if let Some(&idx) = features.get(feature) {
304 return Some(idx);
305 }
306 }
307
308 let len = self.reverse.read().len();
310 if len >= self.max_size {
311 return None;
312 }
313
314 let mut features = self.features.write();
316 let mut reverse = self.reverse.write();
317
318 if let Some(&idx) = features.get(feature) {
319 return Some(idx);
320 }
321
322 if reverse.len() >= self.max_size {
323 return None;
324 }
325
326 let idx = u32::try_from(reverse.len()).ok()?;
327 features.insert(feature.to_string(), idx);
328 reverse.push(feature.to_string());
329 Some(idx)
330 }
331
332 #[must_use]
334 pub fn resolve(&self, idx: u32) -> Option<String> {
335 self.reverse.read().get(idx as usize).cloned()
336 }
337
338 #[must_use]
340 pub fn len(&self) -> usize {
341 self.reverse.read().len()
342 }
343
344 #[must_use]
346 pub fn is_empty(&self) -> bool {
347 self.reverse.read().is_empty()
348 }
349
350 #[must_use]
352 #[allow(clippy::significant_drop_tightening)]
353 pub fn memory_usage(&self) -> usize {
354 let reverse = self.reverse.read();
355 let features = self.features.read();
356
357 let vec_bytes: usize = reverse.iter().map(String::len).sum();
358 let map_overhead = features.capacity() * (std::mem::size_of::<String>() + 4);
359
360 vec_bytes + map_overhead
361 }
362}
363
364impl Default for FeatureCache {
365 fn default() -> Self {
366 Self::new(50000)
367 }
368}
369
370#[must_use]
374pub fn estimate_tokens_memory(tokens: &[crate::tokenizer::Token]) -> usize {
375 let base_size = std::mem::size_of_val(tokens);
376 let string_bytes: usize = tokens
377 .iter()
378 .map(|t| {
379 t.surface.len()
380 + t.pos.len()
381 + t.features.len()
382 + t.reading.as_ref().map_or(0, String::len)
383 + t.lemma.as_ref().map_or(0, String::len)
384 + t.normalized.as_ref().map_or(0, String::len)
385 })
386 .sum();
387
388 base_size + string_bytes
389}
390
#[cfg(test)]
mod tests {
    use super::*;

    /// Interning the same tag twice yields one id, and ids resolve back to their text.
    #[test]
    fn test_pos_tag_interner() {
        let table = PosTagInterner::new();

        let first = table.intern("NNG");
        let again = table.intern("NNG");
        assert_eq!(first, again);

        let custom = table.intern("CUSTOM_TAG");
        assert_ne!(first, custom);

        assert_eq!(table.resolve(first), Some("NNG".to_string()));
        assert_eq!(table.resolve(custom), Some("CUSTOM_TAG".to_string()));
    }

    /// Repeated lookups of known tags are counted as hits.
    #[test]
    fn test_pos_interner_stats() {
        let table = PosTagInterner::new();

        // 100 rounds of "NNG" then "VV": 200 calls on top of the preload calls.
        for tag in ["NNG", "VV"].iter().cycle().take(200) {
            table.intern(tag);
        }

        let stats = table.stats();
        assert!(stats.unique_tags > 0);
        // Strictly greater than 200 because `new` interns the common tags itself.
        assert!(stats.intern_calls > 200);
        assert!(stats.hit_rate > 0.75, "hit_rate: {}", stats.hit_rate);
    }

    /// A feature string gets one id and resolves back unchanged.
    #[test]
    fn test_feature_cache() {
        const FEATURE: &str = "NNG,*,T,테스트,*,*,*,*";
        let cache = FeatureCache::new(100);

        let first = cache.intern(FEATURE);
        assert!(first.is_some());

        let again = cache.intern(FEATURE);
        assert_eq!(first, again);

        assert_eq!(cache.resolve(first.unwrap()), Some(FEATURE.to_string()));
    }

    /// Once `max_size` entries are stored, new features are rejected.
    #[test]
    fn test_feature_cache_max_size() {
        let cache = FeatureCache::new(2);

        assert!(cache.intern("feature1").is_some());
        assert!(cache.intern("feature2").is_some());
        assert!(cache.intern("feature3").is_none());
    }

    /// The human-readable report prints whole kilobytes per component and in total.
    #[test]
    fn test_memory_stats_format() {
        const KB: usize = 1024;
        let stats = MemoryStats {
            dictionary_bytes: 100 * KB,
            lattice_bytes: 10 * KB,
            pool_bytes: 5 * KB,
            cache_bytes: 20 * KB,
            interner_bytes: KB,
            token_bytes: 2 * KB,
        };

        let report = stats.format_human_readable();
        assert!(report.contains("Dictionary: 100 KB"));
        assert!(report.contains("Total: 138 KB"));
    }

    /// A fresh interner already holds the common tags, all with small ids.
    #[test]
    fn test_common_pos_tags_preloaded() {
        let table = PosTagInterner::new();

        assert!(table.len() > 30);

        for tag in COMMON_POS_TAGS {
            let id = table.intern(tag);
            assert!(id < 100);
        }
    }
}