use std::collections::HashMap;
use std::sync::atomic::{AtomicUsize, Ordering};
use parking_lot::RwLock;
/// Interner that maps POS tag strings to compact `u16` indices.
///
/// The forward and reverse tables sit behind separate `RwLock`s and the
/// counters are atomics, so every operation works through `&self`.
#[derive(Debug)]
pub struct PosTagInterner {
/// Forward table: tag text -> index.
tags: RwLock<HashMap<String, u16>>,
/// Reverse table: the string stored at position `i` is the tag for index `i`.
reverse: RwLock<Vec<String>>,
/// Number of `intern` calls made so far (the preload done by `new` goes
/// through `intern` as well).
intern_count: AtomicUsize,
/// Number of `intern` calls that found the tag already present.
hit_count: AtomicUsize,
}
impl PosTagInterner {
    /// Creates an interner preloaded with every tag in `COMMON_POS_TAGS`.
    ///
    /// The preload goes through [`intern`](Self::intern), so those calls are
    /// included in the counters reported by [`stats`](Self::stats).
    #[must_use]
    pub fn new() -> Self {
        let interner = Self {
            tags: RwLock::new(HashMap::with_capacity(64)),
            reverse: RwLock::new(Vec::with_capacity(64)),
            intern_count: AtomicUsize::new(0),
            hit_count: AtomicUsize::new(0),
        };
        for tag in COMMON_POS_TAGS {
            interner.intern(tag);
        }
        interner
    }

    /// Returns the index for `tag`, storing the tag first if it is new.
    ///
    /// Every call bumps the call counter; calls that find the tag already
    /// stored also bump the hit counter.
    ///
    /// # Capacity
    ///
    /// At most `u16::MAX` distinct tags (indices `0..=u16::MAX - 1`) are
    /// stored. Once the table is full, interning a *new* tag stores nothing
    /// and returns `u16::MAX`, which [`resolve`](Self::resolve) maps to
    /// `None`. This keeps an overflowing tag from silently sharing an index
    /// with a different stored tag and stops the tables from growing without
    /// bound.
    // The write guards must stay alive until both tables have been updated.
    #[allow(clippy::significant_drop_tightening)]
    pub fn intern(&self, tag: &str) -> u16 {
        /// Index reserved to signal "table full"; never assigned to a tag.
        const OVERFLOW_INDEX: u16 = u16::MAX;

        self.intern_count.fetch_add(1, Ordering::Relaxed);

        // Fast path: shared lock only. The guard is dropped before the write
        // locks are requested below.
        {
            let tags = self.tags.read();
            if let Some(&idx) = tags.get(tag) {
                self.hit_count.fetch_add(1, Ordering::Relaxed);
                return idx;
            }
        }

        // Slow path. Lock order is `tags` then `reverse`; every method that
        // holds both locks must use the same order.
        let mut tags = self.tags.write();
        let mut reverse = self.reverse.write();

        // Another thread may have inserted the tag between the two lock
        // acquisitions, so look again under the write lock.
        if let Some(&idx) = tags.get(tag) {
            self.hit_count.fetch_add(1, Ordering::Relaxed);
            return idx;
        }

        let idx = match u16::try_from(reverse.len()) {
            Ok(idx) if idx != OVERFLOW_INDEX => idx,
            // Table is full: report the sentinel without storing anything.
            _ => return OVERFLOW_INDEX,
        };
        tags.insert(tag.to_string(), idx);
        reverse.push(tag.to_string());
        idx
    }

    /// Returns an owned copy of the tag stored at `idx`, or `None` if no tag
    /// has that index.
    #[must_use]
    pub fn resolve(&self, idx: u16) -> Option<String> {
        let reverse = self.reverse.read();
        reverse.get(idx as usize).cloned()
    }

    /// Calls `f` with the tag stored at `idx` and returns its result, or
    /// `None` if no tag has that index. Avoids the allocation done by
    /// [`resolve`](Self::resolve).
    ///
    /// `f` runs while the read lock on the reverse table is held, so it must
    /// not call [`intern`](Self::intern) on the same interner: interning a new
    /// tag needs the write lock on that table and would block forever.
    pub fn resolve_ref<F, R>(&self, idx: u16, f: F) -> Option<R>
    where
        F: FnOnce(&str) -> R,
    {
        let reverse = self.reverse.read();
        reverse.get(idx as usize).map(|s| f(s.as_str()))
    }

    /// Number of distinct tags currently stored.
    #[must_use]
    pub fn len(&self) -> usize {
        self.reverse.read().len()
    }

    /// Returns `true` when no tag is stored.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.reverse.read().is_empty()
    }

    /// Returns a snapshot of the usage counters.
    ///
    /// `hit_rate` is `cache_hits / intern_calls`, or `0.0` when `intern` has
    /// never been called. The two counters are read one after the other, not
    /// as one atomic pair.
    #[must_use]
    // Counters are converted to f64 only to compute a ratio for reporting.
    #[allow(clippy::cast_precision_loss)]
    pub fn stats(&self) -> InternerStats {
        let intern_count = self.intern_count.load(Ordering::Relaxed);
        let hit_count = self.hit_count.load(Ordering::Relaxed);
        InternerStats {
            unique_tags: self.len(),
            intern_calls: intern_count,
            cache_hits: hit_count,
            hit_rate: if intern_count > 0 {
                hit_count as f64 / intern_count as f64
            } else {
                0.0
            },
        }
    }

    /// Rough estimate, in bytes, of the memory held by the interner.
    ///
    /// Counts the reverse vector's slots (`capacity * size_of::<String>()`),
    /// the byte length of every stored tag, and
    /// `capacity * (size_of::<String>() + 2)` for the map.
    #[must_use]
    // Both guards are needed for the whole computation.
    #[allow(clippy::significant_drop_tightening)]
    pub fn memory_usage(&self) -> usize {
        // Acquire in the same order as `intern` (`tags` before `reverse`).
        // Taking them the other way round lets this thread hold `reverse`
        // while an interning thread holds `tags`, each waiting on the other.
        let tags = self.tags.read();
        let reverse = self.reverse.read();
        let vec_overhead = reverse.capacity() * std::mem::size_of::<String>();
        let string_bytes: usize = reverse.iter().map(String::len).sum();
        let map_overhead = tags.capacity() * (std::mem::size_of::<String>() + 2);
        vec_overhead + string_bytes + map_overhead
    }
}
/// `Default` delegates to `PosTagInterner::new`.
impl Default for PosTagInterner {
/// Returns the same value as `PosTagInterner::new()`.
fn default() -> Self {
Self::new()
}
}
/// Tags that `PosTagInterner::new` interns up front.
///
/// Order matters: the tags are interned in sequence, so reordering this list
/// changes the index each tag receives. The list is laid out one prefix
/// family per line purely for readability.
// NOTE(review): the names look like a Korean (Sejong / mecab-ko-dic style)
// tag set -- confirm before documenting individual meanings.
const COMMON_POS_TAGS: &[&str] = &[
    // N* tags
    "NNG", "NNP", "NNB", "NR", "NP",
    // V* tags
    "VV", "VA", "VX", "VCP", "VCN",
    // M* tags and IC
    "MM", "MAG", "MAJ", "IC",
    // J* tags
    "JKS", "JKC", "JKG", "JKO", "JKB", "JKV", "JKQ", "JX", "JC",
    // E* tags
    "EP", "EF", "EC", "ETN", "ETM",
    // X* tags
    "XPN", "XSN", "XSV", "XSA", "XR",
    // S* tags
    "SF", "SE", "SS", "SP", "SO", "SL", "SH", "SN", "SW",
    // Fallback / miscellaneous entries
    "NA", "UNK", "UNKNOWN", "*", "NNBC",
];
/// Snapshot of interner usage counters, as produced by `PosTagInterner::stats`.
#[derive(Debug, Clone, Copy)]
pub struct InternerStats {
    /// Number of distinct tags stored when the snapshot was taken.
    pub unique_tags: usize,
    /// Number of `intern` calls recorded.
    pub intern_calls: usize,
    /// Number of `intern` calls that found the tag already stored.
    pub cache_hits: usize,
    /// Hit ratio as a fraction (`1.0` means every call was a hit).
    pub hit_rate: f64,
}

impl InternerStats {
    /// Renders a one-line summary, with the hit rate shown as a percentage
    /// to one decimal place.
    #[must_use]
    pub fn format(&self) -> String {
        let Self {
            unique_tags,
            intern_calls,
            hit_rate,
            ..
        } = *self;
        // `hit_rate` is stored as a fraction; scale it for display.
        let hit_percent = hit_rate * 100.0;
        format!(
            "POS Interner: {} unique tags, {} calls, {:.1}% hit rate",
            unique_tags, intern_calls, hit_percent
        )
    }
}
/// Per-component memory figures, in bytes.
#[derive(Debug, Clone, Default)]
pub struct MemoryStats {
    /// Bytes attributed to the dictionary.
    pub dictionary_bytes: usize,
    /// Bytes attributed to the lattice.
    pub lattice_bytes: usize,
    /// Bytes attributed to the pool.
    pub pool_bytes: usize,
    /// Bytes attributed to the cache.
    pub cache_bytes: usize,
    /// Bytes attributed to the interner.
    pub interner_bytes: usize,
    /// Bytes attributed to tokens.
    pub token_bytes: usize,
}

impl MemoryStats {
    /// Sum of all component figures, in bytes.
    ///
    /// The addition saturates at `usize::MAX`. The fields are public and can
    /// hold any value, and a plain `+` would panic in debug builds and wrap
    /// to a misleadingly small number in release builds.
    #[must_use]
    pub const fn estimate_total(&self) -> usize {
        self.dictionary_bytes
            .saturating_add(self.lattice_bytes)
            .saturating_add(self.pool_bytes)
            .saturating_add(self.cache_bytes)
            .saturating_add(self.interner_bytes)
            .saturating_add(self.token_bytes)
    }

    /// Renders a multi-line report with every figure in whole KB.
    ///
    /// Each figure is divided by 1024 with integer division, so values are
    /// truncated (anything below 1024 bytes prints as `0 KB`).
    #[must_use]
    pub fn format_human_readable(&self) -> String {
        // The trailing `\` on each line drops the newline and the next line's
        // leading whitespace, so source indentation never reaches the output.
        format!(
            "Memory Usage:\n\
             - Dictionary: {} KB\n\
             - Lattice: {} KB\n\
             - Pool: {} KB\n\
             - Cache: {} KB\n\
             - Interner: {} KB\n\
             - Tokens: {} KB\n\
             - Total: {} KB",
            self.dictionary_bytes / 1024,
            self.lattice_bytes / 1024,
            self.pool_bytes / 1024,
            self.cache_bytes / 1024,
            self.interner_bytes / 1024,
            self.token_bytes / 1024,
            self.estimate_total() / 1024
        )
    }
}
/// Bounded interner that maps feature strings to `u32` indices.
///
/// The forward and reverse tables sit behind separate `RwLock`s, so every
/// operation works through `&self`.
#[derive(Debug)]
pub struct FeatureCache {
/// Forward table: feature text -> index.
features: RwLock<HashMap<String, u32>>,
/// Reverse table: the string stored at position `i` is the feature for index `i`.
reverse: RwLock<Vec<String>>,
/// Maximum number of distinct features stored; `intern` returns `None`
/// for new features once this many are held.
max_size: usize,
}
impl FeatureCache {
    /// Creates an empty cache that stores at most `max_size` distinct
    /// features.
    ///
    /// Both tables are pre-allocated for `max_size` entries, capped at
    /// 10000 so that a large limit does not reserve memory up front.
    #[must_use]
    pub fn new(max_size: usize) -> Self {
        Self {
            features: RwLock::new(HashMap::with_capacity(max_size.min(10000))),
            reverse: RwLock::new(Vec::with_capacity(max_size.min(10000))),
            max_size,
        }
    }

    /// Returns the index for `feature`, storing it first if it is new.
    ///
    /// Returns `None` when the feature is not yet stored and either the
    /// cache already holds `max_size` entries or the next index does not fit
    /// in a `u32`. Features that are already stored are always found, even
    /// when the cache is full.
    // The write guards must stay alive until both tables have been updated.
    #[allow(clippy::significant_drop_tightening)]
    pub fn intern(&self, feature: &str) -> Option<u32> {
        // Fast path: shared lock only, released before anything else is locked.
        {
            let features = self.features.read();
            if let Some(&idx) = features.get(feature) {
                return Some(idx);
            }
        }

        // Cheap early exit when full; the temporary read guard is released at
        // the end of this statement.
        let len = self.reverse.read().len();
        if len >= self.max_size {
            return None;
        }

        // Slow path. Lock order is `features` then `reverse`; every method
        // that holds both locks must use the same order.
        let mut features = self.features.write();
        let mut reverse = self.reverse.write();

        // State may have changed while no lock was held: re-check both the
        // lookup and the size limit under the write locks.
        if let Some(&idx) = features.get(feature) {
            return Some(idx);
        }
        if reverse.len() >= self.max_size {
            return None;
        }

        let idx = u32::try_from(reverse.len()).ok()?;
        features.insert(feature.to_string(), idx);
        reverse.push(feature.to_string());
        Some(idx)
    }

    /// Returns an owned copy of the feature stored at `idx`, or `None` if no
    /// feature has that index.
    #[must_use]
    pub fn resolve(&self, idx: u32) -> Option<String> {
        self.reverse.read().get(idx as usize).cloned()
    }

    /// Number of distinct features currently stored.
    #[must_use]
    pub fn len(&self) -> usize {
        self.reverse.read().len()
    }

    /// Returns `true` when no feature is stored.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.reverse.read().is_empty()
    }

    /// Rough estimate, in bytes, of the memory held by the cache.
    ///
    /// Counts the byte length of every stored feature plus
    /// `capacity * (size_of::<String>() + 4)` for the map.
    #[must_use]
    // Both guards are needed for the whole computation.
    #[allow(clippy::significant_drop_tightening)]
    pub fn memory_usage(&self) -> usize {
        // Acquire in the same order as `intern` (`features` before
        // `reverse`). Taking them the other way round lets this thread hold
        // `reverse` while an interning thread holds `features`, each waiting
        // on the other.
        let features = self.features.read();
        let reverse = self.reverse.read();
        let vec_bytes: usize = reverse.iter().map(String::len).sum();
        let map_overhead = features.capacity() * (std::mem::size_of::<String>() + 4);
        vec_bytes + map_overhead
    }
}
/// `Default` builds a cache through `FeatureCache::new` with a fixed limit.
impl Default for FeatureCache {
/// Returns a cache limited to 50000 distinct features.
fn default() -> Self {
Self::new(50000)
}
}
/// Estimates, in bytes, the memory occupied by a slice of tokens.
///
/// The result is `size_of_val(tokens)` (the inline storage of the slice)
/// plus, for every token, the lengths of `surface`, `pos` and `features` and
/// of `reading`, `lemma` and `normalized` when they are present. Lengths are
/// used rather than capacities, so spare capacity is not counted.
#[must_use]
pub fn estimate_tokens_memory(tokens: &[crate::tokenizer::Token]) -> usize {
    // Length of an optional string field; an absent field contributes nothing.
    let optional_len = |field: Option<&String>| field.map_or(0, String::len);

    let mut content_bytes = 0_usize;
    for token in tokens {
        let required = token.surface.len() + token.pos.len() + token.features.len();
        let optional = optional_len(token.reading.as_ref())
            + optional_len(token.lemma.as_ref())
            + optional_len(token.normalized.as_ref());
        content_bytes += required + optional;
    }

    std::mem::size_of_val(tokens) + content_bytes
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Re-interning a tag yields the same index; a new tag gets a distinct
    /// index and both resolve back to their text.
    #[test]
    fn test_pos_tag_interner() {
        let interner = PosTagInterner::new();

        let first = interner.intern("NNG");
        let again = interner.intern("NNG");
        assert_eq!(first, again);

        let custom = interner.intern("CUSTOM_TAG");
        assert_ne!(first, custom);

        assert_eq!(interner.resolve(first).as_deref(), Some("NNG"));
        assert_eq!(interner.resolve(custom).as_deref(), Some("CUSTOM_TAG"));
    }

    /// Repeated lookups of preloaded tags are counted and push the hit rate up.
    #[test]
    fn test_pos_interner_stats() {
        let interner = PosTagInterner::new();
        for _ in 0..100 {
            interner.intern("NNG");
            interner.intern("VV");
        }

        let InternerStats {
            unique_tags,
            intern_calls,
            hit_rate,
            ..
        } = interner.stats();
        assert!(unique_tags > 0);
        // Strictly more than the 200 calls above: the preload in `new` is counted too.
        assert!(intern_calls > 200);
        assert!(hit_rate > 0.75, "hit_rate: {}", hit_rate);
    }

    /// A feature string round-trips through intern/resolve with a stable index.
    #[test]
    fn test_feature_cache() {
        const FEATURE: &str = "NNG,*,T,테스트,*,*,*,*";
        let cache = FeatureCache::new(100);

        let first = cache.intern(FEATURE);
        assert!(first.is_some());

        let second = cache.intern(FEATURE);
        assert_eq!(first, second);

        assert_eq!(cache.resolve(first.unwrap()), Some(FEATURE.to_string()));
    }

    /// New features are rejected once `max_size` entries are stored.
    #[test]
    fn test_feature_cache_max_size() {
        let cache = FeatureCache::new(2);
        let accepted_first = cache.intern("feature1");
        let accepted_second = cache.intern("feature2");
        let rejected = cache.intern("feature3");

        assert!(accepted_first.is_some());
        assert!(accepted_second.is_some());
        assert!(rejected.is_none());
    }

    /// The human-readable report shows whole-KB figures and their total.
    #[test]
    fn test_memory_stats_format() {
        const KB: usize = 1024;
        let stats = MemoryStats {
            dictionary_bytes: 100 * KB,
            lattice_bytes: 10 * KB,
            pool_bytes: 5 * KB,
            cache_bytes: 20 * KB,
            interner_bytes: KB,
            token_bytes: 2 * KB,
        };

        let report = stats.format_human_readable();
        assert!(report.contains("Dictionary: 100 KB"));
        assert!(report.contains("Total: 138 KB"));
    }

    /// Every common tag is already present after `new`, with a small index.
    #[test]
    fn test_common_pos_tags_preloaded() {
        let interner = PosTagInterner::new();
        assert!(interner.len() > 30);
        assert!(COMMON_POS_TAGS
            .iter()
            .all(|tag| interner.intern(tag) < 100));
    }
}