Skip to main content

mecab_ko_dict/
string_pool.rs

1//! String pooling utilities for memory-efficient string storage.
2//!
3//! This module provides a string pool that deduplicates strings, reducing
4//! memory usage when many dictionary entries share the same surface forms
5//! or features.
6//!
7//! # Examples
8//!
9//! ```rust
10//! use mecab_ko_dict::string_pool::StringPool;
11//!
12//! let mut pool = StringPool::new();
13//!
14//! // Intern strings - identical strings return the same Arc
15//! let s1 = pool.intern("안녕하세요");
16//! let s2 = pool.intern("안녕하세요");
17//!
18//! // s1 and s2 point to the same allocation
19//! assert!(std::sync::Arc::ptr_eq(&s1, &s2));
20//!
21//! // Check pool statistics
22//! assert_eq!(pool.len(), 1);
23//! ```
24
25use std::collections::HashMap;
26use std::sync::Arc;
27
28/// A string pool that deduplicates strings using reference counting.
29///
30/// Strings are stored as `Arc<str>`, allowing multiple references to
31/// the same string without duplication. The pool maintains a weak
32/// reference to each string, allowing unused strings to be garbage
33/// collected.
34#[derive(Debug, Default)]
35pub struct StringPool {
36    /// Interned strings (strong references).
37    strings: HashMap<Arc<str>, ()>,
38}
39
40impl StringPool {
41    /// Creates a new empty string pool.
42    #[must_use]
43    pub fn new() -> Self {
44        Self {
45            strings: HashMap::new(),
46        }
47    }
48
49    /// Creates a string pool with the specified capacity.
50    #[must_use]
51    pub fn with_capacity(capacity: usize) -> Self {
52        Self {
53            strings: HashMap::with_capacity(capacity),
54        }
55    }
56
57    /// Interns a string, returning a reference-counted handle.
58    ///
59    /// If the string already exists in the pool, returns a clone of
60    /// the existing `Arc<str>`. Otherwise, creates a new entry.
61    pub fn intern(&mut self, s: &str) -> Arc<str> {
62        // Check if already interned
63        if let Some((existing, ())) = self.strings.get_key_value(s) {
64            return Arc::clone(existing);
65        }
66
67        // Create new interned string
68        let arc: Arc<str> = Arc::from(s);
69        self.strings.insert(Arc::clone(&arc), ());
70        arc
71    }
72
73    /// Interns a string from an owned String.
74    ///
75    /// This is more efficient than `intern` when you already have a String,
76    /// as it avoids an extra allocation if the string is not already pooled.
77    pub fn intern_string(&mut self, s: String) -> Arc<str> {
78        // Check if already interned
79        if let Some((existing, ())) = self.strings.get_key_value(s.as_str()) {
80            return Arc::clone(existing);
81        }
82
83        // Create new interned string from owned String
84        let arc: Arc<str> = Arc::from(s);
85        self.strings.insert(Arc::clone(&arc), ());
86        arc
87    }
88
89    /// Returns the number of unique strings in the pool.
90    #[must_use]
91    pub fn len(&self) -> usize {
92        self.strings.len()
93    }
94
95    /// Returns true if the pool is empty.
96    #[must_use]
97    pub fn is_empty(&self) -> bool {
98        self.strings.is_empty()
99    }
100
101    /// Clears all strings from the pool.
102    pub fn clear(&mut self) {
103        self.strings.clear();
104    }
105
106    /// Returns the memory usage of the pool in bytes (approximate).
107    ///
108    /// This includes the `HashMap` overhead and the string data.
109    #[must_use]
110    pub fn memory_usage(&self) -> usize {
111        let mut total = std::mem::size_of::<Self>();
112
113        // HashMap overhead (approximate)
114        total += self.strings.capacity() * std::mem::size_of::<(Arc<str>, ())>();
115
116        // String data
117        for (s, ()) in &self.strings {
118            total += std::mem::size_of::<Arc<str>>() + s.len();
119        }
120
121        total
122    }
123
124    /// Returns statistics about the string pool.
125    #[must_use]
126    pub fn stats(&self) -> StringPoolStats {
127        let mut total_string_bytes = 0;
128        let mut min_len = usize::MAX;
129        let mut max_len = 0;
130
131        for (s, ()) in &self.strings {
132            let len = s.len();
133            total_string_bytes += len;
134            min_len = min_len.min(len);
135            max_len = max_len.max(len);
136        }
137
138        if self.strings.is_empty() {
139            min_len = 0;
140        }
141
142        StringPoolStats {
143            count: self.strings.len(),
144            total_bytes: total_string_bytes,
145            avg_length: if self.strings.is_empty() {
146                0.0
147            } else {
148                total_string_bytes as f64 / self.strings.len() as f64
149            },
150            min_length: min_len,
151            max_length: max_len,
152            memory_usage: self.memory_usage(),
153        }
154    }
155}
156
157/// Statistics about a string pool.
158#[derive(Debug, Clone, Copy, PartialEq)]
159pub struct StringPoolStats {
160    /// Number of unique strings.
161    pub count: usize,
162    /// Total bytes of string data.
163    pub total_bytes: usize,
164    /// Average string length.
165    pub avg_length: f64,
166    /// Minimum string length.
167    pub min_length: usize,
168    /// Maximum string length.
169    pub max_length: usize,
170    /// Total memory usage (approximate).
171    pub memory_usage: usize,
172}
173
174/// Thread-safe string pool using a concurrent hashmap.
175///
176/// This version uses a `parking_lot` `RwLock` for thread-safe access.
177/// Suitable for multi-threaded dictionary building.
178#[derive(Debug, Default)]
179pub struct ConcurrentStringPool {
180    inner: std::sync::RwLock<StringPool>,
181}
182
183impl ConcurrentStringPool {
184    /// Creates a new empty concurrent string pool.
185    #[must_use]
186    pub fn new() -> Self {
187        Self {
188            inner: std::sync::RwLock::new(StringPool::new()),
189        }
190    }
191
192    /// Creates a concurrent string pool with the specified capacity.
193    #[must_use]
194    pub fn with_capacity(capacity: usize) -> Self {
195        Self {
196            inner: std::sync::RwLock::new(StringPool::with_capacity(capacity)),
197        }
198    }
199
200    /// Interns a string, returning a reference-counted handle.
201    pub fn intern(&self, s: &str) -> Arc<str> {
202        // First try read lock to check if exists
203        {
204            let pool = self.inner.read().unwrap();
205            if let Some((existing, ())) = pool.strings.get_key_value(s) {
206                return Arc::clone(existing);
207            }
208        }
209
210        // Need write lock to insert
211        let mut pool = self.inner.write().unwrap();
212        pool.intern(s)
213    }
214
215    /// Returns the number of unique strings in the pool.
216    #[must_use]
217    pub fn len(&self) -> usize {
218        self.inner.read().unwrap().len()
219    }
220
221    /// Returns true if the pool is empty.
222    #[must_use]
223    pub fn is_empty(&self) -> bool {
224        self.inner.read().unwrap().is_empty()
225    }
226
227    /// Returns statistics about the string pool.
228    #[must_use]
229    pub fn stats(&self) -> StringPoolStats {
230        self.inner.read().unwrap().stats()
231    }
232}
233
234#[cfg(test)]
235mod tests {
236    use super::*;
237
238    #[test]
239    fn test_string_pool_basic() {
240        let mut pool = StringPool::new();
241
242        let s1 = pool.intern("hello");
243        let s2 = pool.intern("hello");
244        let s3 = pool.intern("world");
245
246        // Same strings should return the same Arc
247        assert!(Arc::ptr_eq(&s1, &s2));
248
249        // Different strings should be different
250        assert!(!Arc::ptr_eq(&s1, &s3));
251
252        // Pool should have 2 unique strings
253        assert_eq!(pool.len(), 2);
254    }
255
256    #[test]
257    fn test_string_pool_korean() {
258        let mut pool = StringPool::new();
259
260        let s1 = pool.intern("안녕하세요");
261        let s2 = pool.intern("안녕하세요");
262        let s3 = pool.intern("감사합니다");
263
264        assert!(Arc::ptr_eq(&s1, &s2));
265        assert!(!Arc::ptr_eq(&s1, &s3));
266        assert_eq!(pool.len(), 2);
267    }
268
269    #[test]
270    fn test_string_pool_stats() {
271        let mut pool = StringPool::new();
272
273        pool.intern("a");
274        pool.intern("bb");
275        pool.intern("ccc");
276
277        let stats = pool.stats();
278        assert_eq!(stats.count, 3);
279        assert_eq!(stats.total_bytes, 6); // 1 + 2 + 3
280        assert!((stats.avg_length - 2.0).abs() < f64::EPSILON);
281        assert_eq!(stats.min_length, 1);
282        assert_eq!(stats.max_length, 3);
283    }
284
285    #[test]
286    fn test_string_pool_intern_string() {
287        let mut pool = StringPool::new();
288
289        let owned = String::from("test");
290        let s1 = pool.intern_string(owned);
291        let s2 = pool.intern("test");
292
293        assert!(Arc::ptr_eq(&s1, &s2));
294        assert_eq!(pool.len(), 1);
295    }
296
297    #[test]
298    fn test_concurrent_string_pool() {
299        let pool = ConcurrentStringPool::new();
300
301        let s1 = pool.intern("test");
302        let s2 = pool.intern("test");
303
304        assert!(Arc::ptr_eq(&s1, &s2));
305        assert_eq!(pool.len(), 1);
306    }
307
308    #[test]
309    fn test_string_pool_memory_usage() {
310        let mut pool = StringPool::new();
311
312        let initial = pool.memory_usage();
313
314        pool.intern("a short string");
315        pool.intern("another string");
316
317        let after = pool.memory_usage();
318
319        // Memory usage should increase
320        assert!(after > initial);
321    }
322
323    #[test]
324    fn test_string_pool_clear() {
325        let mut pool = StringPool::new();
326
327        pool.intern("test1");
328        pool.intern("test2");
329        assert_eq!(pool.len(), 2);
330
331        pool.clear();
332        assert!(pool.is_empty());
333    }
334}