Skip to main content

mcp_memory/
intern.rs

1use ahash::RandomState;
2use std::fmt;
3
4// Ctrl-byte sentinels for the dedup hash table.
5const EMPTY: u8 = 0xFF;
6// Stored h2 values are 0x00-0x7F — never collide with 0xFF.
7
8#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
9#[repr(transparent)]
10pub struct StrId(u32);
11
12impl StrId {
13    pub const EMPTY: StrId = StrId(u32::MAX);
14
15    #[inline]
16    pub const fn is_empty(self) -> bool {
17        self.0 == u32::MAX
18    }
19
20    #[inline]
21    pub const fn as_u32(self) -> u32 {
22        self.0
23    }
24}
25
26impl fmt::Display for StrId {
27    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
28        if self.is_empty() {
29            write!(f, "<empty>")
30        } else {
31            write!(f, "StrId({})", self.0)
32        }
33    }
34}
35
36/// A 7-bit hash stamp extracted from the full 64-bit hash.
37/// Always has bit 7 clear, so it never collides with `EMPTY` (0xFF).
38#[inline(always)]
39const fn h2(hash: u64) -> u8 {
40    (hash & 0x7F) as u8
41}
42
43/// Starting bucket index derived from the upper bits of the hash.
44#[inline(always)]
45const fn h1(hash: u64, mask: usize) -> usize {
46    ((hash >> 7) as usize) & mask
47}
48
49#[derive(Clone)]
50pub struct StringInterner {
51    arena: String,
52    offsets: Vec<u32>,
53    // Dedup hash table — ctrl-byte buckets with parallel arrays.
54    ctrl: Vec<u8>,
55    table_hashes: Vec<u64>,
56    table_ids: Vec<StrId>,
57    table_mask: usize,
58    count: usize,
59    hasher: RandomState,
60}
61
62impl StringInterner {
63    pub fn new() -> Self {
64        const CAP: usize = 256;
65        Self {
66            arena: String::with_capacity(8192),
67            offsets: vec![0],
68            ctrl: vec![EMPTY; CAP],
69            table_hashes: vec![0; CAP],
70            table_ids: vec![StrId::EMPTY; CAP],
71            table_mask: CAP - 1,
72            count: 0,
73            hasher: RandomState::new(),
74        }
75    }
76
77    pub fn with_capacity(string_capacity: usize, estimated_strings: usize) -> Self {
78        let cap = estimated_strings.next_power_of_two().max(64);
79        Self {
80            arena: String::with_capacity(string_capacity),
81            offsets: vec![0],
82            ctrl: vec![EMPTY; cap],
83            table_hashes: vec![0; cap],
84            table_ids: vec![StrId::EMPTY; cap],
85            table_mask: cap - 1,
86            count: 0,
87            hasher: RandomState::new(),
88        }
89    }
90
91    #[inline]
92    pub fn intern(&mut self, s: &str) -> StrId {
93        if s.is_empty() {
94            return StrId::EMPTY;
95        }
96        let hash = self.hasher.hash_one(s);
97        let stamp = h2(hash);
98        let mask = self.table_mask;
99        let mut idx = h1(hash, mask);
100
101        loop {
102            let c = &self.ctrl[idx];
103            if *c & 0x80 != 0 {
104                // Empty slot → insert new string.
105                let id = self.offsets.len() as u32 - 1;
106                self.arena.push_str(s);
107                self.offsets.push(self.arena.len() as u32);
108                self.ctrl[idx] = stamp;
109                self.table_hashes[idx] = hash;
110                self.table_ids[idx] = StrId(id);
111                self.count += 1;
112                if self.count * 4 > self.ctrl.len() * 3 {
113                    self.grow();
114                }
115                return StrId(id);
116            }
117            if *c == stamp && self.table_hashes[idx] == hash {
118                let existing = self.table_ids[idx].0;
119                let start = self.offsets[existing as usize] as usize;
120                let end = self.offsets[existing as usize + 1] as usize;
121                let existing_str = unsafe { self.arena.get_unchecked(start..end) };
122                if existing_str == s {
123                    return StrId(existing);
124                }
125            }
126            idx = (idx + 1) & mask;
127        }
128    }
129
130    /// Look up a string in the dedup table without inserting.
131    /// Returns `StrId` if the string already exists, `None` otherwise.
132    #[inline]
133    pub fn get_optional(&self, s: &str) -> Option<StrId> {
134        if s.is_empty() {
135            return Some(StrId::EMPTY);
136        }
137        let hash = self.hasher.hash_one(s);
138        let stamp = h2(hash);
139        let mask = self.table_mask;
140        let mut idx = h1(hash, mask);
141
142        for _ in 0..self.ctrl.len() {
143            let c = self.ctrl[idx];
144            if c & 0x80 != 0 {
145                return None;
146            }
147            if c == stamp && self.table_hashes[idx] == hash {
148                let existing = self.table_ids[idx].0;
149                let start = self.offsets[existing as usize] as usize;
150                let end = self.offsets[existing as usize + 1] as usize;
151                if unsafe { self.arena.get_unchecked(start..end) == s } {
152                    return Some(StrId(existing));
153                }
154            }
155            idx = (idx + 1) & mask;
156        }
157        None
158    }
159
160    #[inline]
161    pub fn lookup(&self, id: StrId) -> &str {
162        if id.is_empty() {
163            return "";
164        }
165        let start = self.offsets[id.0 as usize] as usize;
166        let end = self.offsets[id.0 as usize + 1] as usize;
167        unsafe { self.arena.get_unchecked(start..end) }
168    }
169
170    /// Recompute the hash of an interned string on demand. The per-string hash
171    /// is no longer stored (M4) — it is cheaply re-derived from the interned
172    /// bytes with the same hasher, saving 8 bytes per unique string. Used to
173    /// key the (separate) entity name table.
174    #[inline]
175    pub fn get_hash(&self, id: StrId) -> u64 {
176        if id.is_empty() {
177            return 0;
178        }
179        self.hasher.hash_one(self.lookup(id))
180    }
181
182    pub const fn len(&self) -> usize {
183        self.offsets.len() - 1
184    }
185
186    pub const fn is_empty(&self) -> bool {
187        self.len() == 0
188    }
189
190    pub const fn total_bytes(&self) -> usize {
191        self.arena.len()
192    }
193
194    fn grow(&mut self) {
195        let new_size = self.ctrl.len() * 2;
196        let new_mask = new_size - 1;
197        let mut new_ctrl = vec![EMPTY; new_size];
198        let mut new_hashes = vec![0u64; new_size];
199        let mut new_ids = vec![StrId::EMPTY; new_size];
200
201        for i in 0..self.ctrl.len() {
202            if self.ctrl[i] & 0x80 == 0 {
203                let hash = self.table_hashes[i];
204                let stamp = h2(hash);
205                let mut idx = h1(hash, new_mask);
206                while new_ctrl[idx] & 0x80 == 0 {
207                    idx = (idx + 1) & new_mask;
208                }
209                new_ctrl[idx] = stamp;
210                new_hashes[idx] = hash;
211                new_ids[idx] = self.table_ids[i];
212            }
213        }
214
215        self.ctrl = new_ctrl;
216        self.table_hashes = new_hashes;
217        self.table_ids = new_ids;
218        self.table_mask = new_mask;
219    }
220}
221
222impl Default for StringInterner {
223    fn default() -> Self {
224        Self::new()
225    }
226}
227
228#[cfg(test)]
229mod tests {
230    use super::*;
231
232    #[test]
233    fn test_intern_empty() {
234        let mut interner = StringInterner::new();
235        assert!(interner.intern("").is_empty());
236    }
237
238    #[test]
239    fn test_intern_dedup() {
240        let mut interner = StringInterner::new();
241        let a = interner.intern("hello");
242        let b = interner.intern("hello");
243        assert_eq!(a, b);
244    }
245
246    #[test]
247    fn test_intern_unique() {
248        let mut interner = StringInterner::new();
249        let a = interner.intern("hello");
250        let b = interner.intern("world");
251        assert_ne!(a, b);
252    }
253
254    #[test]
255    fn test_lookup() {
256        let mut interner = StringInterner::new();
257        let id = interner.intern("hello world");
258        assert_eq!(interner.lookup(id), "hello world");
259    }
260
261    #[test]
262    fn test_large_intern() {
263        let mut interner = StringInterner::new();
264        let mut ids = Vec::new();
265        for i in 0..1000 {
266            let s = format!("string_{i}");
267            ids.push(interner.intern(&s));
268        }
269        for (i, &id) in ids.iter().enumerate() {
270            let expected = format!("string_{i}");
271            assert_eq!(interner.lookup(id), expected);
272        }
273        assert_eq!(interner.len(), 1000);
274    }
275
276    #[test]
277    fn test_lookup_empty_id() {
278        let interner = StringInterner::new();
279        assert_eq!(interner.lookup(StrId::EMPTY), "");
280    }
281
282    #[test]
283    fn test_get_hash_empty_id() {
284        let interner = StringInterner::new();
285        assert_eq!(interner.get_hash(StrId::EMPTY), 0);
286    }
287
288    #[test]
289    fn test_get_hash_consistency() {
290        let mut interner = StringInterner::new();
291        let id = interner.intern("consistent");
292        let hash1 = interner.get_hash(id);
293        let id2 = interner.intern("consistent");
294        assert_eq!(id, id2);
295        let hash2 = interner.get_hash(id2);
296        assert_eq!(hash1, hash2);
297    }
298
299    #[test]
300    fn test_total_bytes() {
301        let mut interner = StringInterner::new();
302        assert_eq!(interner.total_bytes(), 0);
303        interner.intern("abc");
304        assert_eq!(interner.total_bytes(), 3);
305        interner.intern("defg");
306        assert_eq!(interner.total_bytes(), 7);
307        interner.intern("abc"); // dedup, no new bytes
308        assert_eq!(interner.total_bytes(), 7);
309    }
310
311    #[test]
312    fn test_intern_empty_via_lookup() {
313        let mut interner = StringInterner::new();
314        let e = interner.intern("");
315        assert!(e.is_empty());
316        // Interning empty again should still return EMPTY.
317        let e2 = interner.intern("");
318        assert!(e2.is_empty());
319    }
320
321    #[test]
322    fn test_grow_triggers() {
323        let mut interner = StringInterner::with_capacity(4096, 16);
324        // Insert enough strings to force a grow.
325        for i in 0..100 {
326            interner.intern(&format!("grow_test_{i}"));
327        }
328        assert_eq!(interner.len(), 100);
329        // Verify all strings are still accessible.
330        for i in 0..100 {
331            let id = interner.intern(&format!("grow_test_{i}"));
332            assert_eq!(interner.lookup(id), format!("grow_test_{i}"));
333        }
334    }
335
336    #[test]
337    fn test_many_dedup_same_string() {
338        let mut interner = StringInterner::new();
339        let id = interner.intern("same");
340        for _ in 0..1000 {
341            let new_id = interner.intern("same");
342            assert_eq!(new_id, id);
343        }
344        assert_eq!(interner.len(), 1);
345    }
346
347    #[test]
348    fn test_interner_with_capacity() {
349        let mut interner = StringInterner::with_capacity(1024, 50);
350        assert_eq!(interner.len(), 0);
351        for i in 0..50 {
352            interner.intern(&format!("cap_test_{i}"));
353        }
354        assert_eq!(interner.len(), 50);
355    }
356
357    #[test]
358    fn test_case_sensitive_dedup() {
359        let mut interner = StringInterner::new();
360        let a = interner.intern("Hello");
361        let b = interner.intern("hello");
362        assert_ne!(a, b); // case-sensitive dedup
363    }
364
365    #[test]
366    fn test_default_impl() {
367        let mut interner: StringInterner = Default::default();
368        let id = interner.intern("default");
369        assert_eq!(interner.lookup(id), "default");
370    }
371
372    #[test]
373    fn test_is_empty_method() {
374        let mut interner = StringInterner::new();
375        assert!(interner.is_empty());
376        interner.intern("x");
377        assert!(!interner.is_empty());
378    }
379}