1use ahash::RandomState;
2use std::fmt;
3
4const EMPTY: u8 = 0xFF;
6#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
9#[repr(transparent)]
10pub struct StrId(u32);
11
12impl StrId {
13 pub const EMPTY: StrId = StrId(u32::MAX);
14
15 #[inline]
16 pub const fn is_empty(self) -> bool {
17 self.0 == u32::MAX
18 }
19
20 #[inline]
21 pub const fn as_u32(self) -> u32 {
22 self.0
23 }
24}
25
26impl fmt::Display for StrId {
27 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
28 if self.is_empty() {
29 write!(f, "<empty>")
30 } else {
31 write!(f, "StrId({})", self.0)
32 }
33 }
34}
35
36#[inline(always)]
39const fn h2(hash: u64) -> u8 {
40 (hash & 0x7F) as u8
41}
42
43#[inline(always)]
45const fn h1(hash: u64, mask: usize) -> usize {
46 ((hash >> 7) as usize) & mask
47}
48
49#[derive(Clone)]
50pub struct StringInterner {
51 arena: String,
52 offsets: Vec<u32>,
53 ctrl: Vec<u8>,
55 table_hashes: Vec<u64>,
56 table_ids: Vec<StrId>,
57 table_mask: usize,
58 count: usize,
59 hasher: RandomState,
60}
61
62impl StringInterner {
63 pub fn new() -> Self {
64 const CAP: usize = 256;
65 Self {
66 arena: String::with_capacity(8192),
67 offsets: vec![0],
68 ctrl: vec![EMPTY; CAP],
69 table_hashes: vec![0; CAP],
70 table_ids: vec![StrId::EMPTY; CAP],
71 table_mask: CAP - 1,
72 count: 0,
73 hasher: RandomState::new(),
74 }
75 }
76
77 pub fn with_capacity(string_capacity: usize, estimated_strings: usize) -> Self {
78 let cap = estimated_strings.next_power_of_two().max(64);
79 Self {
80 arena: String::with_capacity(string_capacity),
81 offsets: vec![0],
82 ctrl: vec![EMPTY; cap],
83 table_hashes: vec![0; cap],
84 table_ids: vec![StrId::EMPTY; cap],
85 table_mask: cap - 1,
86 count: 0,
87 hasher: RandomState::new(),
88 }
89 }
90
91 #[inline]
92 pub fn intern(&mut self, s: &str) -> StrId {
93 if s.is_empty() {
94 return StrId::EMPTY;
95 }
96 let hash = self.hasher.hash_one(s);
97 let stamp = h2(hash);
98 let mask = self.table_mask;
99 let mut idx = h1(hash, mask);
100
101 loop {
102 let c = &self.ctrl[idx];
103 if *c & 0x80 != 0 {
104 let id = self.offsets.len() as u32 - 1;
106 self.arena.push_str(s);
107 self.offsets.push(self.arena.len() as u32);
108 self.ctrl[idx] = stamp;
109 self.table_hashes[idx] = hash;
110 self.table_ids[idx] = StrId(id);
111 self.count += 1;
112 if self.count * 4 > self.ctrl.len() * 3 {
113 self.grow();
114 }
115 return StrId(id);
116 }
117 if *c == stamp && self.table_hashes[idx] == hash {
118 let existing = self.table_ids[idx].0;
119 let start = self.offsets[existing as usize] as usize;
120 let end = self.offsets[existing as usize + 1] as usize;
121 let existing_str = unsafe { self.arena.get_unchecked(start..end) };
122 if existing_str == s {
123 return StrId(existing);
124 }
125 }
126 idx = (idx + 1) & mask;
127 }
128 }
129
130 #[inline]
133 pub fn get_optional(&self, s: &str) -> Option<StrId> {
134 if s.is_empty() {
135 return Some(StrId::EMPTY);
136 }
137 let hash = self.hasher.hash_one(s);
138 let stamp = h2(hash);
139 let mask = self.table_mask;
140 let mut idx = h1(hash, mask);
141
142 for _ in 0..self.ctrl.len() {
143 let c = self.ctrl[idx];
144 if c & 0x80 != 0 {
145 return None;
146 }
147 if c == stamp && self.table_hashes[idx] == hash {
148 let existing = self.table_ids[idx].0;
149 let start = self.offsets[existing as usize] as usize;
150 let end = self.offsets[existing as usize + 1] as usize;
151 if unsafe { self.arena.get_unchecked(start..end) == s } {
152 return Some(StrId(existing));
153 }
154 }
155 idx = (idx + 1) & mask;
156 }
157 None
158 }
159
160 #[inline]
161 pub fn lookup(&self, id: StrId) -> &str {
162 if id.is_empty() {
163 return "";
164 }
165 let start = self.offsets[id.0 as usize] as usize;
166 let end = self.offsets[id.0 as usize + 1] as usize;
167 unsafe { self.arena.get_unchecked(start..end) }
168 }
169
170 #[inline]
175 pub fn get_hash(&self, id: StrId) -> u64 {
176 if id.is_empty() {
177 return 0;
178 }
179 self.hasher.hash_one(self.lookup(id))
180 }
181
182 pub const fn len(&self) -> usize {
183 self.offsets.len() - 1
184 }
185
186 pub const fn is_empty(&self) -> bool {
187 self.len() == 0
188 }
189
190 pub const fn total_bytes(&self) -> usize {
191 self.arena.len()
192 }
193
194 fn grow(&mut self) {
195 let new_size = self.ctrl.len() * 2;
196 let new_mask = new_size - 1;
197 let mut new_ctrl = vec![EMPTY; new_size];
198 let mut new_hashes = vec![0u64; new_size];
199 let mut new_ids = vec![StrId::EMPTY; new_size];
200
201 for i in 0..self.ctrl.len() {
202 if self.ctrl[i] & 0x80 == 0 {
203 let hash = self.table_hashes[i];
204 let stamp = h2(hash);
205 let mut idx = h1(hash, new_mask);
206 while new_ctrl[idx] & 0x80 == 0 {
207 idx = (idx + 1) & new_mask;
208 }
209 new_ctrl[idx] = stamp;
210 new_hashes[idx] = hash;
211 new_ids[idx] = self.table_ids[i];
212 }
213 }
214
215 self.ctrl = new_ctrl;
216 self.table_hashes = new_hashes;
217 self.table_ids = new_ids;
218 self.table_mask = new_mask;
219 }
220}
221
222impl Default for StringInterner {
223 fn default() -> Self {
224 Self::new()
225 }
226}
227
228#[cfg(test)]
229mod tests {
230 use super::*;
231
232 #[test]
233 fn test_intern_empty() {
234 let mut interner = StringInterner::new();
235 assert!(interner.intern("").is_empty());
236 }
237
238 #[test]
239 fn test_intern_dedup() {
240 let mut interner = StringInterner::new();
241 let a = interner.intern("hello");
242 let b = interner.intern("hello");
243 assert_eq!(a, b);
244 }
245
246 #[test]
247 fn test_intern_unique() {
248 let mut interner = StringInterner::new();
249 let a = interner.intern("hello");
250 let b = interner.intern("world");
251 assert_ne!(a, b);
252 }
253
254 #[test]
255 fn test_lookup() {
256 let mut interner = StringInterner::new();
257 let id = interner.intern("hello world");
258 assert_eq!(interner.lookup(id), "hello world");
259 }
260
261 #[test]
262 fn test_large_intern() {
263 let mut interner = StringInterner::new();
264 let mut ids = Vec::new();
265 for i in 0..1000 {
266 let s = format!("string_{i}");
267 ids.push(interner.intern(&s));
268 }
269 for (i, &id) in ids.iter().enumerate() {
270 let expected = format!("string_{i}");
271 assert_eq!(interner.lookup(id), expected);
272 }
273 assert_eq!(interner.len(), 1000);
274 }
275
276 #[test]
277 fn test_lookup_empty_id() {
278 let interner = StringInterner::new();
279 assert_eq!(interner.lookup(StrId::EMPTY), "");
280 }
281
282 #[test]
283 fn test_get_hash_empty_id() {
284 let interner = StringInterner::new();
285 assert_eq!(interner.get_hash(StrId::EMPTY), 0);
286 }
287
288 #[test]
289 fn test_get_hash_consistency() {
290 let mut interner = StringInterner::new();
291 let id = interner.intern("consistent");
292 let hash1 = interner.get_hash(id);
293 let id2 = interner.intern("consistent");
294 assert_eq!(id, id2);
295 let hash2 = interner.get_hash(id2);
296 assert_eq!(hash1, hash2);
297 }
298
299 #[test]
300 fn test_total_bytes() {
301 let mut interner = StringInterner::new();
302 assert_eq!(interner.total_bytes(), 0);
303 interner.intern("abc");
304 assert_eq!(interner.total_bytes(), 3);
305 interner.intern("defg");
306 assert_eq!(interner.total_bytes(), 7);
307 interner.intern("abc"); assert_eq!(interner.total_bytes(), 7);
309 }
310
311 #[test]
312 fn test_intern_empty_via_lookup() {
313 let mut interner = StringInterner::new();
314 let e = interner.intern("");
315 assert!(e.is_empty());
316 let e2 = interner.intern("");
318 assert!(e2.is_empty());
319 }
320
321 #[test]
322 fn test_grow_triggers() {
323 let mut interner = StringInterner::with_capacity(4096, 16);
324 for i in 0..100 {
326 interner.intern(&format!("grow_test_{i}"));
327 }
328 assert_eq!(interner.len(), 100);
329 for i in 0..100 {
331 let id = interner.intern(&format!("grow_test_{i}"));
332 assert_eq!(interner.lookup(id), format!("grow_test_{i}"));
333 }
334 }
335
336 #[test]
337 fn test_many_dedup_same_string() {
338 let mut interner = StringInterner::new();
339 let id = interner.intern("same");
340 for _ in 0..1000 {
341 let new_id = interner.intern("same");
342 assert_eq!(new_id, id);
343 }
344 assert_eq!(interner.len(), 1);
345 }
346
347 #[test]
348 fn test_interner_with_capacity() {
349 let mut interner = StringInterner::with_capacity(1024, 50);
350 assert_eq!(interner.len(), 0);
351 for i in 0..50 {
352 interner.intern(&format!("cap_test_{i}"));
353 }
354 assert_eq!(interner.len(), 50);
355 }
356
357 #[test]
358 fn test_case_sensitive_dedup() {
359 let mut interner = StringInterner::new();
360 let a = interner.intern("Hello");
361 let b = interner.intern("hello");
362 assert_ne!(a, b); }
364
365 #[test]
366 fn test_default_impl() {
367 let mut interner: StringInterner = Default::default();
368 let id = interner.intern("default");
369 assert_eq!(interner.lookup(id), "default");
370 }
371
372 #[test]
373 fn test_is_empty_method() {
374 let mut interner = StringInterner::new();
375 assert!(interner.is_empty());
376 interner.intern("x");
377 assert!(!interner.is_empty());
378 }
379}