1use ahash::RandomState;
2use std::fmt;
3
4const EMPTY: u8 = 0xFF;
6#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
9#[repr(transparent)]
10pub struct StrId(u32);
11
12impl StrId {
13 pub const EMPTY: StrId = StrId(u32::MAX);
14
15 #[inline]
16 pub const fn is_empty(self) -> bool {
17 self.0 == u32::MAX
18 }
19
20 #[inline]
21 pub const fn as_u32(self) -> u32 {
22 self.0
23 }
24}
25
26impl fmt::Display for StrId {
27 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
28 if self.is_empty() {
29 write!(f, "<empty>")
30 } else {
31 write!(f, "StrId({})", self.0)
32 }
33 }
34}
35
36#[inline(always)]
39const fn h2(hash: u64) -> u8 {
40 (hash & 0x7F) as u8
41}
42
43#[inline(always)]
45const fn h1(hash: u64, mask: usize) -> usize {
46 ((hash >> 7) as usize) & mask
47}
48
49pub struct StringInterner {
50 arena: String,
51 offsets: Vec<u32>,
52 ctrl: Vec<u8>,
54 table_hashes: Vec<u64>,
55 table_ids: Vec<StrId>,
56 table_mask: usize,
57 count: usize,
58 hasher: RandomState,
59}
60
61impl StringInterner {
62 pub fn new() -> Self {
63 const CAP: usize = 256;
64 Self {
65 arena: String::with_capacity(8192),
66 offsets: vec![0],
67 ctrl: vec![EMPTY; CAP],
68 table_hashes: vec![0; CAP],
69 table_ids: vec![StrId::EMPTY; CAP],
70 table_mask: CAP - 1,
71 count: 0,
72 hasher: RandomState::new(),
73 }
74 }
75
76 pub fn with_capacity(string_capacity: usize, estimated_strings: usize) -> Self {
77 let cap = estimated_strings.next_power_of_two().max(64);
78 Self {
79 arena: String::with_capacity(string_capacity),
80 offsets: vec![0],
81 ctrl: vec![EMPTY; cap],
82 table_hashes: vec![0; cap],
83 table_ids: vec![StrId::EMPTY; cap],
84 table_mask: cap - 1,
85 count: 0,
86 hasher: RandomState::new(),
87 }
88 }
89
90 #[inline]
91 pub fn intern(&mut self, s: &str) -> StrId {
92 if s.is_empty() {
93 return StrId::EMPTY;
94 }
95 let hash = self.hasher.hash_one(s);
96 let stamp = h2(hash);
97 let mask = self.table_mask;
98 let mut idx = h1(hash, mask);
99
100 loop {
101 let c = &self.ctrl[idx];
102 if *c & 0x80 != 0 {
103 let id = self.offsets.len() as u32 - 1;
105 self.arena.push_str(s);
106 self.offsets.push(self.arena.len() as u32);
107 self.ctrl[idx] = stamp;
108 self.table_hashes[idx] = hash;
109 self.table_ids[idx] = StrId(id);
110 self.count += 1;
111 if self.count * 4 > self.ctrl.len() * 3 {
112 self.grow();
113 }
114 return StrId(id);
115 }
116 if *c == stamp && self.table_hashes[idx] == hash {
117 let existing = self.table_ids[idx].0;
118 let start = self.offsets[existing as usize] as usize;
119 let end = self.offsets[existing as usize + 1] as usize;
120 let existing_str = unsafe { self.arena.get_unchecked(start..end) };
121 if existing_str == s {
122 return StrId(existing);
123 }
124 }
125 idx = (idx + 1) & mask;
126 }
127 }
128
129 #[inline]
132 pub fn get_optional(&self, s: &str) -> Option<StrId> {
133 if s.is_empty() {
134 return Some(StrId::EMPTY);
135 }
136 let hash = self.hasher.hash_one(s);
137 let stamp = h2(hash);
138 let mask = self.table_mask;
139 let mut idx = h1(hash, mask);
140
141 for _ in 0..self.ctrl.len() {
142 let c = self.ctrl[idx];
143 if c & 0x80 != 0 {
144 return None;
145 }
146 if c == stamp && self.table_hashes[idx] == hash {
147 let existing = self.table_ids[idx].0;
148 let start = self.offsets[existing as usize] as usize;
149 let end = self.offsets[existing as usize + 1] as usize;
150 if unsafe { self.arena.get_unchecked(start..end) == s } {
151 return Some(StrId(existing));
152 }
153 }
154 idx = (idx + 1) & mask;
155 }
156 None
157 }
158
159 #[inline]
160 pub fn lookup(&self, id: StrId) -> &str {
161 if id.is_empty() {
162 return "";
163 }
164 let start = self.offsets[id.0 as usize] as usize;
165 let end = self.offsets[id.0 as usize + 1] as usize;
166 unsafe { self.arena.get_unchecked(start..end) }
167 }
168
169 #[inline]
174 pub fn get_hash(&self, id: StrId) -> u64 {
175 if id.is_empty() {
176 return 0;
177 }
178 self.hasher.hash_one(self.lookup(id))
179 }
180
181 pub const fn len(&self) -> usize {
182 self.offsets.len() - 1
183 }
184
185 pub const fn is_empty(&self) -> bool {
186 self.len() == 0
187 }
188
189 pub const fn total_bytes(&self) -> usize {
190 self.arena.len()
191 }
192
193 fn grow(&mut self) {
194 let new_size = self.ctrl.len() * 2;
195 let new_mask = new_size - 1;
196 let mut new_ctrl = vec![EMPTY; new_size];
197 let mut new_hashes = vec![0u64; new_size];
198 let mut new_ids = vec![StrId::EMPTY; new_size];
199
200 for i in 0..self.ctrl.len() {
201 if self.ctrl[i] & 0x80 == 0 {
202 let hash = self.table_hashes[i];
203 let stamp = h2(hash);
204 let mut idx = h1(hash, new_mask);
205 while new_ctrl[idx] & 0x80 == 0 {
206 idx = (idx + 1) & new_mask;
207 }
208 new_ctrl[idx] = stamp;
209 new_hashes[idx] = hash;
210 new_ids[idx] = self.table_ids[i];
211 }
212 }
213
214 self.ctrl = new_ctrl;
215 self.table_hashes = new_hashes;
216 self.table_ids = new_ids;
217 self.table_mask = new_mask;
218 }
219}
220
221impl Default for StringInterner {
222 fn default() -> Self {
223 Self::new()
224 }
225}
226
227#[cfg(test)]
228mod tests {
229 use super::*;
230
231 #[test]
232 fn test_intern_empty() {
233 let mut interner = StringInterner::new();
234 assert!(interner.intern("").is_empty());
235 }
236
237 #[test]
238 fn test_intern_dedup() {
239 let mut interner = StringInterner::new();
240 let a = interner.intern("hello");
241 let b = interner.intern("hello");
242 assert_eq!(a, b);
243 }
244
245 #[test]
246 fn test_intern_unique() {
247 let mut interner = StringInterner::new();
248 let a = interner.intern("hello");
249 let b = interner.intern("world");
250 assert_ne!(a, b);
251 }
252
253 #[test]
254 fn test_lookup() {
255 let mut interner = StringInterner::new();
256 let id = interner.intern("hello world");
257 assert_eq!(interner.lookup(id), "hello world");
258 }
259
260 #[test]
261 fn test_large_intern() {
262 let mut interner = StringInterner::new();
263 let mut ids = Vec::new();
264 for i in 0..1000 {
265 let s = format!("string_{i}");
266 ids.push(interner.intern(&s));
267 }
268 for (i, &id) in ids.iter().enumerate() {
269 let expected = format!("string_{i}");
270 assert_eq!(interner.lookup(id), expected);
271 }
272 assert_eq!(interner.len(), 1000);
273 }
274
275 #[test]
276 fn test_lookup_empty_id() {
277 let interner = StringInterner::new();
278 assert_eq!(interner.lookup(StrId::EMPTY), "");
279 }
280
281 #[test]
282 fn test_get_hash_empty_id() {
283 let interner = StringInterner::new();
284 assert_eq!(interner.get_hash(StrId::EMPTY), 0);
285 }
286
287 #[test]
288 fn test_get_hash_consistency() {
289 let mut interner = StringInterner::new();
290 let id = interner.intern("consistent");
291 let hash1 = interner.get_hash(id);
292 let id2 = interner.intern("consistent");
293 assert_eq!(id, id2);
294 let hash2 = interner.get_hash(id2);
295 assert_eq!(hash1, hash2);
296 }
297
298 #[test]
299 fn test_total_bytes() {
300 let mut interner = StringInterner::new();
301 assert_eq!(interner.total_bytes(), 0);
302 interner.intern("abc");
303 assert_eq!(interner.total_bytes(), 3);
304 interner.intern("defg");
305 assert_eq!(interner.total_bytes(), 7);
306 interner.intern("abc"); assert_eq!(interner.total_bytes(), 7);
308 }
309
310 #[test]
311 fn test_intern_empty_via_lookup() {
312 let mut interner = StringInterner::new();
313 let e = interner.intern("");
314 assert!(e.is_empty());
315 let e2 = interner.intern("");
317 assert!(e2.is_empty());
318 }
319
320 #[test]
321 fn test_grow_triggers() {
322 let mut interner = StringInterner::with_capacity(4096, 16);
323 for i in 0..100 {
325 interner.intern(&format!("grow_test_{i}"));
326 }
327 assert_eq!(interner.len(), 100);
328 for i in 0..100 {
330 let id = interner.intern(&format!("grow_test_{i}"));
331 assert_eq!(interner.lookup(id), format!("grow_test_{i}"));
332 }
333 }
334
335 #[test]
336 fn test_many_dedup_same_string() {
337 let mut interner = StringInterner::new();
338 let id = interner.intern("same");
339 for _ in 0..1000 {
340 let new_id = interner.intern("same");
341 assert_eq!(new_id, id);
342 }
343 assert_eq!(interner.len(), 1);
344 }
345
346 #[test]
347 fn test_interner_with_capacity() {
348 let mut interner = StringInterner::with_capacity(1024, 50);
349 assert_eq!(interner.len(), 0);
350 for i in 0..50 {
351 interner.intern(&format!("cap_test_{i}"));
352 }
353 assert_eq!(interner.len(), 50);
354 }
355
356 #[test]
357 fn test_case_sensitive_dedup() {
358 let mut interner = StringInterner::new();
359 let a = interner.intern("Hello");
360 let b = interner.intern("hello");
361 assert_ne!(a, b); }
363
364 #[test]
365 fn test_default_impl() {
366 let mut interner: StringInterner = Default::default();
367 let id = interner.intern("default");
368 assert_eq!(interner.lookup(id), "default");
369 }
370
371 #[test]
372 fn test_is_empty_method() {
373 let mut interner = StringInterner::new();
374 assert!(interner.is_empty());
375 interner.intern("x");
376 assert!(!interner.is_empty());
377 }
378}