sochdb_core/lockfree_interner.rs
// SPDX-License-Identifier: AGPL-3.0-or-later
// SochDB - LLM-Optimized Embedded Database
// Copyright (C) 2026 Sushanth Reddy Vanagala (https://github.com/sushanthpy)
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.

//! Lock-Free String Interner with Chunked Append-Only Storage
//!
//! This module implements a high-performance string interner that:
//! - Uses chunked append-only storage (never moves existing strings)
//! - Provides lock-free reads of string bytes via pre-allocated fixed buffers
//! - Uses sharded hash maps to reduce write contention
//! - Supports zero-cost string comparison via Symbol handles
//!
//! # Concurrency Guarantees - PLEASE READ CAREFULLY
//!
//! Despite the module name, this interner is **NOT fully lock-free**.
//! The "lock-free" in the name refers specifically to the chunk-storage
//! read path.
//!
//! ## Threading Guarantees Table
//!
//! | Operation | Guarantee | Notes |
//! |-----------|-----------|-------|
//! | `resolve()` | Low-contention | Brief read lock on the symbol table, then lock-free chunk traversal |
//! | `get_or_intern()` (hit) | Low-contention | Hash lookup under a sharded read lock |
//! | `get_or_intern()` (miss) | Blocking | Takes the shard's write lock, then the symbol-table write lock |
//! | `len()` | Wait-free | Atomic counter |
//!
//! ## What IS Provided
//!
//! - **Thread-safety**: All operations are safe from multiple threads
//! - **Lock-free chunk reads**: Once a string's location is known, reading its
//!   bytes uses only atomic chunk traversal
//! - **Low-contention lookups**: 256-way sharding reduces lock contention
//! - **Deadlock-freedom**: Locks are always acquired in the same order
//!   (shard lock first, then the symbol table)
//!
//! ## What is NOT Provided
//!
//! - **Lock-free writes**: Interning requires a write lock on the shard
//! - **Wait-free interning**: Writers may block under contention
//!
//! ## Progress Taxonomy
//!
//! ```text
//! non-blocking:  wait-free ⊂ lock-free ⊂ obstruction-free
//! blocking:      may wait on a lock
//!
//! chunk reads (Chunk::read)        -> lock-free (no locks at all)
//! resolve(), get_or_intern() hit   -> blocking, but only brief read locks
//! get_or_intern() miss             -> blocking (shard write lock)
//! ```
//!
//! This implementation provides:
//! - **Lock-free chunk reads**: Complete in a bounded number of steps
//! - **Blocking lookups and writes**: May wait for lock acquisition
//!
//! ## Memory Layout
//!
//! Each chunk is a fixed-size, zero-initialized buffer owned through a raw
//! pointer. Because the buffer never grows or moves, there is no data race on
//! allocation metadata, unlike a shared `Vec<u8>` (whose `len` and `capacity`
//! are not thread-safe to modify concurrently).
//!
//! ```text
//! Chunk 0 (1MB fixed)     Chunk 1 (1MB fixed)     Chunk 2 (1MB fixed)
//! ┌─────────────────┐     ┌─────────────────┐     ┌─────────────────┐
//! │ "hello"         │     │ "different"     │     │ "more strings"  │
//! │ "world"         │     │ "strings"       │     │ ...             │
//! │ "foo"           │     │ "here"          │     │                 │
//! │ [unused space]  │     │ [unused space]  │     │ [unused space]  │
//! └─────────────────┘     └─────────────────┘     └─────────────────┘
//!          ↑                       ↑                       ↑
//!          │                       │                       │
//!          AtomicPtr chain for lock-free traversal
//! ```
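//!
//! # Example
//!
//! A minimal usage sketch. It is marked `ignore` because the exact crate and
//! module path below is assumed from this file's location in the repository:
//!
//! ```ignore
//! use sochdb_core::lockfree_interner::{LockFreeInterner, Symbol};
//!
//! let interner = LockFreeInterner::new();
//!
//! // Interning the same string twice yields the same Symbol.
//! let hello = interner.get_or_intern("hello");
//! assert_eq!(interner.get_or_intern("hello"), hello);
//!
//! // Symbols resolve back to their strings; NULL resolves to None.
//! assert_eq!(interner.resolve(hello), Some("hello"));
//! assert_eq!(interner.resolve(Symbol::NULL), None);
//! ```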

use parking_lot::RwLock as ParkingLotRwLock;
use std::collections::HashMap;
use std::hash::{Hash, Hasher};
use std::sync::RwLock;
use std::sync::atomic::{AtomicPtr, AtomicU32, AtomicUsize, Ordering, fence};

/// Number of shards for the hash map (power of 2 for fast modulo)
const SHARD_COUNT: usize = 256;

/// Default chunk size (1MB)
const DEFAULT_CHUNK_SIZE: usize = 1024 * 1024;

/// Symbol handle - cheap to copy and compare
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
pub struct Symbol(pub u32);

impl Symbol {
    /// Invalid/null symbol
    pub const NULL: Symbol = Symbol(u32::MAX);

    /// Get the raw index
    pub fn index(self) -> u32 {
        self.0
    }
}

/// Reference to a string location in a chunk
#[derive(Clone, Copy)]
struct StringRef {
    /// Chunk index
    chunk_idx: u32,
    /// Offset within chunk
    offset: u32,
    /// Length of string
    length: u32,
}

/// A single chunk of string storage using a pre-allocated fixed buffer
///
/// ## Safety Rationale
///
/// This design eliminates the data race present in the Vec-based approach:
/// - `capacity` is immutable after construction
/// - `data` is a fixed-size buffer that never resizes
/// - `write_pos` atomically reserves disjoint byte ranges
/// - Each thread writes only to its reserved range
///
/// The only shared mutable state is the byte range reservation via `write_pos`,
/// which is handled atomically with `fetch_add`.
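///
/// A concrete illustration of the reservation scheme (the thread names and
/// sizes below are hypothetical):
///
/// ```text
/// write_pos = 0
/// T1: fetch_add(5)  -> owns bytes [0, 5)
/// T2: fetch_add(11) -> owns bytes [5, 16)
/// ```
///
/// T1 and T2 copy into disjoint ranges; a Release fence publishes the bytes
/// before the resulting StringRef is inserted into the hash map.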
struct Chunk {
    /// Fixed-size pre-allocated buffer (never resized).
    /// Raw pointer into a zero-initialized boxed byte slice; threads write
    /// only into byte ranges they have atomically reserved via `write_pos`.
    data: *mut u8,
    /// Buffer capacity (immutable after creation)
    capacity: usize,
    /// Current write position (atomically updated)
    write_pos: AtomicUsize,
    /// Next chunk in chain
    next: AtomicPtr<Chunk>,
}

impl Chunk {
    fn new(capacity: usize) -> Box<Self> {
        // Allocate the buffer as a Vec<u8> with zeros
        let buffer: Vec<u8> = vec![0u8; capacity];
        let ptr = Box::into_raw(buffer.into_boxed_slice()) as *mut u8;

        Box::new(Self {
            data: ptr,
            capacity,
            write_pos: AtomicUsize::new(0),
            next: AtomicPtr::new(std::ptr::null_mut()),
        })
    }

    /// Try to append a string, returns offset if successful
    ///
    /// ## Safety
    ///
    /// This is safe because:
    /// 1. `fetch_add` atomically reserves a disjoint byte range
    /// 2. Each thread writes only to its reserved range
    /// 3. The buffer is pre-allocated and never resized (no Vec metadata race)
    /// 4. Release fence ensures bytes are visible before returning
    fn try_append(&self, s: &str) -> Option<u32> {
        let bytes = s.as_bytes();
        let len = bytes.len();

        if len == 0 {
            // Empty strings can share offset 0 with length 0
            return Some(0);
        }

        // Atomically reserve space
        let offset = self.write_pos.fetch_add(len, Ordering::Relaxed);

        // Check if we have room
        if offset + len > self.capacity {
            // Rollback reservation (best effort - may waste some space on high contention)
            // This is safe because the space is simply unused
            self.write_pos.fetch_sub(len, Ordering::Relaxed);
            return None;
        }

        // SAFETY: We have exclusive access to [offset..offset+len] via atomic reservation.
        // The buffer was pre-allocated with `capacity` bytes.
        // No other thread will write to this range.
        unsafe {
            std::ptr::copy_nonoverlapping(bytes.as_ptr(), self.data.add(offset), len);
        }

        // Release fence ensures bytes are visible to other threads
        // before we return the StringRef (which gets published to the hash map)
        fence(Ordering::Release);

        Some(offset as u32)
    }

    /// Read a string at offset (lock-free)
    ///
    /// ## Safety
    ///
    /// Caller must ensure that the offset and length were returned by a prior
    /// `try_append` on this chunk. The Acquire fence issued here pairs with the
    /// Release fence in `try_append`, making the written bytes visible.
    fn read(&self, offset: u32, len: u32) -> &str {
        // Acquire fence ensures we see the bytes written by try_append
        fence(Ordering::Acquire);

        // SAFETY:
        // - offset and len came from a successful try_append
        // - The buffer is stable (never moved or reallocated)
        // - Acquire fence synchronized with Release fence in try_append
        unsafe {
            let bytes = std::slice::from_raw_parts(self.data.add(offset as usize), len as usize);
            // SAFETY: we only store valid UTF-8 from str::as_bytes()
            std::str::from_utf8_unchecked(bytes)
        }
    }
}

impl Drop for Chunk {
    fn drop(&mut self) {
        // Reconstruct and drop the boxed slice
        unsafe {
            let slice = std::slice::from_raw_parts_mut(self.data, self.capacity);
            drop(Box::from_raw(slice as *mut [u8]));
        }
    }
}

/// Chunked append-only storage for strings
pub struct ChunkedStorage {
    /// Head of chunk chain
    head: AtomicPtr<Chunk>,
    /// Current chunk for new allocations
    current: AtomicPtr<Chunk>,
    /// Configured chunk size (oversized strings get a dedicated larger chunk)
    chunk_size: usize,
    /// Number of chunks
    chunk_count: AtomicU32,
}

impl ChunkedStorage {
    fn new(chunk_size: usize) -> Self {
        let initial = Box::into_raw(Chunk::new(chunk_size));
        Self {
            head: AtomicPtr::new(initial),
            current: AtomicPtr::new(initial),
            chunk_size,
            chunk_count: AtomicU32::new(1),
        }
    }

    /// Append a string, returns reference
    fn append(&self, s: &str) -> StringRef {
        loop {
            let current = self.current.load(Ordering::Acquire);
            let chunk = unsafe { &*current };

            // Try to append to current chunk
            if let Some(offset) = chunk.try_append(s) {
                let chunk_idx = self.get_chunk_index(current);
                return StringRef {
                    chunk_idx,
                    offset,
                    length: s.len() as u32,
                };
            }

            // Need a new chunk, sized to fit this string if it exceeds the
            // configured chunk size (otherwise this loop could never terminate)
            self.grow_chunk(current, s.len());
        }
    }

    /// Grow by adding a new chunk of at least `min_capacity` bytes
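    ///
    /// A sketch of the growth race between two hypothetical threads T1 and T2
    /// that both observed chunk A as full:
    ///
    /// ```text
    /// T1: CAS(A.next: null -> B) succeeds -> current = B
    /// T2: CAS(A.next: null -> C) fails    -> frees C, current = A.next (== B)
    /// ```
    ///
    /// Exactly one new chunk is linked; the losing thread retries its append
    /// on the winner's chunk.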
    fn grow_chunk(&self, expected_current: *mut Chunk, min_capacity: usize) {
        let new_chunk = Box::into_raw(Chunk::new(self.chunk_size.max(min_capacity)));

        // Link new chunk to current
        let current = unsafe { &*expected_current };

        // Try to set next pointer
        if current
            .next
            .compare_exchange(
                std::ptr::null_mut(),
                new_chunk,
                Ordering::AcqRel,
                Ordering::Relaxed,
            )
            .is_ok()
        {
            // We successfully linked the new chunk
            self.current.store(new_chunk, Ordering::Release);
            self.chunk_count.fetch_add(1, Ordering::Relaxed);
        } else {
            // Another thread already grew - free our chunk
            unsafe {
                drop(Box::from_raw(new_chunk));
            }
            // Update current to the actual new chunk
            let actual_next = current.next.load(Ordering::Acquire);
            if !actual_next.is_null() {
                self.current.store(actual_next, Ordering::Release);
            }
        }
    }

    /// Get chunk by index (lock-free traversal)
    fn get_chunk(&self, idx: u32) -> Option<&Chunk> {
        let mut current = self.head.load(Ordering::Acquire);
        let mut i = 0u32;

        while !current.is_null() {
            if i == idx {
                return Some(unsafe { &*current });
            }
            current = unsafe { (*current).next.load(Ordering::Acquire) };
            i += 1;
        }

        None
    }

    /// Get index of a chunk pointer
    fn get_chunk_index(&self, ptr: *mut Chunk) -> u32 {
        let mut current = self.head.load(Ordering::Acquire);
        let mut i = 0u32;

        while !current.is_null() {
            if current == ptr {
                return i;
            }
            current = unsafe { (*current).next.load(Ordering::Acquire) };
            i += 1;
        }

        0 // Fallback (shouldn't happen)
    }

    /// Read a string by reference (lock-free)
    fn read(&self, string_ref: StringRef) -> Option<&str> {
        let chunk = self.get_chunk(string_ref.chunk_idx)?;
        Some(chunk.read(string_ref.offset, string_ref.length))
    }
}

impl Drop for ChunkedStorage {
    fn drop(&mut self) {
        let mut current = self.head.load(Ordering::Relaxed);
        while !current.is_null() {
            let chunk = unsafe { Box::from_raw(current) };
            current = chunk.next.load(Ordering::Relaxed);
        }
    }
}

/// Hasher for string lookup
fn hash_string(s: &str) -> u64 {
    let mut hasher = std::collections::hash_map::DefaultHasher::new();
    s.hash(&mut hasher);
    hasher.finish()
}

/// String interner with lock-free chunk reads and sharded, low-contention
/// lookups (see the module docs for the exact guarantees)
pub struct LockFreeInterner {
    /// Sharded hash maps: 64-bit string hash -> (Symbol, StringRef).
    /// The full 64-bit hash is used as the map key (no per-entry string
    /// comparison), so a hash collision between distinct strings would map
    /// them to the same Symbol.
    shards: [ParkingLotRwLock<HashMap<u64, (Symbol, StringRef)>>; SHARD_COUNT],
    /// Chunked string storage
    storage: ChunkedStorage,
    /// Next symbol ID
    next_symbol: AtomicU32,
    /// Reverse mapping: Symbol -> StringRef (for resolve)
    symbols: RwLock<Vec<StringRef>>,
    /// Statistics
    stats: InternerStats,
}

/// Interner statistics
#[derive(Default)]
pub struct InternerStats {
    pub interned_count: AtomicU32,
    pub lookup_hits: AtomicU32,
    pub lookup_misses: AtomicU32,
    pub resolve_count: AtomicU32,
}

impl Default for LockFreeInterner {
    fn default() -> Self {
        Self::new()
    }
}

impl LockFreeInterner {
    /// Create a new interner
    pub fn new() -> Self {
        Self::with_chunk_size(DEFAULT_CHUNK_SIZE)
    }

    /// Create with custom chunk size
    pub fn with_chunk_size(chunk_size: usize) -> Self {
        Self {
            shards: std::array::from_fn(|_| ParkingLotRwLock::new(HashMap::new())),
            storage: ChunkedStorage::new(chunk_size),
            next_symbol: AtomicU32::new(0),
            symbols: RwLock::new(Vec::new()),
            stats: InternerStats::default(),
        }
    }

    /// Get shard index for a hash
    #[inline]
    fn shard_index(&self, hash: u64) -> usize {
        (hash as usize) % SHARD_COUNT
    }

    /// Get or intern a string
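    ///
    /// Concurrent callers interning the same string always receive the same
    /// `Symbol` (the write path double-checks under the shard lock). A sketch,
    /// marked `ignore` because the crate path below is assumed:
    ///
    /// ```ignore
    /// use std::sync::Arc;
    /// use sochdb_core::lockfree_interner::LockFreeInterner;
    ///
    /// let interner = Arc::new(LockFreeInterner::new());
    /// let handles: Vec<_> = (0..4)
    ///     .map(|_| {
    ///         let interner = interner.clone();
    ///         std::thread::spawn(move || interner.get_or_intern("shared"))
    ///     })
    ///     .collect();
    /// let symbols: Vec<_> = handles.into_iter().map(|h| h.join().unwrap()).collect();
    /// assert!(symbols.windows(2).all(|w| w[0] == w[1]));
    /// assert_eq!(interner.len(), 1);
    /// ```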
    pub fn get_or_intern(&self, s: &str) -> Symbol {
        let hash = hash_string(s);
        let shard_idx = self.shard_index(hash);

        // Fast path: read-only lookup
        {
            let shard = self.shards[shard_idx].read();
            if let Some(&(symbol, _)) = shard.get(&hash) {
                self.stats.lookup_hits.fetch_add(1, Ordering::Relaxed);
                return symbol;
            }
        }

        self.stats.lookup_misses.fetch_add(1, Ordering::Relaxed);

        // Slow path: need to intern
        let mut shard = self.shards[shard_idx].write();

        // Double-check after acquiring write lock
        if let Some(&(symbol, _)) = shard.get(&hash) {
            return symbol;
        }

        // Allocate symbol and store string
        let symbol = Symbol(self.next_symbol.fetch_add(1, Ordering::SeqCst));
        let string_ref = self.storage.append(s);

        shard.insert(hash, (symbol, string_ref));

        // Update reverse mapping
        {
            let mut symbols = self.symbols.write().unwrap();
            // Ensure vector is large enough
            while symbols.len() <= symbol.0 as usize {
                symbols.push(StringRef {
                    chunk_idx: 0,
                    offset: 0,
                    length: 0,
                });
            }
            symbols[symbol.0 as usize] = string_ref;
        }

        self.stats.interned_count.fetch_add(1, Ordering::Relaxed);
        symbol
    }

    /// Resolve a symbol to its string
    ///
    /// Takes a brief read lock on the symbol table to find the `StringRef`,
    /// then reads the bytes from chunked storage without any locks.
    pub fn resolve(&self, symbol: Symbol) -> Option<&str> {
        if symbol == Symbol::NULL {
            return None;
        }

        self.stats.resolve_count.fetch_add(1, Ordering::Relaxed);

        // Read from symbols vector (only takes read lock briefly)
        let string_ref = {
            let symbols = self.symbols.read().unwrap();
            symbols.get(symbol.0 as usize).copied()
        }?;

        // Storage read is completely lock-free
        self.storage.read(string_ref)
    }

    /// Get the number of interned strings
    pub fn len(&self) -> usize {
        self.next_symbol.load(Ordering::Relaxed) as usize
    }

    /// Check if empty
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Get statistics
    pub fn stats(&self) -> &InternerStats {
        &self.stats
    }

    /// Approximate memory used by string storage
    /// (assumes every chunk has the configured chunk size)
    pub fn storage_bytes(&self) -> usize {
        self.storage.chunk_count.load(Ordering::Relaxed) as usize * self.storage.chunk_size
    }
}

// SAFETY: LockFreeInterner is safe to share across threads because:
// 1. All hash map access is protected by ParkingLotRwLock (sharded)
// 2. ChunkedStorage uses atomic operations for all shared mutable state
// 3. Chunk buffers are pre-allocated fixed-size (no Vec metadata races)
// 4. Memory ordering is correct (Release on write, Acquire on read)
unsafe impl Send for LockFreeInterner {}
unsafe impl Sync for LockFreeInterner {}

// SAFETY: ChunkedStorage is safe to share across threads because:
// 1. write_pos uses atomic fetch_add for disjoint range reservation
// 2. Chunk buffers are fixed-size pre-allocations (immutable capacity)
// 3. Chunk chain traversal uses AtomicPtr with proper ordering
// 4. Each writer has exclusive access to its reserved byte range
unsafe impl Send for ChunkedStorage {}
unsafe impl Sync for ChunkedStorage {}

// SAFETY: Chunk contains a raw pointer but is designed for concurrent access:
// 1. data pointer is stable (never moved after allocation)
// 2. capacity is immutable after construction
// 3. write_pos is atomic
// 4. Each thread writes only to atomically-reserved disjoint ranges
unsafe impl Send for Chunk {}
unsafe impl Sync for Chunk {}

#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::Arc;
    use std::thread;

    #[test]
    fn test_basic_intern() {
        let interner = LockFreeInterner::new();

        let s1 = interner.get_or_intern("hello");
        let s2 = interner.get_or_intern("world");
        let s3 = interner.get_or_intern("hello");

        // Same string should give same symbol
        assert_eq!(s1, s3);
        // Different strings should give different symbols
        assert_ne!(s1, s2);

        assert_eq!(interner.len(), 2);
    }

    #[test]
    fn test_resolve() {
        let interner = LockFreeInterner::new();

        let s1 = interner.get_or_intern("hello");
        let s2 = interner.get_or_intern("world");

        assert_eq!(interner.resolve(s1), Some("hello"));
        assert_eq!(interner.resolve(s2), Some("world"));
        assert_eq!(interner.resolve(Symbol::NULL), None);
    }

    #[test]
    fn test_concurrent_intern() {
        let interner = Arc::new(LockFreeInterner::new());
        let mut handles = vec![];

        // 8 threads each intern the same 100 strings
        for _ in 0..8 {
            let interner = interner.clone();
            handles.push(thread::spawn(move || {
                let mut symbols = Vec::new();
                for i in 0..100 {
                    let s = format!("string_{}", i);
                    symbols.push(interner.get_or_intern(&s));
                }
                symbols
            }));
        }

        let results: Vec<_> = handles.into_iter().map(|h| h.join().unwrap()).collect();

        // All threads should get the same symbols for the same strings
        for i in 0..100 {
            let first = results[0][i];
            for result in &results[1..] {
                assert_eq!(result[i], first, "Symbol mismatch for string_{}", i);
            }
        }

        // Should have exactly 100 unique strings
        assert_eq!(interner.len(), 100);
    }

    #[test]
    fn test_concurrent_resolve() {
        let interner = Arc::new(LockFreeInterner::new());

        // Intern some strings first
        let symbols: Vec<_> = (0..100)
            .map(|i| interner.get_or_intern(&format!("string_{}", i)))
            .collect();

        let mut handles = vec![];

        // 8 threads each resolve all strings many times
        for _ in 0..8 {
            let interner = interner.clone();
            let symbols = symbols.clone();
            handles.push(thread::spawn(move || {
                for _ in 0..100 {
                    for (i, &symbol) in symbols.iter().enumerate() {
                        let resolved = interner.resolve(symbol);
                        assert_eq!(resolved, Some(format!("string_{}", i)).as_deref());
                    }
                }
            }));
        }

        for handle in handles {
            handle.join().unwrap();
        }

        // Should have many resolve operations
        assert!(interner.stats().resolve_count.load(Ordering::Relaxed) > 0);
    }

    #[test]
    fn test_chunk_growth() {
        // Small chunk size to force growth
        let interner = LockFreeInterner::with_chunk_size(1024);

        // Intern enough strings to fill multiple chunks
        for i in 0..1000 {
            let s = format!("this_is_a_longer_string_number_{:05}", i);
            interner.get_or_intern(&s);
        }

        // Should have grown to multiple chunks
        assert!(interner.storage.chunk_count.load(Ordering::Relaxed) > 1);

        // All strings should still be resolvable
        for i in 0..1000 {
            let s = format!("this_is_a_longer_string_number_{:05}", i);
            let symbol = interner.get_or_intern(&s);
            assert_eq!(interner.resolve(symbol), Some(s.as_str()));
        }
    }

    #[test]
    fn test_symbol_comparison() {
        let interner = LockFreeInterner::new();

        let s1 = interner.get_or_intern("hello");
        let s2 = interner.get_or_intern("world");
        let s3 = interner.get_or_intern("hello");

        // Symbol comparison is O(1) - just comparing u32
        assert_eq!(s1, s3);
        assert_ne!(s1, s2);

        // Can use as hash map key
        let mut map = HashMap::new();
        map.insert(s1, "value1");
        map.insert(s2, "value2");

        assert_eq!(map.get(&s3), Some(&"value1"));
    }

    #[test]
    fn test_stats() {
        let interner = LockFreeInterner::new();

        // Intern some strings
        interner.get_or_intern("hello");
        interner.get_or_intern("world");
        interner.get_or_intern("hello"); // Hit

        let stats = interner.stats();
        assert_eq!(stats.interned_count.load(Ordering::Relaxed), 2);
        assert!(stats.lookup_hits.load(Ordering::Relaxed) >= 1);
        assert!(stats.lookup_misses.load(Ordering::Relaxed) >= 2);
    }

    #[test]
    fn test_concurrent_intern_and_resolve() {
        let interner = Arc::new(LockFreeInterner::new());
        let mut handles = vec![];

        // Writers: intern new strings
        for writer_id in 0..4 {
            let interner = interner.clone();
            handles.push(thread::spawn(move || {
                for i in 0..100 {
                    let s = format!("writer_{}_string_{}", writer_id, i);
                    interner.get_or_intern(&s);
                }
            }));
        }

        // Readers: continuously resolve existing strings
        for reader_id in 0..4 {
            let interner = interner.clone();
            handles.push(thread::spawn(move || {
                for i in 0..1000 {
                    // Intern and immediately resolve
                    let s = format!("common_string_{}", (i + reader_id) % 10);
                    let symbol = interner.get_or_intern(&s);
                    let resolved = interner.resolve(symbol);
                    assert!(resolved.is_some());
                }
            }));
        }

        for handle in handles {
            handle.join().unwrap();
        }
    }

    #[test]
    fn test_empty_string() {
        let interner = LockFreeInterner::new();

        let s = interner.get_or_intern("");
        assert_eq!(interner.resolve(s), Some(""));
    }

    #[test]
    fn test_long_strings() {
        let interner = LockFreeInterner::new();

        let long_string: String = (0..10000).map(|_| 'x').collect();
        let symbol = interner.get_or_intern(&long_string);
        assert_eq!(interner.resolve(symbol), Some(long_string.as_str()));
    }

    #[test]
    fn test_unicode_strings() {
        let interner = LockFreeInterner::new();

        let s1 = interner.get_or_intern("hello 世界 🌍");
        let s2 = interner.get_or_intern("émojis: 🎉🎊🎁");

        assert_eq!(interner.resolve(s1), Some("hello 世界 🌍"));
        assert_eq!(interner.resolve(s2), Some("émojis: 🎉🎊🎁"));
    }

    #[test]
    fn test_shard_distribution() {
        let interner = LockFreeInterner::new();

        // Intern many strings
        for i in 0..10000 {
            interner.get_or_intern(&format!("key_{}", i));
        }

        // Check that strings are distributed across shards
        let mut non_empty = 0;
        for shard in &interner.shards {
            if !shard.read().is_empty() {
                non_empty += 1;
            }
        }

        // With 10000 strings across 256 shards, should use many shards
        assert!(
            non_empty > 200,
            "Expected better shard distribution: {} non-empty",
            non_empty
        );
    }
}