sochdb_core/lockfree_interner.rs

// Copyright 2025 Sushanth (https://github.com/sushanthpy)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//! Lock-Free String Interner with Chunked Append-Only Storage
//!
//! This module implements a high-performance string interner that:
//! - Uses chunked append-only storage (never moves existing strings)
//! - Provides lock-free chunk reads via pre-allocated fixed buffers
//! - Uses sharded hash maps to reduce write contention
//! - Supports zero-cost string comparison via Symbol handles
//!
//! # Concurrency Guarantees - PLEASE READ CAREFULLY
//!
//! Despite the module name, this interner is **NOT fully lock-free**.
//! The "lock-free" in the name refers specifically to the chunk-storage read path.
//!
//! ## Threading Guarantees Table
//!
//! | Operation | Guarantee | Notes |
//! |-----------|-----------|-------|
//! | `resolve()` | Low-contention | Brief read lock on the symbol table, then lock-free chunk traversal |
//! | `get_or_intern()` (hit) | Low-contention | Hash lookup under a sharded read lock |
//! | `get_or_intern()` (miss) | Blocking | Write lock on the shard, plus the symbol-table write lock |
//! | `len()` | Wait-free | Atomic counter |
//!
//! ## What IS Provided
//!
//! - **Thread-safety**: All operations are safe from multiple threads
//! - **Lock-free chunk reads**: string bytes are read via atomic chunk traversal
//! - **Low-contention lookups**: 256-way sharding reduces lock contention
//! - **Deadlock-freedom**: locks are always taken in the same order
//!   (shard lock before the symbol-table lock), never in reverse
//!
//! ## What is NOT Provided
//!
//! - **Lock-free writes**: Interning requires write locks on shards
//! - **Wait-free interning**: Writers may block under contention
//! - **Lock-free symbol lookup**: `resolve()` briefly takes a read lock on the
//!   symbol table before the lock-free chunk read
//!
//! ## Progress Taxonomy
//!
//! ```text
//! wait-free ⊂ lock-free ⊂ obstruction-free ⊂ blocking
//!     ↑                                          ↑
//!   len()                  get_or_intern() / resolve() symbol lookup
//! ```
//!
//! This implementation provides:
//! - **Wait-free counters**: `len()` always completes in bounded steps
//! - **Blocking lookups and writes**: `resolve()` takes a brief read lock;
//!   interning may wait for lock acquisition
//!
//! ## Memory Layout
//!
//! Each chunk is a fixed-size, zero-initialized buffer allocated once as a boxed
//! slice and accessed through a stable raw pointer. This eliminates the data race
//! that would occur with `Vec<u8>` (whose metadata like `len` and `capacity` is
//! not thread-safe to modify concurrently).
//!
//! ```text
//! Chunk 0 (1MB fixed)       Chunk 1 (1MB fixed)       Chunk 2 (1MB fixed)
//! ┌─────────────────┐       ┌─────────────────┐       ┌─────────────────┐
//! │ "hello"         │       │ "different"     │       │ "more strings"  │
//! │ "world"         │       │ "strings"       │       │ ...             │
//! │ "foo"           │       │ "here"          │       │                 │
//! │ [unused]        │       │ [unused]        │       │ [unused]        │
//! └─────────────────┘       └─────────────────┘       └─────────────────┘
//!           ↑                        ↑                        ↑
//!           │                        │                        │
//!     AtomicPtr chain for lock-free traversal
//! ```
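//!
//! ## Usage Sketch
//!
//! A minimal usage sketch of the public API in this module. The `use` path is
//! an assumption based on this file's location (`sochdb_core/lockfree_interner.rs`);
//! adjust it to the crate's actual module layout.
//!
//! ```ignore
//! use sochdb_core::lockfree_interner::{LockFreeInterner, Symbol};
//!
//! let interner = LockFreeInterner::new();
//!
//! // Interning the same string twice yields the same Symbol.
//! let a = interner.get_or_intern("hello");
//! let b = interner.get_or_intern("hello");
//! assert_eq!(a, b);
//!
//! // Symbols resolve back to the original string; NULL resolves to None.
//! assert_eq!(interner.resolve(a), Some("hello"));
//! assert_eq!(interner.resolve(Symbol::NULL), None);
//! assert_eq!(interner.len(), 1);
//! ```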

use parking_lot::RwLock as ParkingLotRwLock;
use std::collections::HashMap;
use std::hash::{Hash, Hasher};
use std::sync::RwLock;
use std::sync::atomic::{AtomicPtr, AtomicU32, AtomicUsize, Ordering, fence};

/// Number of shards for the hash map (power of 2 for fast modulo)
const SHARD_COUNT: usize = 256;

/// Default chunk size (1MB)
const DEFAULT_CHUNK_SIZE: usize = 1024 * 1024;

/// Symbol handle - cheap to copy and compare
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
pub struct Symbol(pub u32);

impl Symbol {
    /// Invalid/null symbol
    pub const NULL: Symbol = Symbol(u32::MAX);

    /// Get the raw index
    pub fn index(self) -> u32 {
        self.0
    }
}

/// Reference to a string location in a chunk
#[derive(Clone, Copy)]
struct StringRef {
    /// Chunk index
    chunk_idx: u32,
    /// Offset within chunk
    offset: u32,
    /// Length of string
    length: u32,
}

/// A single chunk of string storage using a pre-allocated fixed buffer
///
/// ## Safety Rationale
///
/// This design eliminates the data race present in a Vec-based approach:
/// - `capacity` is immutable after construction
/// - `data` is a fixed-size buffer that never resizes
/// - `write_pos` atomically reserves disjoint byte ranges
/// - Each thread writes only to its reserved range
///
/// The only shared mutable state is the byte-range reservation via `write_pos`,
/// which is handled with an atomic compare-exchange loop.
struct Chunk {
    /// Fixed-size pre-allocated buffer (never resized).
    /// The buffer is zero-initialized at allocation; writes go only to
    /// atomically reserved ranges.
    data: *mut u8,
    /// Buffer capacity (immutable after creation)
    capacity: usize,
    /// Current write position (atomically updated)
    write_pos: AtomicUsize,
    /// Next chunk in chain
    next: AtomicPtr<Chunk>,
}

impl Chunk {
    fn new(capacity: usize) -> Box<Self> {
        // Allocate the buffer as a Vec<u8> with zeros
        let buffer: Vec<u8> = vec![0u8; capacity];
        let ptr = Box::into_raw(buffer.into_boxed_slice()) as *mut u8;

        Box::new(Self {
            data: ptr,
            capacity,
            write_pos: AtomicUsize::new(0),
            next: AtomicPtr::new(std::ptr::null_mut()),
        })
    }

    /// Try to append a string, returns offset if successful
    ///
    /// ## Safety
    ///
    /// This is safe because:
    /// 1. The compare-exchange loop atomically reserves a disjoint, in-bounds byte range
    /// 2. Each thread writes only to its reserved range
    /// 3. The buffer is pre-allocated and never resized (no Vec metadata race)
    /// 4. Release fence ensures bytes are visible before returning
    fn try_append(&self, s: &str) -> Option<u32> {
        let bytes = s.as_bytes();
        let len = bytes.len();

        if len == 0 {
            // Empty strings can share offset 0 with length 0
            return Some(0);
        }

        // Atomically reserve space with a compare-exchange loop. A plain
        // `fetch_add` with a `fetch_sub` rollback on overflow is unsound here:
        // a rollback racing with other reservations can hand the same byte
        // range to two writers. The CAS loop never moves `write_pos` past
        // `capacity`, so every successful reservation is disjoint and in bounds.
        let mut offset = self.write_pos.load(Ordering::Relaxed);
        loop {
            if offset + len > self.capacity {
                // Not enough room in this chunk
                return None;
            }
            match self.write_pos.compare_exchange_weak(
                offset,
                offset + len,
                Ordering::Relaxed,
                Ordering::Relaxed,
            ) {
                Ok(_) => break,
                Err(current) => offset = current,
            }
        }

        // SAFETY: We have exclusive access to [offset..offset+len] via the atomic
        // reservation above. The buffer was pre-allocated with `capacity` bytes
        // and no other thread will write to this range.
        unsafe {
            std::ptr::copy_nonoverlapping(bytes.as_ptr(), self.data.add(offset), len);
        }

        // Release fence ensures bytes are visible to other threads
        // before we return the StringRef (which gets published to the hash map)
        fence(Ordering::Release);

        Some(offset as u32)
    }

    /// Read a string at offset (lock-free)
    ///
    /// ## Safety
    ///
    /// Caller must ensure that the offset and length were returned by a prior
    /// `try_append` on this chunk. The Acquire fence issued here pairs with the
    /// Release fence in `try_append`, making the written bytes visible.
    fn read(&self, offset: u32, len: u32) -> &str {
        // Acquire fence ensures we see the bytes written by try_append
        fence(Ordering::Acquire);

        // SAFETY:
        // - offset and len came from a successful try_append
        // - The buffer is stable (never moved or reallocated)
        // - Acquire fence synchronized with Release fence in try_append
        unsafe {
            let bytes = std::slice::from_raw_parts(self.data.add(offset as usize), len as usize);
            // SAFETY: we only store valid UTF-8 from str::as_bytes()
            std::str::from_utf8_unchecked(bytes)
        }
    }
}

impl Drop for Chunk {
    fn drop(&mut self) {
        // Reconstruct and drop the boxed slice
        unsafe {
            let slice = std::slice::from_raw_parts_mut(self.data, self.capacity);
            drop(Box::from_raw(slice as *mut [u8]));
        }
    }
}

/// Chunked append-only storage for strings
pub struct ChunkedStorage {
    /// Head of chunk chain
    head: AtomicPtr<Chunk>,
    /// Current chunk for new allocations
    current: AtomicPtr<Chunk>,
    /// Chunk size
    chunk_size: usize,
    /// Number of chunks
    chunk_count: AtomicU32,
}

impl ChunkedStorage {
    fn new(chunk_size: usize) -> Self {
        let initial = Box::into_raw(Chunk::new(chunk_size));
        Self {
            head: AtomicPtr::new(initial),
            current: AtomicPtr::new(initial),
            chunk_size,
            chunk_count: AtomicU32::new(1),
        }
    }

    /// Append a string, returns reference
    fn append(&self, s: &str) -> StringRef {
        loop {
            let current = self.current.load(Ordering::Acquire);
            let chunk = unsafe { &*current };

            // Try to append to current chunk
            if let Some(offset) = chunk.try_append(s) {
                let chunk_idx = self.get_chunk_index(current);
                return StringRef {
                    chunk_idx,
                    offset,
                    length: s.len() as u32,
                };
            }

            // Need a new chunk; size it to fit the pending string so that
            // strings longer than `chunk_size` cannot loop forever on chunks
            // they can never fit into.
            self.grow_chunk(current, s.len());
        }
    }

    /// Grow by adding a new chunk of at least `min_size` bytes
    fn grow_chunk(&self, expected_current: *mut Chunk, min_size: usize) {
        let new_chunk = Box::into_raw(Chunk::new(self.chunk_size.max(min_size)));

        // Link new chunk to current
        let current = unsafe { &*expected_current };

        // Try to set next pointer
        if current
            .next
            .compare_exchange(
                std::ptr::null_mut(),
                new_chunk,
                Ordering::AcqRel,
                Ordering::Relaxed,
            )
            .is_ok()
        {
            // We successfully linked the new chunk
            self.current.store(new_chunk, Ordering::Release);
            self.chunk_count.fetch_add(1, Ordering::Relaxed);
        } else {
            // Another thread already grew - free our chunk
            unsafe {
                drop(Box::from_raw(new_chunk));
            }
            // Advance `current` past the full chunk; other threads may advance
            // it further.
            let actual_next = current.next.load(Ordering::Acquire);
            if !actual_next.is_null() {
                self.current.store(actual_next, Ordering::Release);
            }
        }
    }

    /// Get chunk by index (lock-free traversal)
    fn get_chunk(&self, idx: u32) -> Option<&Chunk> {
        let mut current = self.head.load(Ordering::Acquire);
        let mut i = 0u32;

        while !current.is_null() {
            if i == idx {
                return Some(unsafe { &*current });
            }
            current = unsafe { (*current).next.load(Ordering::Acquire) };
            i += 1;
        }

        None
    }

    /// Get index of a chunk pointer
    fn get_chunk_index(&self, ptr: *mut Chunk) -> u32 {
        let mut current = self.head.load(Ordering::Acquire);
        let mut i = 0u32;

        while !current.is_null() {
            if current == ptr {
                return i;
            }
            current = unsafe { (*current).next.load(Ordering::Acquire) };
            i += 1;
        }

        0 // Fallback (shouldn't happen)
    }

    /// Read a string by reference (lock-free)
    fn read(&self, string_ref: StringRef) -> Option<&str> {
        let chunk = self.get_chunk(string_ref.chunk_idx)?;
        Some(chunk.read(string_ref.offset, string_ref.length))
    }
}

impl Drop for ChunkedStorage {
    fn drop(&mut self) {
        let mut current = self.head.load(Ordering::Relaxed);
        while !current.is_null() {
            let chunk = unsafe { Box::from_raw(current) };
            current = chunk.next.load(Ordering::Relaxed);
        }
    }
}

/// Hasher for string lookup
fn hash_string(s: &str) -> u64 {
    let mut hasher = std::collections::hash_map::DefaultHasher::new();
    s.hash(&mut hasher);
    hasher.finish()
}

/// Lock-free string interner
pub struct LockFreeInterner {
    /// Sharded hash maps: string hash -> Symbol
    ///
    /// Note: entries are keyed by the 64-bit hash alone, so two distinct
    /// strings that hash to the same value would be conflated; the design
    /// assumes such collisions do not occur in practice.
    shards: [ParkingLotRwLock<HashMap<u64, (Symbol, StringRef)>>; SHARD_COUNT],
    /// Chunked string storage
    storage: ChunkedStorage,
    /// Next symbol ID
    next_symbol: AtomicU32,
    /// Reverse mapping: Symbol -> StringRef (for resolve)
    symbols: RwLock<Vec<StringRef>>,
    /// Statistics
    stats: InternerStats,
}

/// Interner statistics
#[derive(Default)]
pub struct InternerStats {
    pub interned_count: AtomicU32,
    pub lookup_hits: AtomicU32,
    pub lookup_misses: AtomicU32,
    pub resolve_count: AtomicU32,
}

impl Default for LockFreeInterner {
    fn default() -> Self {
        Self::new()
    }
}

impl LockFreeInterner {
    /// Create a new interner
    pub fn new() -> Self {
        Self::with_chunk_size(DEFAULT_CHUNK_SIZE)
    }

    /// Create with custom chunk size
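    ///
    /// A small sketch of picking a non-default chunk size (the value here is
    /// arbitrary and only for illustration):
    ///
    /// ```ignore
    /// // 64 KiB chunks instead of the 1 MiB default
    /// let interner = LockFreeInterner::with_chunk_size(64 * 1024);
    /// let sym = interner.get_or_intern("hello");
    /// assert_eq!(interner.resolve(sym), Some("hello"));
    /// ```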
    pub fn with_chunk_size(chunk_size: usize) -> Self {
        Self {
            shards: std::array::from_fn(|_| ParkingLotRwLock::new(HashMap::new())),
            storage: ChunkedStorage::new(chunk_size),
            next_symbol: AtomicU32::new(0),
            symbols: RwLock::new(Vec::new()),
            stats: InternerStats::default(),
        }
    }

    /// Get shard index for a hash
    #[inline]
    fn shard_index(&self, hash: u64) -> usize {
        (hash as usize) % SHARD_COUNT
    }

    /// Get or intern a string
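    ///
    /// A one-line dedup sketch: interning the same text twice returns the same
    /// `Symbol`, so symbol equality stands in for string equality.
    ///
    /// ```ignore
    /// let interner = LockFreeInterner::new();
    /// assert_eq!(interner.get_or_intern("a"), interner.get_or_intern("a"));
    /// ```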
    pub fn get_or_intern(&self, s: &str) -> Symbol {
        let hash = hash_string(s);
        let shard_idx = self.shard_index(hash);

        // Fast path: read-only lookup
        {
            let shard = self.shards[shard_idx].read();
            if let Some(&(symbol, _)) = shard.get(&hash) {
                self.stats.lookup_hits.fetch_add(1, Ordering::Relaxed);
                return symbol;
            }
        }

        self.stats.lookup_misses.fetch_add(1, Ordering::Relaxed);

        // Slow path: need to intern
        let mut shard = self.shards[shard_idx].write();

        // Double-check after acquiring write lock
        if let Some(&(symbol, _)) = shard.get(&hash) {
            return symbol;
        }

        // Allocate symbol and store string
        let symbol = Symbol(self.next_symbol.fetch_add(1, Ordering::SeqCst));
        let string_ref = self.storage.append(s);

        shard.insert(hash, (symbol, string_ref));

        // Update reverse mapping
        {
            let mut symbols = self.symbols.write().unwrap();
            // Ensure vector is large enough
            while symbols.len() <= symbol.0 as usize {
                symbols.push(StringRef {
                    chunk_idx: 0,
                    offset: 0,
                    length: 0,
                });
            }
            symbols[symbol.0 as usize] = string_ref;
        }

        self.stats.interned_count.fetch_add(1, Ordering::Relaxed);
        symbol
    }

    /// Resolve a symbol to its string (the chunk read is lock-free; the symbol
    /// lookup takes a brief read lock on the symbol table)
    pub fn resolve(&self, symbol: Symbol) -> Option<&str> {
        if symbol == Symbol::NULL {
            return None;
        }

        self.stats.resolve_count.fetch_add(1, Ordering::Relaxed);

        // Read from symbols vector (only takes read lock briefly)
        let string_ref = {
            let symbols = self.symbols.read().unwrap();
            symbols.get(symbol.0 as usize).copied()
        }?;

        // Storage read is completely lock-free
        self.storage.read(string_ref)
    }

    /// Get the number of interned strings
    pub fn len(&self) -> usize {
        self.next_symbol.load(Ordering::Relaxed) as usize
    }

    /// Check if empty
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Get statistics
    pub fn stats(&self) -> &InternerStats {
        &self.stats
    }

    /// Get the approximate memory reserved for string storage
    /// (chunk count * base chunk size; oversized chunks are counted at the base size)
    pub fn storage_bytes(&self) -> usize {
        self.storage.chunk_count.load(Ordering::Relaxed) as usize * self.storage.chunk_size
    }
}

// SAFETY: LockFreeInterner is safe to share across threads because:
// 1. All hash map access is protected by ParkingLotRwLock (sharded)
// 2. ChunkedStorage uses atomic operations for all shared mutable state
// 3. Chunk buffers are pre-allocated fixed-size (no Vec metadata races)
// 4. Memory ordering is correct (Release on write, Acquire on read)
unsafe impl Send for LockFreeInterner {}
unsafe impl Sync for LockFreeInterner {}

// SAFETY: ChunkedStorage is safe to share across threads because:
// 1. write_pos is advanced with an atomic compare-exchange loop that reserves
//    disjoint, in-bounds byte ranges
// 2. Chunk buffers are fixed-size pre-allocations (immutable capacity)
// 3. Chunk chain traversal uses AtomicPtr with proper ordering
// 4. Each writer has exclusive access to its reserved byte range
unsafe impl Send for ChunkedStorage {}
unsafe impl Sync for ChunkedStorage {}

// SAFETY: Chunk contains a raw pointer but is designed for concurrent access:
// 1. data pointer is stable (never moved after allocation)
// 2. capacity is immutable after construction
// 3. write_pos is atomic
// 4. Each thread writes only to atomically-reserved disjoint ranges
unsafe impl Send for Chunk {}
unsafe impl Sync for Chunk {}

#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::Arc;
    use std::thread;

    #[test]
    fn test_basic_intern() {
        let interner = LockFreeInterner::new();

        let s1 = interner.get_or_intern("hello");
        let s2 = interner.get_or_intern("world");
        let s3 = interner.get_or_intern("hello");

        // Same string should give same symbol
        assert_eq!(s1, s3);
        // Different strings should give different symbols
        assert_ne!(s1, s2);

        assert_eq!(interner.len(), 2);
    }

    #[test]
    fn test_resolve() {
        let interner = LockFreeInterner::new();

        let s1 = interner.get_or_intern("hello");
        let s2 = interner.get_or_intern("world");

        assert_eq!(interner.resolve(s1), Some("hello"));
        assert_eq!(interner.resolve(s2), Some("world"));
        assert_eq!(interner.resolve(Symbol::NULL), None);
    }

    #[test]
    fn test_concurrent_intern() {
        let interner = Arc::new(LockFreeInterner::new());
        let mut handles = vec![];

        // 8 threads each intern the same 100 strings
        for _ in 0..8 {
            let interner = interner.clone();
            handles.push(thread::spawn(move || {
                let mut symbols = Vec::new();
                for i in 0..100 {
                    let s = format!("string_{}", i);
                    symbols.push(interner.get_or_intern(&s));
                }
                symbols
            }));
        }

        let results: Vec<_> = handles.into_iter().map(|h| h.join().unwrap()).collect();

        // All threads should get the same symbols for the same strings
        for i in 0..100 {
            let first = results[0][i];
            for result in &results[1..] {
                assert_eq!(result[i], first, "Symbol mismatch for string_{}", i);
            }
        }

        // Should have exactly 100 unique strings
        assert_eq!(interner.len(), 100);
    }

    #[test]
    fn test_concurrent_resolve() {
        let interner = Arc::new(LockFreeInterner::new());

        // Intern some strings first
        let symbols: Vec<_> = (0..100)
            .map(|i| interner.get_or_intern(&format!("string_{}", i)))
            .collect();

        let mut handles = vec![];

        // 8 threads each resolve all strings many times
        for _ in 0..8 {
            let interner = interner.clone();
            let symbols = symbols.clone();
            handles.push(thread::spawn(move || {
                for _ in 0..100 {
                    for (i, &symbol) in symbols.iter().enumerate() {
                        let resolved = interner.resolve(symbol);
                        assert_eq!(resolved, Some(format!("string_{}", i)).as_deref());
                    }
                }
            }));
        }

        for handle in handles {
            handle.join().unwrap();
        }

        // Should have many resolve operations
        assert!(interner.stats().resolve_count.load(Ordering::Relaxed) > 0);
    }

    #[test]
    fn test_chunk_growth() {
        // Small chunk size to force growth
        let interner = LockFreeInterner::with_chunk_size(1024);

        // Intern enough strings to fill multiple chunks
        for i in 0..1000 {
            let s = format!("this_is_a_longer_string_number_{:05}", i);
            interner.get_or_intern(&s);
        }

        // Should have grown to multiple chunks
        assert!(interner.storage.chunk_count.load(Ordering::Relaxed) > 1);

        // All strings should still be resolvable
        for i in 0..1000 {
            let s = format!("this_is_a_longer_string_number_{:05}", i);
            let symbol = interner.get_or_intern(&s);
            assert_eq!(interner.resolve(symbol), Some(s.as_str()));
        }
    }
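
    // A regression-test sketch for strings longer than the configured chunk
    // size; it assumes `grow_chunk` sizes new chunks to at least the length of
    // the pending string (see `ChunkedStorage::append` above).
    #[test]
    fn test_string_larger_than_chunk() {
        let interner = LockFreeInterner::with_chunk_size(64);

        // A 1KB string cannot fit in a 64-byte base chunk, so an oversized
        // chunk must be allocated for it.
        let big: String = (0..1024).map(|_| 'y').collect();
        let symbol = interner.get_or_intern(&big);
        assert_eq!(interner.resolve(symbol), Some(big.as_str()));

        // Smaller strings still intern and resolve normally afterwards.
        let small = interner.get_or_intern("tiny");
        assert_eq!(interner.resolve(small), Some("tiny"));
    }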

    #[test]
    fn test_symbol_comparison() {
        let interner = LockFreeInterner::new();

        let s1 = interner.get_or_intern("hello");
        let s2 = interner.get_or_intern("world");
        let s3 = interner.get_or_intern("hello");

        // Symbol comparison is O(1) - just comparing u32
        assert_eq!(s1, s3);
        assert_ne!(s1, s2);

        // Can use as hash map key
        let mut map = HashMap::new();
        map.insert(s1, "value1");
        map.insert(s2, "value2");

        assert_eq!(map.get(&s3), Some(&"value1"));
    }

    #[test]
    fn test_stats() {
        let interner = LockFreeInterner::new();

        // Intern some strings
        interner.get_or_intern("hello");
        interner.get_or_intern("world");
        interner.get_or_intern("hello"); // Hit

        let stats = interner.stats();
        assert_eq!(stats.interned_count.load(Ordering::Relaxed), 2);
        assert!(stats.lookup_hits.load(Ordering::Relaxed) >= 1);
        assert!(stats.lookup_misses.load(Ordering::Relaxed) >= 2);
    }

    #[test]
    fn test_concurrent_intern_and_resolve() {
        let interner = Arc::new(LockFreeInterner::new());
        let mut handles = vec![];

        // Writers: intern new strings
        for writer_id in 0..4 {
            let interner = interner.clone();
            handles.push(thread::spawn(move || {
                for i in 0..100 {
                    let s = format!("writer_{}_string_{}", writer_id, i);
                    interner.get_or_intern(&s);
                }
            }));
        }

        // Readers: continuously resolve existing strings
        for reader_id in 0..4 {
            let interner = interner.clone();
            handles.push(thread::spawn(move || {
                for i in 0..1000 {
                    // Intern and immediately resolve
                    let s = format!("common_string_{}", (i + reader_id) % 10);
                    let symbol = interner.get_or_intern(&s);
                    let resolved = interner.resolve(symbol);
                    assert!(resolved.is_some());
                }
            }));
        }

        for handle in handles {
            handle.join().unwrap();
        }
    }

    #[test]
    fn test_empty_string() {
        let interner = LockFreeInterner::new();

        let s = interner.get_or_intern("");
        assert_eq!(interner.resolve(s), Some(""));
    }

    #[test]
    fn test_long_strings() {
        let interner = LockFreeInterner::new();

        let long_string: String = (0..10000).map(|_| 'x').collect();
        let symbol = interner.get_or_intern(&long_string);
        assert_eq!(interner.resolve(symbol), Some(long_string.as_str()));
    }

    #[test]
    fn test_unicode_strings() {
        let interner = LockFreeInterner::new();

        let s1 = interner.get_or_intern("hello 世界 🌍");
        let s2 = interner.get_or_intern("émojis: 🎉🎊🎁");

        assert_eq!(interner.resolve(s1), Some("hello 世界 🌍"));
        assert_eq!(interner.resolve(s2), Some("émojis: 🎉🎊🎁"));
    }

    #[test]
    fn test_shard_distribution() {
        let interner = LockFreeInterner::new();

        // Intern many strings
        for i in 0..10000 {
            interner.get_or_intern(&format!("key_{}", i));
        }

        // Check that strings are distributed across shards
        let mut non_empty = 0;
        for shard in &interner.shards {
            if !shard.read().is_empty() {
                non_empty += 1;
            }
        }

        // With 10000 strings across 256 shards, should use many shards
        assert!(
            non_empty > 200,
            "Expected better shard distribution: {} non-empty",
            non_empty
        );
    }
}