use parking_lot::RwLock as ParkingLotRwLock;
use std::collections::HashMap;
use std::hash::{Hash, Hasher};
use std::sync::RwLock;
use std::sync::atomic::{AtomicPtr, AtomicU32, AtomicUsize, Ordering, fence};
const SHARD_COUNT: usize = 256;
const DEFAULT_CHUNK_SIZE: usize = 1024 * 1024;
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
pub struct Symbol(pub u32);
impl Symbol {
pub const NULL: Symbol = Symbol(u32::MAX);
pub fn index(self) -> u32 {
self.0
}
}
#[derive(Clone, Copy)]
struct StringRef {
chunk_idx: u32,
offset: u32,
length: u32,
}
struct Chunk {
data: *mut u8,
capacity: usize,
write_pos: AtomicUsize,
next: AtomicPtr<Chunk>,
}
impl Chunk {
fn new(capacity: usize) -> Box<Self> {
let buffer: Vec<u8> = vec![0u8; capacity];
let ptr = Box::into_raw(buffer.into_boxed_slice()) as *mut u8;
Box::new(Self {
data: ptr,
capacity,
write_pos: AtomicUsize::new(0),
next: AtomicPtr::new(std::ptr::null_mut()),
})
}
fn try_append(&self, s: &str) -> Option<u32> {
let bytes = s.as_bytes();
let len = bytes.len();
if len == 0 {
return Some(0);
}
let offset = self.write_pos.fetch_add(len, Ordering::Relaxed);
if offset + len > self.capacity {
self.write_pos.fetch_sub(len, Ordering::Relaxed);
return None;
}
unsafe {
std::ptr::copy_nonoverlapping(bytes.as_ptr(), self.data.add(offset), len);
}
fence(Ordering::Release);
Some(offset as u32)
}
fn read(&self, offset: u32, len: u32) -> &str {
fence(Ordering::Acquire);
unsafe {
let bytes = std::slice::from_raw_parts(self.data.add(offset as usize), len as usize);
std::str::from_utf8_unchecked(bytes)
}
}
}
impl Drop for Chunk {
fn drop(&mut self) {
unsafe {
let slice = std::slice::from_raw_parts_mut(self.data, self.capacity);
drop(Box::from_raw(slice as *mut [u8]));
}
}
}
pub struct ChunkedStorage {
head: AtomicPtr<Chunk>,
current: AtomicPtr<Chunk>,
chunk_size: usize,
chunk_count: AtomicU32,
}
impl ChunkedStorage {
fn new(chunk_size: usize) -> Self {
let initial = Box::into_raw(Chunk::new(chunk_size));
Self {
head: AtomicPtr::new(initial),
current: AtomicPtr::new(initial),
chunk_size,
chunk_count: AtomicU32::new(1),
}
}
fn append(&self, s: &str) -> StringRef {
loop {
let current = self.current.load(Ordering::Acquire);
let chunk = unsafe { &*current };
if let Some(offset) = chunk.try_append(s) {
let chunk_idx = self.get_chunk_index(current);
return StringRef {
chunk_idx,
offset,
length: s.len() as u32,
};
}
self.grow_chunk(current);
}
}
fn grow_chunk(&self, expected_current: *mut Chunk) {
let new_chunk = Box::into_raw(Chunk::new(self.chunk_size));
let current = unsafe { &*expected_current };
if current
.next
.compare_exchange(
std::ptr::null_mut(),
new_chunk,
Ordering::AcqRel,
Ordering::Relaxed,
)
.is_ok()
{
self.current.store(new_chunk, Ordering::Release);
self.chunk_count.fetch_add(1, Ordering::Relaxed);
} else {
unsafe {
drop(Box::from_raw(new_chunk));
}
let actual_next = current.next.load(Ordering::Acquire);
if !actual_next.is_null() {
self.current.store(actual_next, Ordering::Release);
}
}
}
fn get_chunk(&self, idx: u32) -> Option<&Chunk> {
let mut current = self.head.load(Ordering::Acquire);
let mut i = 0u32;
while !current.is_null() {
if i == idx {
return Some(unsafe { &*current });
}
current = unsafe { (*current).next.load(Ordering::Acquire) };
i += 1;
}
None
}
fn get_chunk_index(&self, ptr: *mut Chunk) -> u32 {
let mut current = self.head.load(Ordering::Acquire);
let mut i = 0u32;
while !current.is_null() {
if current == ptr {
return i;
}
current = unsafe { (*current).next.load(Ordering::Acquire) };
i += 1;
}
0 }
fn read(&self, string_ref: StringRef) -> Option<&str> {
let chunk = self.get_chunk(string_ref.chunk_idx)?;
Some(chunk.read(string_ref.offset, string_ref.length))
}
}
impl Drop for ChunkedStorage {
fn drop(&mut self) {
let mut current = self.head.load(Ordering::Relaxed);
while !current.is_null() {
let chunk = unsafe { Box::from_raw(current) };
current = chunk.next.load(Ordering::Relaxed);
}
}
}
fn hash_string(s: &str) -> u64 {
let mut hasher = std::collections::hash_map::DefaultHasher::new();
s.hash(&mut hasher);
hasher.finish()
}
pub struct LockFreeInterner {
shards: [ParkingLotRwLock<HashMap<u64, (Symbol, StringRef)>>; SHARD_COUNT],
storage: ChunkedStorage,
next_symbol: AtomicU32,
symbols: RwLock<Vec<StringRef>>,
stats: InternerStats,
}
#[derive(Default)]
pub struct InternerStats {
pub interned_count: AtomicU32,
pub lookup_hits: AtomicU32,
pub lookup_misses: AtomicU32,
pub resolve_count: AtomicU32,
}
impl Default for LockFreeInterner {
fn default() -> Self {
Self::new()
}
}
impl LockFreeInterner {
pub fn new() -> Self {
Self::with_chunk_size(DEFAULT_CHUNK_SIZE)
}
pub fn with_chunk_size(chunk_size: usize) -> Self {
Self {
shards: std::array::from_fn(|_| ParkingLotRwLock::new(HashMap::new())),
storage: ChunkedStorage::new(chunk_size),
next_symbol: AtomicU32::new(0),
symbols: RwLock::new(Vec::new()),
stats: InternerStats::default(),
}
}
#[inline]
fn shard_index(&self, hash: u64) -> usize {
(hash as usize) % SHARD_COUNT
}
pub fn get_or_intern(&self, s: &str) -> Symbol {
let hash = hash_string(s);
let shard_idx = self.shard_index(hash);
{
let shard = self.shards[shard_idx].read();
if let Some(&(symbol, _)) = shard.get(&hash) {
self.stats.lookup_hits.fetch_add(1, Ordering::Relaxed);
return symbol;
}
}
self.stats.lookup_misses.fetch_add(1, Ordering::Relaxed);
let mut shard = self.shards[shard_idx].write();
if let Some(&(symbol, _)) = shard.get(&hash) {
return symbol;
}
let symbol = Symbol(self.next_symbol.fetch_add(1, Ordering::SeqCst));
let string_ref = self.storage.append(s);
shard.insert(hash, (symbol, string_ref));
{
let mut symbols = self.symbols.write().unwrap();
while symbols.len() <= symbol.0 as usize {
symbols.push(StringRef {
chunk_idx: 0,
offset: 0,
length: 0,
});
}
symbols[symbol.0 as usize] = string_ref;
}
self.stats.interned_count.fetch_add(1, Ordering::Relaxed);
symbol
}
pub fn resolve(&self, symbol: Symbol) -> Option<&str> {
if symbol == Symbol::NULL {
return None;
}
self.stats.resolve_count.fetch_add(1, Ordering::Relaxed);
let string_ref = {
let symbols = self.symbols.read().unwrap();
symbols.get(symbol.0 as usize).copied()
}?;
self.storage.read(string_ref)
}
pub fn len(&self) -> usize {
self.next_symbol.load(Ordering::Relaxed) as usize
}
pub fn is_empty(&self) -> bool {
self.len() == 0
}
pub fn stats(&self) -> &InternerStats {
&self.stats
}
pub fn storage_bytes(&self) -> usize {
self.storage.chunk_count.load(Ordering::Relaxed) as usize * self.storage.chunk_size
}
}
unsafe impl Send for LockFreeInterner {}
unsafe impl Sync for LockFreeInterner {}
unsafe impl Send for ChunkedStorage {}
unsafe impl Sync for ChunkedStorage {}
unsafe impl Send for Chunk {}
unsafe impl Sync for Chunk {}
#[cfg(test)]
mod tests {
use super::*;
use std::sync::Arc;
use std::thread;
#[test]
fn test_basic_intern() {
let interner = LockFreeInterner::new();
let s1 = interner.get_or_intern("hello");
let s2 = interner.get_or_intern("world");
let s3 = interner.get_or_intern("hello");
assert_eq!(s1, s3);
assert_ne!(s1, s2);
assert_eq!(interner.len(), 2);
}
#[test]
fn test_resolve() {
let interner = LockFreeInterner::new();
let s1 = interner.get_or_intern("hello");
let s2 = interner.get_or_intern("world");
assert_eq!(interner.resolve(s1), Some("hello"));
assert_eq!(interner.resolve(s2), Some("world"));
assert_eq!(interner.resolve(Symbol::NULL), None);
}
#[test]
fn test_concurrent_intern() {
let interner = Arc::new(LockFreeInterner::new());
let mut handles = vec![];
for _ in 0..8 {
let interner = interner.clone();
handles.push(thread::spawn(move || {
let mut symbols = Vec::new();
for i in 0..100 {
let s = format!("string_{}", i);
symbols.push(interner.get_or_intern(&s));
}
symbols
}));
}
let results: Vec<_> = handles.into_iter().map(|h| h.join().unwrap()).collect();
for i in 0..100 {
let first = results[0][i];
for result in &results[1..] {
assert_eq!(result[i], first, "Symbol mismatch for string_{}", i);
}
}
assert_eq!(interner.len(), 100);
}
#[test]
fn test_concurrent_resolve() {
let interner = Arc::new(LockFreeInterner::new());
let symbols: Vec<_> = (0..100)
.map(|i| interner.get_or_intern(&format!("string_{}", i)))
.collect();
let mut handles = vec![];
for _ in 0..8 {
let interner = interner.clone();
let symbols = symbols.clone();
handles.push(thread::spawn(move || {
for _ in 0..100 {
for (i, &symbol) in symbols.iter().enumerate() {
let resolved = interner.resolve(symbol);
assert_eq!(resolved, Some(format!("string_{}", i)).as_deref());
}
}
}));
}
for handle in handles {
handle.join().unwrap();
}
assert!(interner.stats().resolve_count.load(Ordering::Relaxed) > 0);
}
#[test]
fn test_chunk_growth() {
let interner = LockFreeInterner::with_chunk_size(1024);
for i in 0..1000 {
let s = format!("this_is_a_longer_string_number_{:05}", i);
interner.get_or_intern(&s);
}
assert!(interner.storage.chunk_count.load(Ordering::Relaxed) > 1);
for i in 0..1000 {
let s = format!("this_is_a_longer_string_number_{:05}", i);
let symbol = interner.get_or_intern(&s);
assert_eq!(interner.resolve(symbol), Some(s.as_str()));
}
}
#[test]
fn test_symbol_comparison() {
let interner = LockFreeInterner::new();
let s1 = interner.get_or_intern("hello");
let s2 = interner.get_or_intern("world");
let s3 = interner.get_or_intern("hello");
assert_eq!(s1, s3);
assert_ne!(s1, s2);
let mut map = HashMap::new();
map.insert(s1, "value1");
map.insert(s2, "value2");
assert_eq!(map.get(&s3), Some(&"value1"));
}
#[test]
fn test_stats() {
let interner = LockFreeInterner::new();
interner.get_or_intern("hello");
interner.get_or_intern("world");
interner.get_or_intern("hello");
let stats = interner.stats();
assert_eq!(stats.interned_count.load(Ordering::Relaxed), 2);
assert!(stats.lookup_hits.load(Ordering::Relaxed) >= 1);
assert!(stats.lookup_misses.load(Ordering::Relaxed) >= 2);
}
#[test]
fn test_concurrent_intern_and_resolve() {
let interner = Arc::new(LockFreeInterner::new());
let mut handles = vec![];
for writer_id in 0..4 {
let interner = interner.clone();
handles.push(thread::spawn(move || {
for i in 0..100 {
let s = format!("writer_{}_string_{}", writer_id, i);
interner.get_or_intern(&s);
}
}));
}
for reader_id in 0..4 {
let interner = interner.clone();
handles.push(thread::spawn(move || {
for i in 0..1000 {
let s = format!("common_string_{}", (i + reader_id) % 10);
let symbol = interner.get_or_intern(&s);
let resolved = interner.resolve(symbol);
assert!(resolved.is_some());
}
}));
}
for handle in handles {
handle.join().unwrap();
}
}
#[test]
fn test_empty_string() {
let interner = LockFreeInterner::new();
let s = interner.get_or_intern("");
assert_eq!(interner.resolve(s), Some(""));
}
#[test]
fn test_long_strings() {
let interner = LockFreeInterner::new();
let long_string: String = (0..10000).map(|_| 'x').collect();
let symbol = interner.get_or_intern(&long_string);
assert_eq!(interner.resolve(symbol), Some(long_string.as_str()));
}
#[test]
fn test_unicode_strings() {
let interner = LockFreeInterner::new();
let s1 = interner.get_or_intern("hello 世界 🌍");
let s2 = interner.get_or_intern("émojis: 🎉🎊🎁");
assert_eq!(interner.resolve(s1), Some("hello 世界 🌍"));
assert_eq!(interner.resolve(s2), Some("émojis: 🎉🎊🎁"));
}
#[test]
fn test_shard_distribution() {
let interner = LockFreeInterner::new();
for i in 0..10000 {
interner.get_or_intern(&format!("key_{}", i));
}
let mut non_empty = 0;
for shard in &interner.shards {
if !shard.read().is_empty() {
non_empty += 1;
}
}
assert!(
non_empty > 200,
"Expected better shard distribution: {} non-empty",
non_empty
);
}
}