use std::collections::HashMap;
use std::fmt;
use std::sync::{Arc, RwLock};
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct SymbolId(pub u32);
impl SymbolId {
pub const NULL: SymbolId = SymbolId(u32::MAX);
#[inline]
pub fn as_u32(self) -> u32 {
self.0
}
#[inline]
pub fn as_usize(self) -> usize {
self.0 as usize
}
#[inline]
pub fn is_null(self) -> bool {
self.0 == u32::MAX
}
}
impl fmt::Display for SymbolId {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "SymbolId({})", self.0)
}
}
pub struct StringInterner {
arena: String,
offsets: Vec<usize>,
lengths: Vec<usize>,
map: HashMap<StringKey, SymbolId>,
}
#[derive(Clone)]
struct StringKey {
ptr: *const u8,
len: usize,
}
unsafe impl Send for StringKey {}
unsafe impl Sync for StringKey {}
impl StringKey {
fn as_str(&self) -> &str {
unsafe {
let slice = std::slice::from_raw_parts(self.ptr, self.len);
std::str::from_utf8_unchecked(slice)
}
}
}
impl PartialEq for StringKey {
fn eq(&self, other: &Self) -> bool {
self.as_str() == other.as_str()
}
}
impl Eq for StringKey {}
impl std::hash::Hash for StringKey {
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
self.as_str().hash(state);
}
}
impl StringInterner {
pub fn new() -> Self {
StringInterner {
arena: String::with_capacity(4096),
offsets: Vec::new(),
lengths: Vec::new(),
map: HashMap::new(),
}
}
pub fn with_capacity(cap: usize, arena_bytes: usize) -> Self {
StringInterner {
arena: String::with_capacity(arena_bytes),
offsets: Vec::with_capacity(cap),
lengths: Vec::with_capacity(cap),
map: HashMap::with_capacity(cap),
}
}
pub fn intern(&mut self, s: &str) -> SymbolId {
let probe_key = StringKey {
ptr: s.as_ptr(),
len: s.len(),
};
if let Some(&id) = self.map.get(&probe_key) {
return id;
}
let start = self.arena.len();
self.arena.push_str(s);
let end = self.arena.len();
let arena_ptr = self.arena[start..end].as_ptr();
let arena_key = StringKey {
ptr: arena_ptr,
len: s.len(),
};
let raw_id = self.offsets.len();
assert!(
raw_id < u32::MAX as usize - 1,
"StringInterner: SymbolId overflow"
);
let id = SymbolId(raw_id as u32);
self.offsets.push(start);
self.lengths.push(s.len());
self.map.insert(arena_key, id);
id
}
pub fn lookup(&self, id: SymbolId) -> Option<&str> {
let idx = id.as_usize();
if idx >= self.offsets.len() {
return None;
}
let start = self.offsets[idx];
let len = self.lengths[idx];
Some(&self.arena[start..start + len])
}
pub fn contains(&self, s: &str) -> bool {
let probe = StringKey {
ptr: s.as_ptr(),
len: s.len(),
};
self.map.contains_key(&probe)
}
pub fn len(&self) -> usize {
self.offsets.len()
}
pub fn is_empty(&self) -> bool {
self.offsets.is_empty()
}
pub fn iter(&self) -> impl Iterator<Item = (SymbolId, &str)> {
self.offsets
.iter()
.zip(self.lengths.iter())
.enumerate()
.map(|(i, (&start, &len))| (SymbolId(i as u32), &self.arena[start..start + len]))
}
pub fn ids(&self) -> impl Iterator<Item = SymbolId> + '_ {
(0..self.offsets.len()).map(|i| SymbolId(i as u32))
}
pub fn get(&self, s: &str) -> Option<SymbolId> {
let probe = StringKey {
ptr: s.as_ptr(),
len: s.len(),
};
self.map.get(&probe).copied()
}
pub fn arena_bytes(&self) -> usize {
self.arena.len()
}
}
impl fmt::Debug for StringInterner {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("StringInterner")
.field("len", &self.len())
.field("arena_bytes", &self.arena.len())
.finish()
}
}
#[derive(Debug, Clone)]
pub struct SymbolMetadata {
pub interning_count: u32,
pub tag: u64,
}
impl SymbolMetadata {
fn new() -> Self {
SymbolMetadata {
interning_count: 1,
tag: 0,
}
}
}
pub struct SymbolTable {
interner: StringInterner,
metadata: Vec<SymbolMetadata>,
}
impl SymbolTable {
pub fn new() -> Self {
SymbolTable {
interner: StringInterner::new(),
metadata: Vec::new(),
}
}
pub fn intern(&mut self, s: &str) -> SymbolId {
if let Some(id) = self.interner.get(s) {
let idx = id.as_usize();
self.metadata[idx].interning_count =
self.metadata[idx].interning_count.saturating_add(1);
return id;
}
let id = self.interner.intern(s);
self.metadata.push(SymbolMetadata::new());
id
}
pub fn lookup(&self, id: SymbolId) -> Option<&str> {
self.interner.lookup(id)
}
pub fn resolve(&self, id: SymbolId) -> Option<&str> {
self.interner.lookup(id)
}
pub fn get(&self, s: &str) -> Option<SymbolId> {
self.interner.get(s)
}
pub fn contains(&self, s: &str) -> bool {
self.interner.contains(s)
}
pub fn len(&self) -> usize {
self.interner.len()
}
pub fn is_empty(&self) -> bool {
self.interner.is_empty()
}
pub fn metadata(&self, id: SymbolId) -> Option<&SymbolMetadata> {
self.metadata.get(id.as_usize())
}
pub fn metadata_mut(&mut self, id: SymbolId) -> Option<&mut SymbolMetadata> {
self.metadata.get_mut(id.as_usize())
}
pub fn set_tag(&mut self, id: SymbolId, tag: u64) -> bool {
match self.metadata.get_mut(id.as_usize()) {
Some(m) => {
m.tag = tag;
true
}
None => false,
}
}
pub fn get_tag(&self, id: SymbolId) -> Option<u64> {
self.metadata.get(id.as_usize()).map(|m| m.tag)
}
pub fn iter(&self) -> impl Iterator<Item = (SymbolId, &str, &SymbolMetadata)> {
self.interner
.iter()
.zip(self.metadata.iter())
.map(|((id, s), meta)| (id, s, meta))
}
pub fn interner(&self) -> &StringInterner {
&self.interner
}
}
impl fmt::Debug for SymbolTable {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("SymbolTable")
.field("len", &self.len())
.finish()
}
}
#[derive(Debug, Clone)]
pub struct InternerLockError(String);
impl fmt::Display for InternerLockError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "SharedInterner lock poisoned: {}", self.0)
}
}
impl std::error::Error for InternerLockError {}
#[derive(Clone)]
pub struct SharedInterner {
inner: Arc<RwLock<StringInterner>>,
}
impl SharedInterner {
pub fn new() -> Self {
SharedInterner {
inner: Arc::new(RwLock::new(StringInterner::new())),
}
}
pub fn with_capacity(cap: usize, arena_bytes: usize) -> Self {
SharedInterner {
inner: Arc::new(RwLock::new(StringInterner::with_capacity(
cap, arena_bytes,
))),
}
}
pub fn intern(&self, s: &str) -> Result<SymbolId, InternerLockError> {
let mut guard = self
.inner
.write()
.map_err(|e| InternerLockError(e.to_string()))?;
Ok(guard.intern(s))
}
pub fn lookup(&self, id: SymbolId) -> Result<Option<String>, InternerLockError> {
let guard = self
.inner
.read()
.map_err(|e| InternerLockError(e.to_string()))?;
Ok(guard.lookup(id).map(|s| s.to_owned()))
}
pub fn contains(&self, s: &str) -> Result<bool, InternerLockError> {
let guard = self
.inner
.read()
.map_err(|e| InternerLockError(e.to_string()))?;
Ok(guard.contains(s))
}
pub fn len(&self) -> Result<usize, InternerLockError> {
let guard = self
.inner
.read()
.map_err(|e| InternerLockError(e.to_string()))?;
Ok(guard.len())
}
pub fn is_empty(&self) -> Result<bool, InternerLockError> {
let guard = self
.inner
.read()
.map_err(|e| InternerLockError(e.to_string()))?;
Ok(guard.is_empty())
}
pub fn snapshot(&self) -> Result<Vec<(SymbolId, String)>, InternerLockError> {
let guard = self
.inner
.read()
.map_err(|e| InternerLockError(e.to_string()))?;
Ok(guard.iter().map(|(id, s)| (id, s.to_owned())).collect())
}
pub fn get(&self, s: &str) -> Result<Option<SymbolId>, InternerLockError> {
let guard = self
.inner
.read()
.map_err(|e| InternerLockError(e.to_string()))?;
Ok(guard.get(s))
}
}
impl fmt::Debug for SharedInterner {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let len = self.inner.read().map(|g| g.len()).unwrap_or(0);
f.debug_struct("SharedInterner").field("len", &len).finish()
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::thread;
#[test]
fn test_intern_basic() {
let mut interner = StringInterner::new();
let id_a = interner.intern("hello");
let id_b = interner.intern("world");
let id_c = interner.intern("hello");
assert_eq!(id_a, id_c, "same string should produce same ID");
assert_ne!(id_a, id_b, "different strings should produce different IDs");
assert_eq!(interner.len(), 2);
}
#[test]
fn test_lookup_roundtrip() {
let mut interner = StringInterner::new();
let strings = ["alpha", "beta", "gamma", "delta", "epsilon"];
let ids: Vec<_> = strings.iter().map(|s| interner.intern(s)).collect();
for (&id, &expected) in ids.iter().zip(strings.iter()) {
assert_eq!(interner.lookup(id), Some(expected));
}
}
#[test]
fn test_contains() {
let mut interner = StringInterner::new();
interner.intern("present");
assert!(interner.contains("present"));
assert!(!interner.contains("absent"));
}
#[test]
fn test_empty_string() {
let mut interner = StringInterner::new();
let id = interner.intern("");
assert_eq!(interner.lookup(id), Some(""));
assert_eq!(interner.len(), 1);
}
#[test]
fn test_get() {
let mut interner = StringInterner::new();
let id = interner.intern("key");
assert_eq!(interner.get("key"), Some(id));
assert_eq!(interner.get("missing"), None);
}
#[test]
fn test_iter() {
let mut interner = StringInterner::new();
interner.intern("a");
interner.intern("b");
interner.intern("c");
let collected: Vec<_> = interner.iter().collect();
assert_eq!(collected.len(), 3);
assert_eq!(collected[0].1, "a");
assert_eq!(collected[1].1, "b");
assert_eq!(collected[2].1, "c");
}
#[test]
fn test_symbol_table_metadata() {
let mut table = SymbolTable::new();
let id = table.intern("symbol");
table.intern("symbol"); table.intern("symbol");
let meta = table.metadata(id).expect("metadata should exist");
assert_eq!(meta.interning_count, 3);
table.set_tag(id, 0xDEAD_BEEF);
assert_eq!(table.get_tag(id), Some(0xDEAD_BEEF));
}
#[test]
fn test_symbol_table_resolve() {
let mut table = SymbolTable::new();
let id = table.intern("hello");
assert_eq!(table.resolve(id), Some("hello"));
assert_eq!(table.lookup(id), Some("hello"));
}
#[test]
fn test_symbol_table_contains() {
let mut table = SymbolTable::new();
table.intern("yes");
assert!(table.contains("yes"));
assert!(!table.contains("no"));
}
#[test]
fn test_shared_interner_basic() {
let interner = SharedInterner::new();
let id_a = interner.intern("x").expect("lock ok");
let id_b = interner.intern("x").expect("lock ok");
assert_eq!(id_a, id_b);
assert_eq!(
interner.lookup(id_a).expect("lock ok"),
Some("x".to_owned())
);
assert_eq!(interner.len().expect("lock ok"), 1);
}
#[test]
fn test_shared_interner_multithreaded() {
let interner = SharedInterner::new();
let handles: Vec<_> = (0..8)
.map(|i| {
let ic = interner.clone();
thread::spawn(move || {
let _id = ic.intern(&format!("thread-{}", i)).expect("lock ok");
ic.intern("shared").expect("lock ok")
})
})
.collect();
let shared_ids: Vec<_> = handles
.into_iter()
.map(|h| h.join().expect("thread panic"))
.collect();
let first = shared_ids[0];
for id in &shared_ids[1..] {
assert_eq!(*id, first, "shared string should map to the same SymbolId");
}
assert_eq!(interner.len().expect("lock ok"), 9);
}
#[test]
fn test_symbol_id_null() {
assert!(SymbolId::NULL.is_null());
assert!(!SymbolId(0).is_null());
}
#[test]
fn test_lookup_out_of_range() {
let interner = StringInterner::new();
assert_eq!(interner.lookup(SymbolId(0)), None);
assert_eq!(interner.lookup(SymbolId(999)), None);
}
#[test]
fn test_unicode_strings() {
let mut interner = StringInterner::new();
let id1 = interner.intern("日本語");
let id2 = interner.intern("한국어");
let id3 = interner.intern("日本語");
assert_eq!(id1, id3);
assert_ne!(id1, id2);
assert_eq!(interner.lookup(id1), Some("日本語"));
assert_eq!(interner.lookup(id2), Some("한국어"));
}
#[test]
fn test_large_number_of_symbols() {
let mut interner = StringInterner::new();
let n = 10_000usize;
for i in 0..n {
interner.intern(&format!("symbol_{}", i));
}
assert_eq!(interner.len(), n);
for i in 0..n {
interner.intern(&format!("symbol_{}", i));
}
assert_eq!(interner.len(), n);
}
}