use rustc_hash::FxHashMap;
use std::{fmt, sync::Arc};
#[derive(Copy, Clone, Debug, Eq, PartialEq, Hash, Ord, PartialOrd)]
pub struct StringId(u32);
impl StringId {
pub const MAX: u32 = u32::MAX - 1;
pub const INVALID: StringId = StringId(u32::MAX);
pub fn as_u32(self) -> u32 {
self.0
}
pub fn from_raw(raw: u32) -> Self {
StringId(raw)
}
pub fn is_valid(self) -> bool {
self.0 != u32::MAX
}
}
impl fmt::Display for StringId {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "StringId({})", self.0)
}
}
#[derive(Debug)]
pub struct StringInterner {
strings: Vec<Arc<str>>,
lookup: FxHashMap<Arc<str>, StringId>,
}
impl StringInterner {
pub fn new() -> Self {
Self {
strings: Vec::new(),
lookup: FxHashMap::default(),
}
}
pub fn with_capacity(cap: usize) -> Self {
Self {
strings: Vec::with_capacity(cap),
lookup: FxHashMap::with_capacity_and_hasher(cap, Default::default()),
}
}
pub fn intern(&mut self, s: &str) -> StringId {
if let Some(&id) = self.lookup.get(s) {
return id;
}
let index = self.strings.len() as u32;
if index > StringId::MAX {
panic!("String interner overflow: too many strings");
}
let id = StringId(index);
let boxed: Arc<str> = s.into();
self.lookup.insert(boxed.clone(), id);
self.strings.push(boxed);
id
}
#[inline]
pub fn get(&self, id: StringId) -> Option<&str> {
if id.is_valid() {
self.strings.get(id.0 as usize).map(|s| &**s)
} else {
None
}
}
#[inline]
pub fn resolve(&self, id: StringId) -> &str {
self.get(id)
.unwrap_or_else(|| panic!("Invalid string ID: {id:?}"))
}
pub fn contains(&self, s: &str) -> bool {
self.lookup.contains_key(s)
}
pub fn get_id(&self, s: &str) -> Option<StringId> {
self.lookup.get(s).copied()
}
pub fn len(&self) -> usize {
self.strings.len()
}
pub fn is_empty(&self) -> bool {
self.strings.is_empty()
}
pub fn memory_usage(&self) -> usize {
let vec_overhead = self.strings.capacity() * std::mem::size_of::<Box<str>>();
let string_data: usize = self
.strings
.iter()
.map(|s| s.len() + std::mem::size_of::<Box<str>>())
.sum();
let map_overhead = self.lookup.capacity()
* (std::mem::size_of::<Box<str>>() + std::mem::size_of::<StringId>());
vec_overhead + string_data + map_overhead
}
pub fn clear(&mut self) {
self.strings.clear();
self.lookup.clear();
}
pub fn iter(&self) -> impl Iterator<Item = (StringId, &str)> + '_ {
self.strings
.iter()
.enumerate()
.map(|(i, s)| (StringId(i as u32), &**s))
}
pub fn stats(&self) -> InternerStats {
let total_bytes: usize = self.strings.iter().map(|s| s.len()).sum();
let unique_count = self.strings.len();
InternerStats {
unique_count,
total_bytes,
average_length: if unique_count > 0 {
total_bytes as f64 / unique_count as f64
} else {
0.0
},
}
}
}
impl Default for StringInterner {
fn default() -> Self {
Self::new()
}
}
#[derive(Debug, Clone)]
pub struct InternerStats {
pub unique_count: usize,
pub total_bytes: usize,
pub average_length: f64,
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_string_interning() {
let mut interner = StringInterner::new();
let s1 = interner.intern("Hello");
let s2 = interner.intern("Hello");
let s3 = interner.intern("World");
assert_eq!(s1, s2);
assert_ne!(s1, s3);
assert_eq!(interner.get(s1), Some("Hello"));
assert_eq!(interner.get(s3), Some("World"));
}
#[test]
fn test_string_memory_efficiency() {
let mut interner = StringInterner::new();
let refs: Vec<_> = (0..1000).map(|_| interner.intern("repeated")).collect();
assert_eq!(interner.len(), 1);
for r in &refs[1..] {
assert_eq!(*r, refs[0]);
}
assert!(interner.memory_usage() < 1000);
}
#[test]
fn test_string_interner_capacity() {
let mut interner = StringInterner::with_capacity(100);
for i in 0..1000 {
let s = format!("string_{i}");
let id = interner.intern(&s);
assert_eq!(interner.get(id), Some(s.as_str()));
}
assert_eq!(interner.len(), 1000);
}
#[test]
fn test_string_interner_contains() {
let mut interner = StringInterner::new();
interner.intern("exists");
assert!(interner.contains("exists"));
assert!(!interner.contains("not_exists"));
}
#[test]
fn test_string_interner_get_id() {
let mut interner = StringInterner::new();
let id = interner.intern("test");
assert_eq!(interner.get_id("test"), Some(id));
assert_eq!(interner.get_id("not_interned"), None);
}
#[test]
fn test_string_interner_clear() {
let mut interner = StringInterner::new();
interner.intern("one");
interner.intern("two");
interner.intern("three");
assert_eq!(interner.len(), 3);
interner.clear();
assert_eq!(interner.len(), 0);
assert!(interner.is_empty());
}
#[test]
fn test_string_interner_iter() {
let mut interner = StringInterner::new();
let id1 = interner.intern("first");
let id2 = interner.intern("second");
let id3 = interner.intern("third");
let items: Vec<_> = interner.iter().collect();
assert_eq!(items.len(), 3);
assert_eq!(items[0], (id1, "first"));
assert_eq!(items[1], (id2, "second"));
assert_eq!(items[2], (id3, "third"));
}
#[test]
fn test_string_interner_stats() {
let mut interner = StringInterner::new();
interner.intern("short");
interner.intern("medium_length");
interner.intern("this_is_a_longer_string");
let stats = interner.stats();
assert_eq!(stats.unique_count, 3);
assert_eq!(stats.total_bytes, 5 + 13 + 23);
assert!((stats.average_length - 13.67).abs() < 0.01);
}
#[test]
fn test_invalid_string_id() {
let interner = StringInterner::new();
let invalid = StringId::INVALID;
assert_eq!(invalid.0, u32::MAX);
assert!(!StringId::INVALID.is_valid());
assert_eq!(interner.get(StringId::INVALID), None);
}
#[test]
#[should_panic(expected = "Invalid string ID")]
fn test_resolve_invalid_id() {
let interner = StringInterner::new();
interner.resolve(StringId::INVALID);
}
#[test]
fn test_string_id_ordering() {
let mut interner = StringInterner::new();
let id1 = interner.intern("a");
let id2 = interner.intern("b");
let id3 = interner.intern("c");
assert!(id1 < id2);
assert!(id2 < id3);
assert!(id1 < id3);
}
#[test]
fn test_empty_string() {
let mut interner = StringInterner::new();
let id = interner.intern("");
assert_eq!(interner.get(id), Some(""));
let id2 = interner.intern("");
assert_eq!(id, id2);
}
#[test]
fn test_unicode_strings() {
let mut interner = StringInterner::new();
let id1 = interner.intern("Hello δΈη");
let id2 = interner.intern("π¦ Rust");
let id3 = interner.intern("Hello δΈη");
assert_eq!(id1, id3);
assert_ne!(id1, id2);
assert_eq!(interner.get(id1), Some("Hello δΈη"));
assert_eq!(interner.get(id2), Some("π¦ Rust"));
}
}