use std::collections::HashMap;
use std::path::Path;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::RwLock;
/// A compact handle for a string stored in an interner.
///
/// Wraps the `u32` slot index of the interned string;
/// `#[repr(transparent)]` guarantees the same layout as a bare `u32`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
#[repr(transparent)]
pub struct InternedId(u32);

impl InternedId {
    /// Returns the raw slot index backing this id.
    #[inline]
    pub fn as_u32(self) -> u32 {
        let InternedId(raw) = self;
        raw
    }

    /// Rebuilds an id from a raw index previously obtained from
    /// [`InternedId::as_u32`]. No validation is performed here; resolving
    /// an out-of-range id later simply yields `None` from the interner.
    #[inline]
    pub fn from_raw(id: u32) -> Self {
        Self(id)
    }
}
/// Aggregate counters describing an interner's contents and usage.
#[derive(Debug, Clone, Default)]
pub struct InternerStats {
    /// Number of distinct strings currently stored.
    pub unique_count: usize,
    /// Number of `intern` calls observed (hits and misses combined).
    pub total_intern_calls: usize,
    /// Rough heap-usage estimate in bytes.
    pub estimated_memory_bytes: usize,
}

impl InternerStats {
    /// Fraction of `intern` calls that were deduplicated hits, in `[0, 1]`.
    /// Returns `0.0` when no calls have been made (avoids dividing by zero),
    /// and saturates to `0.0` if `unique_count` somehow exceeds the call count.
    pub fn dedup_ratio(&self) -> f64 {
        match self.total_intern_calls {
            0 => 0.0,
            total => {
                let duplicate_hits = total.saturating_sub(self.unique_count);
                duplicate_hits as f64 / total as f64
            }
        }
    }
}
/// A single-threaded string interner: maps strings to dense `u32` ids
/// and back.
#[derive(Debug, Default)]
pub struct StringInterner {
    /// id -> string storage; the vector index is the `InternedId` value.
    strings: Vec<String>,
    /// string -> id reverse lookup. NOTE: keys duplicate the entries in
    /// `strings`, so each unique string is stored twice.
    lookup: HashMap<String, u32>,
    /// Number of `intern` calls observed (hits and misses).
    total_intern_calls: usize,
}

impl StringInterner {
    /// Creates an empty interner.
    pub fn new() -> Self {
        Self::default()
    }

    /// Creates an interner with pre-allocated room for `capacity` strings.
    pub fn with_capacity(capacity: usize) -> Self {
        Self {
            strings: Vec::with_capacity(capacity),
            lookup: HashMap::with_capacity(capacity),
            total_intern_calls: 0,
        }
    }

    /// Interns `s`, returning the existing id if `s` was seen before.
    ///
    /// # Panics
    /// Panics if more than `u32::MAX` distinct strings are interned.
    /// (The original `as u32` cast silently wrapped past that point,
    /// aliasing new strings onto old ids.)
    pub fn intern(&mut self, s: &str) -> InternedId {
        self.total_intern_calls += 1;
        // Hit path: no allocation.
        if let Some(&id) = self.lookup.get(s) {
            return InternedId(id);
        }
        let id = u32::try_from(self.strings.len())
            .expect("interner overflow: more than u32::MAX unique strings");
        self.strings.push(s.to_string());
        self.lookup.insert(s.to_string(), id);
        InternedId(id)
    }

    /// Resolves `id` back to its string, or `None` if out of range.
    #[inline]
    pub fn get(&self, id: InternedId) -> Option<&str> {
        self.strings.get(id.0 as usize).map(|s| s.as_str())
    }

    /// Alias for [`StringInterner::intern`], kept for API symmetry.
    #[inline]
    pub fn get_or_intern(&mut self, s: &str) -> InternedId {
        self.intern(s)
    }

    /// Number of distinct strings interned so far.
    #[inline]
    pub fn len(&self) -> usize {
        self.strings.len()
    }

    /// `true` when nothing has been interned.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.strings.is_empty()
    }

    /// Snapshots usage statistics.
    ///
    /// The memory figure is an estimate: string heap bytes plus `String`
    /// headers for `strings`, plus bucket storage for `lookup`.
    pub fn stats(&self) -> InternerStats {
        let strings_bytes = self
            .strings
            .iter()
            .map(|s| s.len() + std::mem::size_of::<String>())
            .sum::<usize>();
        // BUG FIX: the lookup map duplicates every string as a key, so
        // its key heap bytes must be counted too. The original estimate
        // only counted bucket capacity and under-reported memory by
        // roughly the total length of all interned strings.
        let lookup_bytes = self.lookup.keys().map(|k| k.len()).sum::<usize>()
            + self.lookup.capacity()
                * (std::mem::size_of::<String>() + std::mem::size_of::<u32>());
        InternerStats {
            unique_count: self.strings.len(),
            total_intern_calls: self.total_intern_calls,
            estimated_memory_bytes: strings_bytes + lookup_bytes,
        }
    }
}
/// An interner specialized for filesystem paths.
///
/// Paths are normalized to forward-slash separators before interning,
/// so the same path interns to the same id regardless of the platform
/// separator style it arrived with.
#[derive(Debug, Default)]
pub struct PathInterner {
    inner: StringInterner,
}

impl PathInterner {
    /// Creates an empty path interner.
    pub fn new() -> Self {
        Self::default()
    }

    /// Creates a path interner with pre-allocated room for `capacity` paths.
    pub fn with_capacity(capacity: usize) -> Self {
        let inner = StringInterner::with_capacity(capacity);
        Self { inner }
    }

    /// Normalizes `path`'s separators and interns the resulting string.
    pub fn intern_path(&mut self, path: &Path) -> InternedId {
        self.inner.intern(&normalize_path(path))
    }

    /// Resolves `id` to its normalized path text, or `None` if out of range.
    #[inline]
    pub fn get_path(&self, id: InternedId) -> Option<&str> {
        self.inner.get(id)
    }

    /// Number of distinct paths interned so far.
    #[inline]
    pub fn len(&self) -> usize {
        self.inner.len()
    }

    /// `true` when no paths have been interned.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.inner.is_empty()
    }

    /// Usage statistics of the underlying string interner.
    pub fn stats(&self) -> InternerStats {
        self.inner.stats()
    }
}
/// Converts `path` to a `String` (lossily, for non-UTF-8 paths), rewriting
/// every Windows-style `\` separator to `/` so equivalent paths compare —
/// and therefore intern — identically.
fn normalize_path(path: &Path) -> String {
    let lossy = path.to_string_lossy();
    lossy
        .chars()
        .map(|c| if c == '\\' { '/' } else { c })
        .collect()
}
/// A thread-safe string interner guarded by a pair of `RwLock`s.
///
/// Lock-ordering invariant: whenever both locks must be held, `lookup`
/// is acquired BEFORE `strings`. Every method below follows this order;
/// violating it can deadlock against a concurrent `intern`.
#[derive(Debug)]
pub struct ConcurrentInterner {
    strings: RwLock<Vec<String>>,
    lookup: RwLock<HashMap<String, u32>>,
    total_intern_calls: AtomicUsize,
}

impl ConcurrentInterner {
    /// Creates an empty interner.
    pub fn new() -> Self {
        Self {
            strings: RwLock::new(Vec::new()),
            lookup: RwLock::new(HashMap::new()),
            total_intern_calls: AtomicUsize::new(0),
        }
    }

    /// Creates an interner with pre-allocated room for `capacity` strings.
    pub fn with_capacity(capacity: usize) -> Self {
        Self {
            strings: RwLock::new(Vec::with_capacity(capacity)),
            lookup: RwLock::new(HashMap::with_capacity(capacity)),
            total_intern_calls: AtomicUsize::new(0),
        }
    }

    /// Interns `s`, returning the existing id if already present.
    ///
    /// Fast path takes only a read lock. On a miss, the lookup is
    /// re-checked under the write lock (double-checked locking) because
    /// another thread may have interned `s` between the read unlock and
    /// the write acquisition.
    ///
    /// # Panics
    /// Panics if a lock is poisoned, or if more than `u32::MAX` distinct
    /// strings are interned (the original `as u32` cast silently wrapped).
    pub fn intern(&self, s: &str) -> InternedId {
        self.total_intern_calls.fetch_add(1, Ordering::Relaxed);
        {
            let lookup = self.lookup.read().unwrap();
            if let Some(&id) = lookup.get(s) {
                return InternedId(id);
            }
        }
        // Slow path: lookup (write) then strings (write) — this is the
        // canonical lock order; see the invariant on the type.
        let mut lookup = self.lookup.write().unwrap();
        if let Some(&id) = lookup.get(s) {
            return InternedId(id);
        }
        let mut strings = self.strings.write().unwrap();
        let id = u32::try_from(strings.len())
            .expect("interner overflow: more than u32::MAX unique strings");
        strings.push(s.to_string());
        lookup.insert(s.to_string(), id);
        InternedId(id)
    }

    /// Returns an owned copy of the string for `id`, or `None` if out of
    /// range. (Owned because the borrow cannot outlive the lock guard.)
    pub fn get(&self, id: InternedId) -> Option<String> {
        let strings = self.strings.read().unwrap();
        strings.get(id.0 as usize).cloned()
    }

    /// Number of distinct strings interned so far.
    pub fn len(&self) -> usize {
        self.strings.read().unwrap().len()
    }

    /// `true` when nothing has been interned.
    pub fn is_empty(&self) -> bool {
        self.strings.read().unwrap().is_empty()
    }

    /// Snapshots usage statistics.
    ///
    /// BUG FIX: the locks are now taken in `lookup` -> `strings` order to
    /// match `intern`. The original took `strings` first, which could
    /// deadlock: an `intern` holding `lookup` (write) waits on `strings`
    /// (write), while a `stats` holding `strings` (read) waits on
    /// `lookup` (read) — blocked behind the pending writer.
    pub fn stats(&self) -> InternerStats {
        let lookup = self.lookup.read().unwrap();
        let strings = self.strings.read().unwrap();
        let strings_bytes = strings
            .iter()
            .map(|s| s.len() + std::mem::size_of::<String>())
            .sum::<usize>();
        // BUG FIX: count the lookup keys' heap bytes too — they duplicate
        // every interned string; the original estimate omitted them.
        let lookup_bytes = lookup.keys().map(|k| k.len()).sum::<usize>()
            + lookup.capacity()
                * (std::mem::size_of::<String>() + std::mem::size_of::<u32>());
        InternerStats {
            unique_count: strings.len(),
            total_intern_calls: self.total_intern_calls.load(Ordering::Relaxed),
            estimated_memory_bytes: strings_bytes + lookup_bytes,
        }
    }
}

impl Default for ConcurrentInterner {
    fn default() -> Self {
        Self::new()
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that normalizing `input` yields `expected`.
    fn check(input: &str, expected: &str) {
        assert_eq!(normalize_path(Path::new(input)), expected);
    }

    #[test]
    fn test_normalize_path_backslash() {
        check("src\\main\\lib.rs", "src/main/lib.rs");
    }

    #[test]
    fn test_normalize_path_already_normalized() {
        check("src/main/lib.rs", "src/main/lib.rs");
    }

    #[test]
    fn test_normalize_path_mixed() {
        check("src\\main/lib.rs", "src/main/lib.rs");
    }

    #[test]
    fn test_normalize_path_absolute_windows() {
        check("C:\\Users\\project\\main.rs", "C:/Users/project/main.rs");
    }
}