use bumpalo::collections::Vec as BumpVec;
use super::arena::Arena;
const FX_PRIME: u64 = 0x517c_c1b7_2722_0a95;
const INTERN_LENGTH_LIMIT: usize = 64;
const INITIAL_CAPACITY: usize = 256;
#[derive(Debug)]
pub struct Interner<'a> {
arena: &'a Arena,
table: BumpVec<'a, Option<&'a str>>,
mask: usize,
occupied: usize,
last: Option<&'a str>,
pub stats: InternStats,
}
#[derive(Debug, Clone, Copy, Default)]
pub struct InternStats {
pub calls: u64,
pub cache_hits: u64,
pub table_hits: u64,
pub allocs: u64,
pub long_bypass: u64,
pub resizes: u64,
pub probe_steps: u64,
}
impl<'a> Interner<'a> {
#[must_use]
pub fn new_in(arena: &'a Arena) -> Self {
Self::with_capacity_in(INITIAL_CAPACITY, arena)
}
#[must_use]
pub fn with_capacity_in(capacity_hint: usize, arena: &'a Arena) -> Self {
let cap = capacity_hint.next_power_of_two().max(8);
let mut table = BumpVec::with_capacity_in(cap, arena.bump());
table.resize(cap, None);
Self {
arena,
table,
mask: cap - 1,
occupied: 0,
last: None,
stats: InternStats::default(),
}
}
pub fn intern(&mut self, s: &str) -> &'a str {
self.stats.calls += 1;
if let Some(cached) = self.last
&& cached == s
{
self.stats.cache_hits += 1;
return cached;
}
let bytes = s.as_bytes();
if bytes.len() > INTERN_LENGTH_LIMIT {
self.stats.long_bypass += 1;
self.stats.allocs += 1;
let dst = self.arena.alloc_str(s);
self.last = Some(dst);
return dst;
}
let hash = fx_hash(bytes);
if self.occupied.saturating_mul(8) >= self.table.len().saturating_mul(7) {
self.grow();
}
#[allow(
clippy::cast_possible_truncation,
reason = "low bits of u64 hash extracted as usize on purpose"
)]
let mut idx = (hash as usize) & self.mask;
loop {
self.stats.probe_steps += 1;
match self.table[idx] {
Some(existing) if existing == s => {
self.stats.table_hits += 1;
self.last = Some(existing);
return existing;
}
None => {
let dst = self.arena.alloc_str(s);
self.table[idx] = Some(dst);
self.occupied += 1;
self.stats.allocs += 1;
self.last = Some(dst);
return dst;
}
Some(_) => idx = (idx + 1) & self.mask,
}
}
}
#[must_use]
pub fn unique_strings(&self) -> usize {
self.occupied
}
#[must_use]
pub fn capacity(&self) -> usize {
self.table.len()
}
#[must_use]
pub fn avg_probe_length(&self) -> f64 {
let probed = self.stats.calls.saturating_sub(self.stats.cache_hits);
if probed == 0 {
0.0
} else {
#[allow(
clippy::cast_precision_loss,
reason = "probe count fits in f64 mantissa for any plausible workload"
)]
let avg = self.stats.probe_steps as f64 / probed as f64;
avg
}
}
fn grow(&mut self) {
let new_cap = self.table.len().saturating_mul(2);
let new_mask = new_cap - 1;
let mut new_table: BumpVec<'a, Option<&'a str>> =
BumpVec::with_capacity_in(new_cap, self.arena.bump());
new_table.resize(new_cap, None);
for s in self.table.iter().copied().flatten() {
let h = fx_hash(s.as_bytes());
#[allow(
clippy::cast_possible_truncation,
reason = "low bits of u64 hash extracted as usize on purpose"
)]
let mut idx = (h as usize) & new_mask;
while new_table[idx].is_some() {
idx = (idx + 1) & new_mask;
}
new_table[idx] = Some(s);
}
self.table = new_table;
self.mask = new_mask;
self.stats.resizes += 1;
}
}
#[inline]
fn fx_hash(bytes: &[u8]) -> u64 {
let mut h: u64 = 0;
for &b in bytes {
h = h.rotate_left(5) ^ u64::from(b);
h = h.wrapping_mul(FX_PRIME);
}
h
}
#[cfg(test)]
mod tests {
use core::ptr;
use super::*;
fn arena() -> Arena {
Arena::new()
}
#[test]
fn empty_interner_has_no_unique_strings() {
let a = arena();
let i = Interner::new_in(&a);
assert_eq!(i.unique_strings(), 0);
assert_eq!(i.capacity(), INITIAL_CAPACITY);
}
#[test]
fn intern_returns_stable_pointer_for_byte_equal_input() {
let a = arena();
let mut i = Interner::new_in(&a);
let s1 = i.intern("hello");
let s2 = i.intern("hello");
assert!(
ptr::eq(s1.as_ptr(), s2.as_ptr()),
"byte-equal intern must return same arena pointer"
);
assert_eq!(i.unique_strings(), 1);
}
#[test]
fn intern_returns_distinct_pointers_for_distinct_inputs() {
let a = arena();
let mut i = Interner::new_in(&a);
let s1 = i.intern("hello");
let s2 = i.intern("world");
assert!(!ptr::eq(s1.as_ptr(), s2.as_ptr()));
assert_eq!(i.unique_strings(), 2);
}
#[test]
fn intern_handles_empty_string() {
let a = arena();
let mut i = Interner::new_in(&a);
let s1 = i.intern("");
let s2 = i.intern("");
assert_eq!(s1, "");
assert!(ptr::eq(s1.as_ptr(), s2.as_ptr()));
}
#[test]
fn inline_cache_serves_repeated_calls_without_probe() {
let a = arena();
let mut i = Interner::new_in(&a);
for _ in 0..100 {
i.intern("repeated");
}
assert_eq!(i.unique_strings(), 1);
assert!(i.stats.cache_hits >= 99);
}
#[test]
fn long_strings_bypass_table_but_still_alloc() {
let a = arena();
let mut i = Interner::new_in(&a);
let long: String = "x".repeat(128);
let s = i.intern(&long);
assert_eq!(s.len(), 128);
let _ = i.intern(&long);
assert_eq!(
i.stats.long_bypass + i.stats.cache_hits,
2,
"every call must be served by either the cache or the bypass path"
);
assert_eq!(i.stats.long_bypass, 1, "only the first long call bypasses");
assert_eq!(i.stats.cache_hits, 1, "second long call hits the cache");
assert_eq!(i.unique_strings(), 0);
let other: String = "y".repeat(128);
let _ = i.intern(&other);
assert_eq!(i.stats.long_bypass, 2);
}
#[test]
fn many_unique_strings_trigger_resize() {
let a = arena();
let mut i = Interner::with_capacity_in(8, &a);
for k in 0..100 {
let s = format!("unique-string-{k}");
i.intern(&s);
}
assert_eq!(i.unique_strings(), 100);
assert!(i.capacity() >= 128);
assert!(i.stats.resizes >= 4); }
#[test]
fn average_probe_length_stays_low_at_typical_load() {
let a = arena();
let mut i = Interner::new_in(&a);
for k in 0..100 {
let s = format!("k{k}");
i.intern(&s);
}
assert!(
i.avg_probe_length() < 2.0,
"avg probe {} too high — hash function may be degenerate",
i.avg_probe_length()
);
}
#[test]
fn aozora_corpus_short_readings_dedup_aggressively() {
let a = arena();
let mut i = Interner::new_in(&a);
let readings = ["の", "に", "を", "で", "が"];
for _ in 0..200 {
for r in readings {
i.intern(r);
}
}
assert_eq!(i.unique_strings(), 5);
let reuses = i.stats.cache_hits + i.stats.table_hits;
assert!(
reuses >= 995,
"expected >=995 reuses, got {reuses} (calls={})",
i.stats.calls
);
}
#[test]
fn intern_preserves_utf8_bytes_exactly() {
let a = arena();
let mut i = Interner::new_in(&a);
let inputs = ["青梅", "おうめ", "明治の頃", "※[#ほげ]", "🍣"];
for s in inputs {
assert_eq!(i.intern(s), s);
}
assert_eq!(i.unique_strings(), inputs.len());
}
}