use crate::error::StorageError;
use std::collections::HashMap;
use std::io::Write;
/// Bidirectional table that assigns a `u32` id to each distinct string.
///
/// The two maps mirror each other: `intern` inserts the same string/id pair
/// into both of them.
#[derive(Debug, Clone)]
pub struct StringInterner {
/// Maps an interned string to the id it was assigned.
forward: HashMap<String, u32>,
/// Maps an id back to the string it was assigned to.
reverse: HashMap<u32, String>,
/// Id that `intern` hands out to the next string it has not seen before.
next_id: u32,
}
impl StringInterner {
    /// Creates an empty interner; the first string interned receives id `0`.
    pub fn new() -> Self {
        StringInterner {
            forward: HashMap::new(),
            reverse: HashMap::new(),
            next_id: 0,
        }
    }

    /// Returns the id of `value`, assigning the next free id if the string has
    /// not been interned yet.
    ///
    /// # Panics
    ///
    /// Panics with `"string interner id overflow"` when no further id can be
    /// reserved. The interner is left unmodified in that case.
    pub fn intern(&mut self, value: &str) -> u32 {
        if let Some(&existing) = self.forward.get(value) {
            return existing;
        }
        let id = self.next_id;
        // Reserve the following id *before* touching the maps, so that an
        // overflow panic cannot leave a half-registered entry behind.
        let following = id.checked_add(1).expect("string interner id overflow");
        self.forward.insert(value.to_owned(), id);
        self.reverse.insert(id, value.to_owned());
        self.next_id = following;
        id
    }

    /// Returns the string registered under `id`, or `None` if the id is unknown.
    pub fn resolve(&self, id: u32) -> Option<&str> {
        self.reverse.get(&id).map(String::as_str)
    }

    /// Returns the id of `value` without interning it, or `None` if it is unknown.
    pub fn lookup(&self, value: &str) -> Option<u32> {
        self.forward.get(value).copied()
    }

    /// Number of distinct strings currently interned.
    pub fn len(&self) -> usize {
        self.forward.len()
    }

    /// Returns `true` when no string has been interned.
    pub fn is_empty(&self) -> bool {
        self.forward.is_empty()
    }

    /// Rebuilds an interner from the serialized string table stored in
    /// `mmap[offset..end]`.
    ///
    /// The table is a sequence of records, each laid out as
    /// `id: u32 (LE) | len: u32 (LE) | len bytes of UTF-8`, which is exactly
    /// what [`StringInterner::write_to_file`] emits. After loading, the next
    /// id handed out by `intern` is one past the largest id in the table.
    ///
    /// An empty or inverted range (`offset >= end`) yields an empty interner,
    /// provided `end` lies within `mmap`.
    ///
    /// # Errors
    ///
    /// Returns [`StorageError::CorruptedData`] when:
    /// * `end` lies beyond the end of `mmap`;
    /// * a record header or payload is truncated;
    /// * a payload is not valid UTF-8;
    /// * an id or a string occurs more than once (the two maps would no
    ///   longer mirror each other);
    /// * a record uses id `u32::MAX`, which leaves no id for the next `intern`.
    pub fn load_from_mmap(mmap: &[u8], offset: u64, end: u64) -> Result<Self, StorageError> {
        /// Bytes taken by the `id` and `len` fields preceding every payload.
        const HEADER_LEN: usize = 8;

        // Validate in the u64 domain first: once `end <= mmap.len()` holds,
        // the narrowing casts below cannot truncate on any target.
        if end > mmap.len() as u64 {
            return Err(StorageError::CorruptedData);
        }
        let mut interner = StringInterner::new();
        if offset >= end {
            return Ok(interner);
        }

        // Consume the table through a shrinking slice instead of a running
        // `pos + n` index, so no position arithmetic can overflow.
        let mut remaining = &mmap[offset as usize..end as usize];
        while !remaining.is_empty() {
            if remaining.len() < HEADER_LEN {
                return Err(StorageError::CorruptedData);
            }
            let (header, after_header) = remaining.split_at(HEADER_LEN);
            let id = u32::from_le_bytes([header[0], header[1], header[2], header[3]]);
            let len = u32::from_le_bytes([header[4], header[5], header[6], header[7]]);
            if u64::from(len) > after_header.len() as u64 {
                return Err(StorageError::CorruptedData);
            }
            // `len` is bounded by the slice length, so it fits in usize.
            let (payload, after_payload) = after_header.split_at(len as usize);
            let text = std::str::from_utf8(payload).map_err(|_| StorageError::CorruptedData)?;

            // `intern` can never produce these states, so they indicate a
            // damaged table rather than valid data.
            if interner.forward.contains_key(text) || interner.reverse.contains_key(&id) {
                return Err(StorageError::CorruptedData);
            }
            let following = id.checked_add(1).ok_or(StorageError::CorruptedData)?;

            interner.forward.insert(text.to_owned(), id);
            interner.reverse.insert(id, text.to_owned());
            if following > interner.next_id {
                interner.next_id = following;
            }
            remaining = after_payload;
        }
        Ok(interner)
    }

    /// Serializes every entry to `writer` in ascending id order and returns
    /// the number of bytes written.
    ///
    /// Each entry is written as `id: u32 (LE) | len: u32 (LE) | UTF-8 bytes`,
    /// the layout [`StringInterner::load_from_mmap`] reads back.
    ///
    /// # Errors
    ///
    /// Propagates any I/O error reported by `writer`. A string longer than
    /// `u32::MAX` bytes cannot be represented in the length field and is
    /// reported as an `InvalidInput` I/O error instead of being written with a
    /// truncated length.
    pub fn write_to_file<W: Write>(&self, writer: &mut W) -> Result<u64, StorageError> {
        let mut entries: Vec<(&u32, &String)> = self.reverse.iter().collect();
        // Ids are unique map keys, so an unstable sort gives the same order.
        entries.sort_unstable_by_key(|&(id, _)| *id);

        let mut total_bytes = 0u64;
        for (id, string) in entries {
            let bytes = string.as_bytes();
            if bytes.len() as u64 > u64::from(u32::MAX) {
                return Err(std::io::Error::new(
                    std::io::ErrorKind::InvalidInput,
                    "interned string is too long for a u32 length field",
                )
                .into());
            }
            let len = bytes.len() as u32;
            writer.write_all(&id.to_le_bytes())?;
            writer.write_all(&len.to_le_bytes())?;
            writer.write_all(bytes)?;
            total_bytes += 8 + u64::from(len);
        }
        Ok(total_bytes)
    }
}
impl Default for StringInterner {
fn default() -> Self {
Self::new()
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Serializes `interner` into a fresh in-memory buffer.
    fn serialize(interner: &StringInterner) -> Vec<u8> {
        let mut buffer = Vec::new();
        interner.write_to_file(&mut buffer).unwrap();
        buffer
    }

    /// Loads an interner from the whole of `buffer`.
    fn load_all(buffer: &[u8]) -> Result<StringInterner, StorageError> {
        StringInterner::load_from_mmap(buffer, 0, buffer.len() as u64)
    }

    #[test]
    fn interns_and_resolves_strings() {
        let mut interner = StringInterner::new();
        let first = interner.intern("person");
        let second = interner.intern("person");
        let third = interner.intern("knows");
        assert_eq!(first, second);
        assert_ne!(first, third);
        assert_eq!(interner.resolve(first), Some("person"));
        assert_eq!(interner.resolve(third), Some("knows"));
    }

    #[test]
    fn lookup_without_interning() {
        let mut interner = StringInterner::new();
        interner.intern("exists");
        assert_eq!(interner.lookup("exists"), Some(0));
        assert_eq!(interner.lookup("missing"), None);
    }

    #[test]
    fn len_and_is_empty() {
        let mut interner = StringInterner::new();
        assert!(interner.is_empty());
        assert_eq!(interner.len(), 0);
        interner.intern("one");
        interner.intern("two");
        interner.intern("one");
        assert!(!interner.is_empty());
        assert_eq!(interner.len(), 2);
    }

    #[test]
    fn test_write_and_load_empty_string_table() {
        let interner = StringInterner::new();
        let mut buffer = Vec::new();
        let bytes_written = interner.write_to_file(&mut buffer).unwrap();
        assert_eq!(bytes_written, 0);
        assert_eq!(buffer.len(), 0);
        let loaded = StringInterner::load_from_mmap(&buffer, 0, 0).unwrap();
        assert!(loaded.is_empty());
        assert_eq!(loaded.len(), 0);
    }

    #[test]
    fn test_write_and_load_single_string() {
        let mut interner = StringInterner::new();
        interner.intern("person");
        let mut buffer = Vec::new();
        let bytes_written = interner.write_to_file(&mut buffer).unwrap();
        // 4 bytes id + 4 bytes length + 6 bytes for "person".
        assert_eq!(bytes_written, 14);
        assert_eq!(buffer.len(), 14);
        let loaded = load_all(&buffer).unwrap();
        assert_eq!(loaded.len(), 1);
        assert_eq!(loaded.resolve(0), Some("person"));
        assert_eq!(loaded.lookup("person"), Some(0));
    }

    #[test]
    fn test_write_and_load_multiple_strings() {
        let mut interner = StringInterner::new();
        let id1 = interner.intern("person");
        let id2 = interner.intern("software");
        let id3 = interner.intern("knows");
        let loaded = load_all(&serialize(&interner)).unwrap();
        assert_eq!(loaded.len(), 3);
        assert_eq!(loaded.resolve(id1), Some("person"));
        assert_eq!(loaded.resolve(id2), Some("software"));
        assert_eq!(loaded.resolve(id3), Some("knows"));
        assert_eq!(loaded.lookup("person"), Some(id1));
        assert_eq!(loaded.lookup("software"), Some(id2));
        assert_eq!(loaded.lookup("knows"), Some(id3));
    }

    #[test]
    fn test_write_preserves_id_order() {
        let mut interner = StringInterner::new();
        // Deliberately not in alphabetical order: ids must follow insertion order.
        interner.intern("zzz");
        interner.intern("aaa");
        interner.intern("mmm");
        let loaded = load_all(&serialize(&interner)).unwrap();
        assert_eq!(loaded.resolve(0), Some("zzz"));
        assert_eq!(loaded.resolve(1), Some("aaa"));
        assert_eq!(loaded.resolve(2), Some("mmm"));
    }

    #[test]
    fn test_load_from_offset() {
        let mut interner = StringInterner::new();
        interner.intern("first");
        interner.intern("second");
        // 100 bytes of unrelated padding precede the string table.
        let mut buffer = vec![0u8; 100];
        interner.write_to_file(&mut buffer).unwrap();
        let offset = 100u64;
        let end = buffer.len() as u64;
        let loaded = StringInterner::load_from_mmap(&buffer, offset, end).unwrap();
        assert_eq!(loaded.len(), 2);
        assert_eq!(loaded.resolve(0), Some("first"));
        assert_eq!(loaded.resolve(1), Some("second"));
    }

    #[test]
    fn test_load_with_utf8_strings() {
        // Written as escapes so the literals survive any re-encoding of this
        // file: U+4E16 U+754C are three-byte sequences in UTF-8, U+1F980 is a
        // four-byte sequence.
        let cjk = "\u{4e16}\u{754c}";
        let emoji = "\u{1f980}";
        assert_eq!(cjk.len(), 6);
        assert_eq!(emoji.len(), 4);

        let mut interner = StringInterner::new();
        interner.intern("hello");
        interner.intern(cjk);
        interner.intern(emoji);
        let loaded = load_all(&serialize(&interner)).unwrap();
        assert_eq!(loaded.len(), 3);
        assert_eq!(loaded.resolve(0), Some("hello"));
        assert_eq!(loaded.resolve(1), Some(cjk));
        assert_eq!(loaded.resolve(2), Some(emoji));
    }

    #[test]
    fn test_load_corrupted_truncated_header() {
        let mut interner = StringInterner::new();
        interner.intern("test");
        let mut buffer = serialize(&interner);
        // Keep only the id field; the length field is missing.
        buffer.truncate(4);
        assert!(matches!(load_all(&buffer), Err(StorageError::CorruptedData)));
    }

    #[test]
    fn test_load_corrupted_truncated_string() {
        let mut interner = StringInterner::new();
        interner.intern("test");
        let mut buffer = serialize(&interner);
        // Keep the full header but none of the four payload bytes it announces.
        buffer.truncate(8);
        assert!(matches!(load_all(&buffer), Err(StorageError::CorruptedData)));
    }

    #[test]
    fn test_load_corrupted_invalid_utf8() {
        let mut buffer = Vec::new();
        buffer.extend_from_slice(&0u32.to_le_bytes());
        buffer.extend_from_slice(&2u32.to_le_bytes());
        // 0xFF never appears in well-formed UTF-8.
        buffer.push(0xFF);
        buffer.push(0xFF);
        assert!(matches!(load_all(&buffer), Err(StorageError::CorruptedData)));
    }

    #[test]
    fn test_load_corrupted_offset_beyond_end() {
        // Despite the name, this exercises an `end` that lies past the buffer.
        let buffer = vec![0u8; 100];
        let result = StringInterner::load_from_mmap(&buffer, 0, 200);
        assert!(matches!(result, Err(StorageError::CorruptedData)));
    }

    #[test]
    fn test_next_id_updated_after_load() {
        let mut interner = StringInterner::new();
        interner.intern("first");
        interner.intern("second");
        interner.intern("third");
        let mut loaded = load_all(&serialize(&interner)).unwrap();
        let new_id = loaded.intern("fourth");
        assert_eq!(new_id, 3);
    }

    #[test]
    fn test_deduplication_after_load() {
        let mut interner = StringInterner::new();
        interner.intern("person");
        let mut loaded = load_all(&serialize(&interner)).unwrap();
        let id = loaded.intern("person");
        assert_eq!(id, 0);
        assert_eq!(loaded.len(), 1);
    }
}