use bytes::Bytes;
use crate::graph::rdf::Term;
use crate::index::ring::triple_ring::TermDictionary;
const MAGIC: &[u8; 4] = b"PDCT";
const VERSION: u8 = 1;
const HEADER_SIZE: usize = 16;
#[derive(Debug, Clone)]
pub struct PackedTermDictionary {
string_table: Bytes,
offsets: Bytes,
sorted_ids: Bytes,
count: usize,
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum PackedDictError {
TruncatedHeader,
BadMagic,
UnsupportedVersion(u8),
InconsistentSizes {
expected_total: usize,
actual_total: usize,
},
InvalidOffsets,
InvalidSortedId(u32),
}
impl std::fmt::Display for PackedDictError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::TruncatedHeader => write!(f, "packed dict header truncated"),
Self::BadMagic => write!(f, "packed dict bad magic (expected 'PDCT')"),
Self::UnsupportedVersion(v) => write!(f, "packed dict unsupported version {v}"),
Self::InconsistentSizes {
expected_total,
actual_total,
} => write!(
f,
"packed dict size mismatch: expected {expected_total} bytes, got {actual_total}"
),
Self::InvalidOffsets => write!(
f,
"packed dict offsets invalid (non-monotonic or sentinel mismatch)"
),
Self::InvalidSortedId(id) => write!(f, "packed dict sorted-id {id} out of range"),
}
}
}
impl std::error::Error for PackedDictError {}
impl PackedTermDictionary {
#[must_use]
pub fn len(&self) -> usize {
self.count
}
#[must_use]
pub fn is_empty(&self) -> bool {
self.count == 0
}
#[must_use]
pub fn get_term_str(&self, id: u32) -> Option<&str> {
let i = id as usize;
if i >= self.count {
return None;
}
let start = usize::try_from(read_u64_at(&self.offsets, i)?).ok()?;
let end = usize::try_from(read_u64_at(&self.offsets, i + 1)?).ok()?;
let bytes = self.string_table.get(start..end)?;
std::str::from_utf8(bytes).ok()
}
#[must_use]
pub fn get_term(&self, id: u32) -> Option<Term> {
let s = self.get_term_str(id)?;
Term::from_ntriples(s)
}
#[must_use]
pub fn get_id_by_str(&self, s: &str) -> Option<u32> {
if self.count == 0 {
return None;
}
let target = s.as_bytes();
let mut lo = 0usize;
let mut hi = self.count;
while lo < hi {
let mid = lo + (hi - lo) / 2;
let id = read_u32_at(&self.sorted_ids, mid)?;
let candidate = self.term_bytes_for(id)?;
match candidate.cmp(target) {
std::cmp::Ordering::Less => lo = mid + 1,
std::cmp::Ordering::Greater => hi = mid,
std::cmp::Ordering::Equal => return Some(id),
}
}
None
}
#[must_use]
pub fn get_id(&self, term: &Term) -> Option<u32> {
let s = term.to_string();
self.get_id_by_str(&s)
}
#[must_use]
pub fn from_term_dict(dict: &TermDictionary) -> Self {
let count = dict.len();
assert!(
u32::try_from(count).is_ok(),
"PackedTermDictionary supports up to u32::MAX terms; got {count}"
);
let mut strings: Vec<String> = Vec::with_capacity(count);
let mut total_bytes = 0usize;
for id in 0..count {
let term = dict
.get_term(u32::try_from(id).expect("count <= u32::MAX checked above"))
.expect("id < len");
let s = term.to_string();
total_bytes += s.len();
strings.push(s);
}
let mut string_table_buf = Vec::with_capacity(total_bytes);
let mut offsets_buf = Vec::with_capacity((count + 1) * 8);
let mut current = 0u64;
for s in &strings {
offsets_buf.extend_from_slice(¤t.to_le_bytes());
string_table_buf.extend_from_slice(s.as_bytes());
current = current.saturating_add(s.len() as u64);
}
offsets_buf.extend_from_slice(¤t.to_le_bytes());
let mut sorted: Vec<u32> = (0..u32::try_from(count).expect("count fits u32")).collect();
sorted.sort_unstable_by(|&a, &b| {
strings[a as usize]
.as_bytes()
.cmp(strings[b as usize].as_bytes())
});
let mut sorted_buf = Vec::with_capacity(count * 4);
for id in &sorted {
sorted_buf.extend_from_slice(&id.to_le_bytes());
}
Self {
string_table: Bytes::from(string_table_buf),
offsets: Bytes::from(offsets_buf),
sorted_ids: Bytes::from(sorted_buf),
count,
}
}
#[must_use]
pub fn to_bytes(&self) -> Vec<u8> {
let total =
HEADER_SIZE + 8 + self.string_table.len() + self.offsets.len() + self.sorted_ids.len();
let mut buf = Vec::with_capacity(total);
buf.extend_from_slice(MAGIC);
buf.push(VERSION);
buf.extend_from_slice(&[0u8; 3]); buf.extend_from_slice(&(self.count as u64).to_le_bytes());
buf.extend_from_slice(&(self.string_table.len() as u64).to_le_bytes());
buf.extend_from_slice(&self.string_table);
buf.extend_from_slice(&self.offsets);
buf.extend_from_slice(&self.sorted_ids);
buf
}
fn read_count_from_header(data: &[u8]) -> Result<usize, PackedDictError> {
let raw = u64::from_le_bytes(data[8..16].try_into().expect("16-byte slice"));
usize::try_from(raw).map_err(|_| PackedDictError::InconsistentSizes {
expected_total: usize::MAX,
actual_total: data.len(),
})
}
pub fn from_bytes(data: Bytes) -> Result<Self, PackedDictError> {
if data.len() < HEADER_SIZE {
return Err(PackedDictError::TruncatedHeader);
}
if &data[0..4] != MAGIC {
return Err(PackedDictError::BadMagic);
}
let version = data[4];
if version != VERSION {
return Err(PackedDictError::UnsupportedVersion(version));
}
let count = Self::read_count_from_header(&data)?;
let mut cursor = HEADER_SIZE;
if cursor + 8 > data.len() {
return Err(PackedDictError::TruncatedHeader);
}
let st_size_raw =
u64::from_le_bytes(data[cursor..cursor + 8].try_into().expect("8-byte slice"));
let st_size =
usize::try_from(st_size_raw).map_err(|_| PackedDictError::InconsistentSizes {
expected_total: usize::MAX,
actual_total: data.len(),
})?;
cursor += 8;
let st_end = cursor
.checked_add(st_size)
.ok_or(PackedDictError::InconsistentSizes {
expected_total: 0,
actual_total: data.len(),
})?;
if st_end > data.len() {
return Err(PackedDictError::InconsistentSizes {
expected_total: st_end,
actual_total: data.len(),
});
}
let string_table = data.slice(cursor..st_end);
cursor = st_end;
let offsets_size = (count + 1) * 8;
let offsets_end =
cursor
.checked_add(offsets_size)
.ok_or(PackedDictError::InconsistentSizes {
expected_total: 0,
actual_total: data.len(),
})?;
if offsets_end > data.len() {
return Err(PackedDictError::InconsistentSizes {
expected_total: offsets_end,
actual_total: data.len(),
});
}
let offsets = data.slice(cursor..offsets_end);
cursor = offsets_end;
let sorted_size = count * 4;
let sorted_end =
cursor
.checked_add(sorted_size)
.ok_or(PackedDictError::InconsistentSizes {
expected_total: 0,
actual_total: data.len(),
})?;
if sorted_end > data.len() {
return Err(PackedDictError::InconsistentSizes {
expected_total: sorted_end,
actual_total: data.len(),
});
}
let sorted_ids = data.slice(cursor..sorted_end);
let dict = Self {
string_table,
offsets,
sorted_ids,
count,
};
dict.validate_offsets()?;
dict.validate_sorted_ids()?;
Ok(dict)
}
fn term_bytes_for(&self, id: u32) -> Option<&[u8]> {
let i = id as usize;
if i >= self.count {
return None;
}
let start = usize::try_from(read_u64_at(&self.offsets, i)?).ok()?;
let end = usize::try_from(read_u64_at(&self.offsets, i + 1)?).ok()?;
self.string_table.get(start..end)
}
fn validate_offsets(&self) -> Result<(), PackedDictError> {
let mut prev: u64 = 0;
for i in 0..=self.count {
let v = read_u64_at(&self.offsets, i).ok_or(PackedDictError::InvalidOffsets)?;
if v < prev {
return Err(PackedDictError::InvalidOffsets);
}
prev = v;
}
let prev_usize = usize::try_from(prev).map_err(|_| PackedDictError::InvalidOffsets)?;
if prev_usize != self.string_table.len() {
return Err(PackedDictError::InvalidOffsets);
}
Ok(())
}
fn validate_sorted_ids(&self) -> Result<(), PackedDictError> {
for i in 0..self.count {
let id = read_u32_at(&self.sorted_ids, i).ok_or(PackedDictError::InvalidOffsets)?;
if id as usize >= self.count {
return Err(PackedDictError::InvalidSortedId(id));
}
}
Ok(())
}
#[must_use]
pub fn approximate_bytes(&self) -> usize {
self.string_table.len() + self.offsets.len() + self.sorted_ids.len()
}
}
fn read_u64_at(bytes: &Bytes, idx: usize) -> Option<u64> {
let start = idx.checked_mul(8)?;
let end = start.checked_add(8)?;
let chunk: [u8; 8] = bytes.get(start..end)?.try_into().ok()?;
Some(u64::from_le_bytes(chunk))
}
fn read_u32_at(bytes: &Bytes, idx: usize) -> Option<u32> {
let start = idx.checked_mul(4)?;
let end = start.checked_add(4)?;
let chunk: [u8; 4] = bytes.get(start..end)?.try_into().ok()?;
Some(u32::from_le_bytes(chunk))
}
#[cfg(test)]
mod tests {
use super::*;
fn build_dict_with(terms: &[Term]) -> TermDictionary {
let mut dict = TermDictionary::new();
for t in terms {
dict.get_or_insert(t.clone());
}
dict
}
#[test]
fn alix_packed_dict_roundtrip_through_bytes() {
let terms = [
Term::iri("http://ex.org/alix"),
Term::iri("http://xmlns.com/foaf/0.1/name"),
Term::literal("Alix"),
Term::iri("http://ex.org/gus"),
Term::literal("Gus"),
];
let dict = build_dict_with(&terms);
let packed = PackedTermDictionary::from_term_dict(&dict);
let bytes = packed.to_bytes();
let restored = PackedTermDictionary::from_bytes(Bytes::from(bytes)).expect("from_bytes");
assert_eq!(restored.len(), terms.len());
for (i, term) in terms.iter().enumerate() {
let id = u32::try_from(i).expect("test fixture id fits u32");
let restored_term = restored.get_term(id).expect("get_term");
assert_eq!(&restored_term, term, "term for id {id}");
}
}
#[test]
fn gus_get_id_by_str_finds_via_binary_search() {
let terms: Vec<Term> = (0..32)
.map(|i| Term::iri(format!("http://ex.org/term-{i:03}")))
.collect();
let dict = build_dict_with(&terms);
let packed = PackedTermDictionary::from_term_dict(&dict);
for (i, term) in terms.iter().enumerate() {
let id = packed
.get_id(term)
.unwrap_or_else(|| panic!("id for {term}"));
let expected = u32::try_from(i).expect("test fixture id fits u32");
assert_eq!(id, expected, "mismatched id for {term}");
}
}
#[test]
fn vincent_get_id_returns_none_for_absent_term() {
let dict = build_dict_with(&[Term::iri("http://a"), Term::iri("http://b")]);
let packed = PackedTermDictionary::from_term_dict(&dict);
assert!(packed.get_id(&Term::iri("http://c")).is_none());
assert!(packed.get_id(&Term::literal("missing")).is_none());
}
#[test]
fn jules_empty_dict_round_trip() {
let dict = TermDictionary::new();
let packed = PackedTermDictionary::from_term_dict(&dict);
assert!(packed.is_empty());
let bytes = packed.to_bytes();
let restored = PackedTermDictionary::from_bytes(Bytes::from(bytes)).expect("empty");
assert!(restored.is_empty());
assert!(restored.get_term(0).is_none());
assert!(restored.get_id_by_str("anything").is_none());
}
#[test]
fn mia_get_term_str_out_of_range_returns_none() {
let dict = build_dict_with(&[Term::iri("http://a")]);
let packed = PackedTermDictionary::from_term_dict(&dict);
assert!(packed.get_term_str(0).is_some());
assert!(packed.get_term_str(1).is_none());
assert!(packed.get_term_str(u32::MAX).is_none());
}
#[test]
fn shosanna_bad_magic_rejected() {
let bad = Bytes::from(vec![0u8; 32]);
let result = PackedTermDictionary::from_bytes(bad);
assert_eq!(result.unwrap_err(), PackedDictError::BadMagic);
}
#[test]
fn beatrix_truncated_header_rejected() {
let short = Bytes::from(vec![b'P', b'D', b'C', b'T']);
let result = PackedTermDictionary::from_bytes(short);
assert_eq!(result.unwrap_err(), PackedDictError::TruncatedHeader);
}
#[test]
fn hans_unsupported_version_rejected() {
let mut buf = Vec::with_capacity(HEADER_SIZE);
buf.extend_from_slice(MAGIC);
buf.push(99); buf.extend_from_slice(&[0u8; 3]);
buf.extend_from_slice(&0u64.to_le_bytes());
let result = PackedTermDictionary::from_bytes(Bytes::from(buf));
assert_eq!(result.unwrap_err(), PackedDictError::UnsupportedVersion(99));
}
#[test]
fn django_inconsistent_size_rejected() {
let mut buf = Vec::new();
buf.extend_from_slice(MAGIC);
buf.push(VERSION);
buf.extend_from_slice(&[0u8; 3]);
buf.extend_from_slice(&5u64.to_le_bytes());
buf.extend_from_slice(&100u64.to_le_bytes()); let result = PackedTermDictionary::from_bytes(Bytes::from(buf));
assert!(matches!(
result.unwrap_err(),
PackedDictError::InconsistentSizes { .. }
));
}
#[test]
fn tarantino_zero_copy_via_bytes_refcount() {
let dict = build_dict_with(&[
Term::iri("http://example.org/alix"),
Term::iri("http://example.org/gus"),
]);
let packed = PackedTermDictionary::from_term_dict(&dict);
let serialized = packed.to_bytes();
let source = Bytes::from(serialized);
let source_ptr = source.as_ptr();
let restored = PackedTermDictionary::from_bytes(source).expect("from_bytes");
let st_ptr = restored.string_table.as_ptr();
let offset = st_ptr as usize - source_ptr as usize;
assert!(
offset < (HEADER_SIZE + 8 + restored.string_table.len() + 256),
"string_table should be inside source allocation; offset={offset}"
);
}
}