use crate::{StarError, StarResult, StarTerm, StarTriple};
use scirs2_core::profiling::Profiler;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::io::{Read, Write};
use tracing::info;
pub const HDTV2_MAGIC: [u8; 8] = *b"HDT*RD2\0";
pub const FC_BLOCK_SIZE: usize = 16;
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HdtV2Header {
pub version: u8,
pub triple_count: u64,
pub subject_count: u64,
pub predicate_count: u64,
pub object_count: u64,
pub quoted_count: u64,
pub compression_flags: u32,
}
impl HdtV2Header {
pub const FLAG_FRONT_CODING: u32 = 0x01;
pub const FLAG_XOR_DELTA: u32 = 0x02;
pub fn new() -> Self {
Self {
version: 2,
triple_count: 0,
subject_count: 0,
predicate_count: 0,
object_count: 0,
quoted_count: 0,
compression_flags: Self::FLAG_FRONT_CODING | Self::FLAG_XOR_DELTA,
}
}
pub fn has_front_coding(&self) -> bool {
self.compression_flags & Self::FLAG_FRONT_CODING != 0
}
pub fn has_xor_delta(&self) -> bool {
self.compression_flags & Self::FLAG_XOR_DELTA != 0
}
}
impl Default for HdtV2Header {
fn default() -> Self {
Self::new()
}
}
#[derive(Debug, Clone, Default)]
pub struct FrontCodedDictionary {
entries: Vec<String>,
str_to_id: HashMap<String, u32>,
blocks: Vec<FcBlock>,
encoded: bool,
}
#[derive(Debug, Clone, Default)]
struct FcBlock {
header: String,
suffixes: Vec<(usize, String)>,
}
impl FrontCodedDictionary {
pub fn new() -> Self {
Self::default()
}
pub fn insert(&mut self, s: &str) -> u32 {
if let Some(&id) = self.str_to_id.get(s) {
return id;
}
self.entries.push(s.to_string());
let id = self.entries.len() as u32; self.str_to_id.insert(s.to_string(), id);
self.encoded = false;
id
}
pub fn get_id(&self, s: &str) -> Option<u32> {
self.str_to_id.get(s).copied()
}
pub fn decode(&self, id: u32) -> Option<&str> {
if id == 0 || id as usize > self.entries.len() {
return None;
}
Some(&self.entries[(id - 1) as usize])
}
pub fn encode(&mut self) {
self.entries.sort_unstable();
self.str_to_id.clear();
for (i, s) in self.entries.iter().enumerate() {
self.str_to_id.insert(s.clone(), (i + 1) as u32);
}
self.blocks.clear();
let mut i = 0;
while i < self.entries.len() {
let block_start = i;
let header = self.entries[i].clone();
let mut suffixes = Vec::new();
i += 1;
while i < self.entries.len() && i < block_start + FC_BLOCK_SIZE {
let prev = &self.entries[i - 1];
let curr = &self.entries[i];
let common = common_prefix_len(prev, curr);
let suffix = curr[common..].to_string();
suffixes.push((common, suffix));
i += 1;
}
self.blocks.push(FcBlock { header, suffixes });
}
self.encoded = true;
}
pub fn len(&self) -> usize {
self.entries.len()
}
pub fn is_empty(&self) -> bool {
self.entries.is_empty()
}
pub fn write<W: Write>(&self, writer: &mut W) -> StarResult<()> {
write_u32(writer, self.entries.len() as u32)?;
for entry in &self.entries {
write_string(writer, entry)?;
}
Ok(())
}
pub fn read<R: Read>(reader: &mut R) -> StarResult<Self> {
let count = read_u32(reader)?;
let mut dict = FrontCodedDictionary::new();
for _ in 0..count {
let s = read_string(reader)?;
dict.insert(&s);
}
Ok(dict)
}
pub fn compressed_size_estimate(&self) -> usize {
if !self.encoded || self.blocks.is_empty() {
return self.entries.iter().map(|e| e.len() + 4).sum::<usize>();
}
let mut total = 0;
for block in &self.blocks {
total += block.header.len() + 4; for (_plen, suffix) in &block.suffixes {
total += 2 + suffix.len(); }
}
total
}
}
fn common_prefix_len(a: &str, b: &str) -> usize {
a.chars().zip(b.chars()).take_while(|(x, y)| x == y).count()
}
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct BitmapRow {
pub bits: Vec<u64>,
}
impl BitmapRow {
pub fn from_positions(positions: &[usize]) -> Self {
let max_pos = positions.iter().copied().max().unwrap_or(0);
let len = (max_pos / 64) + 1;
let mut bits = vec![0u64; len];
for &pos in positions {
bits[pos / 64] |= 1u64 << (pos % 64);
}
Self { bits }
}
pub fn xor(&self, other: &BitmapRow) -> BitmapRow {
let len = self.bits.len().max(other.bits.len());
let mut result = vec![0u64; len];
for (i, elem) in result.iter_mut().enumerate() {
let a = self.bits.get(i).copied().unwrap_or(0);
let b = other.bits.get(i).copied().unwrap_or(0);
*elem = a ^ b;
}
BitmapRow { bits: result }
}
pub fn apply_xor(&self, delta: &BitmapRow) -> BitmapRow {
self.xor(delta)
}
pub fn get(&self, pos: usize) -> bool {
let word = pos / 64;
let bit = pos % 64;
if word >= self.bits.len() {
return false;
}
self.bits[word] & (1u64 << bit) != 0
}
pub fn set(&mut self, pos: usize) {
let word = pos / 64;
let bit = pos % 64;
if word >= self.bits.len() {
self.bits.resize(word + 1, 0);
}
self.bits[word] |= 1u64 << bit;
}
pub fn popcount(&self) -> u64 {
self.bits.iter().map(|w| w.count_ones() as u64).sum()
}
pub fn write<W: Write>(&self, writer: &mut W) -> StarResult<()> {
write_u32(writer, self.bits.len() as u32)?;
for &word in &self.bits {
let bytes = word.to_le_bytes();
writer
.write_all(&bytes)
.map_err(|e| StarError::processing_error(format!("Write error: {e}")))?;
}
Ok(())
}
pub fn read<R: Read>(reader: &mut R) -> StarResult<Self> {
let len = read_u32(reader)? as usize;
let mut bits = vec![0u64; len];
for word in &mut bits {
let mut buf = [0u8; 8];
reader
.read_exact(&mut buf)
.map_err(|e| StarError::processing_error(format!("Read error: {e}")))?;
*word = u64::from_le_bytes(buf);
}
Ok(Self { bits })
}
}
#[derive(Debug, Default)]
pub struct XorDeltaBitmapIndex {
base: Option<BitmapRow>,
deltas: Vec<BitmapRow>,
row_count: usize,
}
impl XorDeltaBitmapIndex {
pub fn new() -> Self {
Self::default()
}
pub fn append(&mut self, row: BitmapRow) {
if self.base.is_none() {
self.base = Some(row);
} else {
let prev = self.current_row(self.row_count - 1);
let delta = prev.xor(&row);
self.deltas.push(delta);
}
self.row_count += 1;
}
pub fn get_row(&self, idx: usize) -> StarResult<BitmapRow> {
if idx >= self.row_count {
return Err(StarError::processing_error(format!(
"Row index {idx} out of range ({})",
self.row_count
)));
}
Ok(self.current_row(idx))
}
fn current_row(&self, idx: usize) -> BitmapRow {
let base = match &self.base {
Some(b) => b.clone(),
None => return BitmapRow::default(),
};
if idx == 0 {
return base;
}
let mut current = base;
for i in 0..idx {
if i < self.deltas.len() {
current = current.apply_xor(&self.deltas[i]);
}
}
current
}
pub fn row_count(&self) -> usize {
self.row_count
}
pub fn write<W: Write>(&self, writer: &mut W) -> StarResult<()> {
write_u32(writer, self.row_count as u32)?;
if let Some(base) = &self.base {
base.write(writer)?;
} else {
write_u32(writer, 0)?; }
write_u32(writer, self.deltas.len() as u32)?;
for delta in &self.deltas {
delta.write(writer)?;
}
Ok(())
}
pub fn read<R: Read>(reader: &mut R) -> StarResult<Self> {
let row_count = read_u32(reader)? as usize;
let base = if row_count > 0 {
Some(BitmapRow::read(reader)?)
} else {
let _len = read_u32(reader)?;
None
};
let delta_count = read_u32(reader)? as usize;
let mut deltas = Vec::with_capacity(delta_count);
for _ in 0..delta_count {
deltas.push(BitmapRow::read(reader)?);
}
Ok(Self {
base,
deltas,
row_count,
})
}
pub fn compression_ratio(&self) -> f64 {
let raw_words: u64 = self.base.as_ref().map(|b| b.bits.len() as u64).unwrap_or(0)
* (self.row_count as u64 + 1);
let compressed_words: u64 = self.base.as_ref().map(|b| b.bits.len() as u64).unwrap_or(0)
+ self.deltas.iter().map(|d| d.bits.len() as u64).sum::<u64>();
if compressed_words == 0 {
return 1.0;
}
raw_words as f64 / compressed_words as f64
}
}
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct EncodedQuotedTriple {
pub subject_id: u32,
pub predicate_id: u32,
pub object_id: u32,
pub quoted_subject_id: Option<u32>,
}
#[derive(Debug, Default)]
pub struct QuotedTripleDictionaryV2 {
entries: Vec<EncodedQuotedTriple>,
id_map: HashMap<EncodedQuotedTriple, u32>,
}
impl QuotedTripleDictionaryV2 {
pub fn new() -> Self {
Self::default()
}
pub fn insert(&mut self, enc: EncodedQuotedTriple) -> u32 {
if let Some(&id) = self.id_map.get(&enc) {
return id;
}
self.entries.push(enc.clone());
let id = self.entries.len() as u32;
self.id_map.insert(enc, id);
id
}
pub fn get_id(&self, enc: &EncodedQuotedTriple) -> Option<u32> {
self.id_map.get(enc).copied()
}
pub fn decode(&self, id: u32) -> Option<&EncodedQuotedTriple> {
if id == 0 || id as usize > self.entries.len() {
return None;
}
Some(&self.entries[(id - 1) as usize])
}
pub fn len(&self) -> usize {
self.entries.len()
}
pub fn is_empty(&self) -> bool {
self.entries.is_empty()
}
}
pub struct HdtStarV2Builder {
subject_dict: FrontCodedDictionary,
predicate_dict: FrontCodedDictionary,
object_dict: FrontCodedDictionary,
quoted_dict: QuotedTripleDictionaryV2,
encoded_triples: Vec<(u32, u32, u32)>,
#[allow(dead_code)]
profiler: Profiler,
}
impl HdtStarV2Builder {
pub fn new() -> Self {
Self {
subject_dict: FrontCodedDictionary::new(),
predicate_dict: FrontCodedDictionary::new(),
object_dict: FrontCodedDictionary::new(),
quoted_dict: QuotedTripleDictionaryV2::new(),
encoded_triples: Vec::new(),
profiler: Profiler::new(),
}
}
pub fn add_triple(&mut self, triple: &StarTriple) -> StarResult<()> {
let s_id = self.encode_term_as_subject(&triple.subject)?;
let p_id = self.encode_term_as_predicate(&triple.predicate)?;
let o_id = self.encode_term_as_object(&triple.object)?;
self.encoded_triples.push((s_id, p_id, o_id));
Ok(())
}
fn encode_term_as_subject(&mut self, term: &StarTerm) -> StarResult<u32> {
match term {
StarTerm::QuotedTriple(qt) => {
let s = self.encode_term_as_subject(&qt.subject)?;
let p = self.encode_term_as_predicate(&qt.predicate)?;
let o = self.encode_term_as_object(&qt.object)?;
let enc = EncodedQuotedTriple {
subject_id: s,
predicate_id: p,
object_id: o,
quoted_subject_id: None,
};
Ok(self.quoted_dict.insert(enc))
}
_ => {
let key = term_string(term);
Ok(self.subject_dict.insert(&key))
}
}
}
fn encode_term_as_predicate(&mut self, term: &StarTerm) -> StarResult<u32> {
let key = term_string(term);
Ok(self.predicate_dict.insert(&key))
}
fn encode_term_as_object(&mut self, term: &StarTerm) -> StarResult<u32> {
match term {
StarTerm::QuotedTriple(qt) => {
let s = self.encode_term_as_subject(&qt.subject)?;
let p = self.encode_term_as_predicate(&qt.predicate)?;
let o = self.encode_term_as_object(&qt.object)?;
let enc = EncodedQuotedTriple {
subject_id: s,
predicate_id: p,
object_id: o,
quoted_subject_id: None,
};
Ok(self.quoted_dict.insert(enc))
}
_ => {
let key = term_string(term);
Ok(self.object_dict.insert(&key))
}
}
}
pub fn statistics(&self) -> HdtV2Statistics {
HdtV2Statistics {
triple_count: self.encoded_triples.len(),
subject_count: self.subject_dict.len(),
predicate_count: self.predicate_dict.len(),
object_count: self.object_dict.len(),
quoted_count: self.quoted_dict.len(),
}
}
pub fn write<W: Write>(&mut self, writer: &mut W) -> StarResult<()> {
self.subject_dict.encode();
self.predicate_dict.encode();
self.object_dict.encode();
let spo_index = self.build_spo_index();
writer
.write_all(&HDTV2_MAGIC)
.map_err(|e| StarError::processing_error(format!("Write error: {e}")))?;
let header = HdtV2Header {
version: 2,
triple_count: self.encoded_triples.len() as u64,
subject_count: self.subject_dict.len() as u64,
predicate_count: self.predicate_dict.len() as u64,
object_count: self.object_dict.len() as u64,
quoted_count: self.quoted_dict.len() as u64,
compression_flags: HdtV2Header::FLAG_FRONT_CODING | HdtV2Header::FLAG_XOR_DELTA,
};
let header_bytes = serde_json::to_vec(&header)
.map_err(|e| StarError::processing_error(format!("Header serialization error: {e}")))?;
write_u32(writer, header_bytes.len() as u32)?;
writer
.write_all(&header_bytes)
.map_err(|e| StarError::processing_error(format!("Write error: {e}")))?;
self.subject_dict.write(writer)?;
self.predicate_dict.write(writer)?;
self.object_dict.write(writer)?;
write_u32(writer, self.quoted_dict.len() as u32)?;
for entry in &self.quoted_dict.entries {
write_u32(writer, entry.subject_id)?;
write_u32(writer, entry.predicate_id)?;
write_u32(writer, entry.object_id)?;
}
spo_index.write(writer)?;
info!(
"HDT-star v2 written: {} triples, {} subjects, {} predicates, {} objects, {} quoted",
header.triple_count,
header.subject_count,
header.predicate_count,
header.object_count,
header.quoted_count
);
Ok(())
}
fn build_spo_index(&self) -> XorDeltaBitmapIndex {
let mut index = XorDeltaBitmapIndex::new();
let mut by_subject: HashMap<u32, Vec<usize>> = HashMap::new();
for (i, &(s, _p, _o)) in self.encoded_triples.iter().enumerate() {
by_subject.entry(s).or_default().push(i);
}
let mut subjects: Vec<u32> = by_subject.keys().copied().collect();
subjects.sort_unstable();
for s in subjects {
let positions = &by_subject[&s];
let row = BitmapRow::from_positions(positions);
index.append(row);
}
index
}
}
impl Default for HdtStarV2Builder {
fn default() -> Self {
Self::new()
}
}
fn term_string(term: &StarTerm) -> String {
match term {
StarTerm::NamedNode(n) => format!("<{}>", n.iri),
StarTerm::BlankNode(b) => format!("_:{}", b.id),
StarTerm::Literal(l) => {
let lang = l.language.as_deref().unwrap_or("");
let dt = l
.datatype
.as_ref()
.map(|d| d.iri.as_str())
.unwrap_or("xsd:string");
format!("\"{}\"@{}^^{}", l.value, lang, dt)
}
StarTerm::QuotedTriple(qt) => format!(
"<<{}|{}|{}>>",
term_string(&qt.subject),
term_string(&qt.predicate),
term_string(&qt.object)
),
StarTerm::Variable(v) => format!("?{}", v.name),
}
}
pub struct HdtStarV2Reader {
pub header: HdtV2Header,
subject_dict: FrontCodedDictionary,
predicate_dict: FrontCodedDictionary,
object_dict: FrontCodedDictionary,
quoted_dict: QuotedTripleDictionaryV2,
spo_index: XorDeltaBitmapIndex,
}
impl HdtStarV2Reader {
pub fn read<R: Read>(reader: &mut R) -> StarResult<Self> {
let mut magic = [0u8; 8];
reader
.read_exact(&mut magic)
.map_err(|e| StarError::processing_error(format!("Read error: {e}")))?;
if magic != HDTV2_MAGIC {
return Err(StarError::processing_error(
"Invalid HDT-star v2 magic bytes",
));
}
let header_len = read_u32(reader)? as usize;
let mut header_bytes = vec![0u8; header_len];
reader
.read_exact(&mut header_bytes)
.map_err(|e| StarError::processing_error(format!("Read error: {e}")))?;
let header: HdtV2Header = serde_json::from_slice(&header_bytes).map_err(|e| {
StarError::processing_error(format!("Header deserialization error: {e}"))
})?;
let subject_dict = FrontCodedDictionary::read(reader)?;
let predicate_dict = FrontCodedDictionary::read(reader)?;
let object_dict = FrontCodedDictionary::read(reader)?;
let quoted_count = read_u32(reader)? as usize;
let mut quoted_dict = QuotedTripleDictionaryV2::new();
for _ in 0..quoted_count {
let s = read_u32(reader)?;
let p = read_u32(reader)?;
let o = read_u32(reader)?;
quoted_dict.insert(EncodedQuotedTriple {
subject_id: s,
predicate_id: p,
object_id: o,
quoted_subject_id: None,
});
}
let spo_index = XorDeltaBitmapIndex::read(reader)?;
Ok(Self {
header,
subject_dict,
predicate_dict,
object_dict,
quoted_dict,
spo_index,
})
}
pub fn statistics(&self) -> HdtV2Statistics {
HdtV2Statistics {
triple_count: self.header.triple_count as usize,
subject_count: self.subject_dict.len(),
predicate_count: self.predicate_dict.len(),
object_count: self.object_dict.len(),
quoted_count: self.quoted_dict.len(),
}
}
pub fn spo_row_count(&self) -> usize {
self.spo_index.row_count()
}
pub fn decode_subject(&self, id: u32) -> Option<&str> {
self.subject_dict.decode(id)
}
pub fn decode_predicate(&self, id: u32) -> Option<&str> {
self.predicate_dict.decode(id)
}
pub fn decode_object(&self, id: u32) -> Option<&str> {
self.object_dict.decode(id)
}
}
#[derive(Debug, Clone)]
pub struct HdtV2Statistics {
pub triple_count: usize,
pub subject_count: usize,
pub predicate_count: usize,
pub object_count: usize,
pub quoted_count: usize,
}
fn write_u32<W: Write>(writer: &mut W, v: u32) -> StarResult<()> {
writer
.write_all(&v.to_le_bytes())
.map_err(|e| StarError::processing_error(format!("Write error: {e}")))
}
fn read_u32<R: Read>(reader: &mut R) -> StarResult<u32> {
let mut buf = [0u8; 4];
reader
.read_exact(&mut buf)
.map_err(|e| StarError::processing_error(format!("Read error: {e}")))?;
Ok(u32::from_le_bytes(buf))
}
fn write_string<W: Write>(writer: &mut W, s: &str) -> StarResult<()> {
let bytes = s.as_bytes();
write_u32(writer, bytes.len() as u32)?;
writer
.write_all(bytes)
.map_err(|e| StarError::processing_error(format!("Write error: {e}")))
}
fn read_string<R: Read>(reader: &mut R) -> StarResult<String> {
let len = read_u32(reader)? as usize;
let mut buf = vec![0u8; len];
reader
.read_exact(&mut buf)
.map_err(|e| StarError::processing_error(format!("Read error: {e}")))?;
String::from_utf8(buf).map_err(|e| StarError::processing_error(format!("UTF-8 error: {e}")))
}
#[cfg(test)]
mod tests {
use super::*;
use crate::{StarTerm, StarTriple};
fn make_triple(s: &str, p: &str, o: &str) -> StarTriple {
StarTriple::new(
StarTerm::iri(s).unwrap(),
StarTerm::iri(p).unwrap(),
StarTerm::iri(o).unwrap(),
)
}
fn make_quoted_triple(
qs: &str,
qp: &str,
qo: &str,
meta_pred: &str,
meta_obj: &str,
) -> StarTriple {
let inner = make_triple(qs, qp, qo);
StarTriple::new(
StarTerm::QuotedTriple(Box::new(inner)),
StarTerm::iri(meta_pred).unwrap(),
StarTerm::iri(meta_obj).unwrap(),
)
}
#[test]
fn test_fcd_insert_and_lookup() {
let mut dict = FrontCodedDictionary::new();
let id1 = dict.insert("http://example.org/alice");
let id2 = dict.insert("http://example.org/bob");
let id3 = dict.insert("http://example.org/alice"); assert_ne!(id1, id2);
assert_eq!(id1, id3, "Duplicate insert should return same ID");
}
#[test]
fn test_fcd_decode() {
let mut dict = FrontCodedDictionary::new();
let id = dict.insert("http://example.org/test");
assert_eq!(dict.decode(id), Some("http://example.org/test"));
assert_eq!(dict.decode(0), None);
assert_eq!(dict.decode(9999), None);
}
#[test]
fn test_fcd_encode_sorts() {
let mut dict = FrontCodedDictionary::new();
dict.insert("http://z.org/z");
dict.insert("http://a.org/a");
dict.insert("http://m.org/m");
dict.encode();
assert_eq!(dict.entries[0], "http://a.org/a");
assert_eq!(dict.entries[1], "http://m.org/m");
assert_eq!(dict.entries[2], "http://z.org/z");
}
#[test]
fn test_fcd_compressed_size_smaller_than_raw() {
let mut dict = FrontCodedDictionary::new();
for i in 0..100 {
dict.insert(&format!("http://example.org/resource/{i}"));
}
dict.encode();
let compressed = dict.compressed_size_estimate();
let raw: usize = dict.entries.iter().map(|e| e.len() + 4).sum();
assert!(
compressed <= raw,
"Compressed ({compressed}) should be <= raw ({raw})"
);
}
#[test]
fn test_fcd_roundtrip() {
let mut dict = FrontCodedDictionary::new();
for i in 0..20 {
dict.insert(&format!("http://example.org/term{i}"));
}
let mut buf = Vec::new();
dict.write(&mut buf).unwrap();
let restored = FrontCodedDictionary::read(&mut buf.as_slice()).unwrap();
assert_eq!(restored.len(), dict.len());
for (i, s) in dict.entries.iter().enumerate() {
let id = restored.get_id(s);
assert!(id.is_some(), "Term {i} ({s}) not found after roundtrip");
}
}
#[test]
fn test_fcd_empty_dict() {
let dict = FrontCodedDictionary::new();
assert_eq!(dict.len(), 0);
assert!(dict.is_empty());
}
#[test]
fn test_bitmap_row_set_and_get() {
let mut row = BitmapRow::default();
row.set(0);
row.set(63);
row.set(64);
assert!(row.get(0));
assert!(row.get(63));
assert!(row.get(64));
assert!(!row.get(1));
assert!(!row.get(65));
}
#[test]
fn test_bitmap_row_from_positions() {
let row = BitmapRow::from_positions(&[0, 5, 10, 63, 128]);
assert!(row.get(0));
assert!(row.get(5));
assert!(row.get(10));
assert!(row.get(63));
assert!(row.get(128));
assert!(!row.get(1));
}
#[test]
fn test_bitmap_row_xor_self_zero() {
let row = BitmapRow::from_positions(&[1, 3, 5, 7]);
let delta = row.xor(&row);
assert_eq!(delta.popcount(), 0, "XOR with self should be zero");
}
#[test]
fn test_bitmap_row_xor_roundtrip() {
let a = BitmapRow::from_positions(&[1, 5, 10]);
let b = BitmapRow::from_positions(&[5, 10, 15]);
let delta = a.xor(&b);
let recovered = a.apply_xor(&delta);
assert_eq!(recovered, b, "XOR roundtrip should recover original");
}
#[test]
fn test_bitmap_row_popcount() {
let row = BitmapRow::from_positions(&[0, 1, 2, 3, 4]);
assert_eq!(row.popcount(), 5);
}
#[test]
fn test_bitmap_row_roundtrip() {
let row = BitmapRow::from_positions(&[0, 64, 128, 192]);
let mut buf = Vec::new();
row.write(&mut buf).unwrap();
let restored = BitmapRow::read(&mut buf.as_slice()).unwrap();
assert_eq!(row, restored);
}
#[test]
fn test_xor_delta_index_single_row() {
let mut idx = XorDeltaBitmapIndex::new();
let row = BitmapRow::from_positions(&[0, 1, 2]);
idx.append(row.clone());
assert_eq!(idx.row_count(), 1);
let retrieved = idx.get_row(0).unwrap();
assert_eq!(retrieved, row);
}
#[test]
fn test_xor_delta_index_multiple_rows() {
let mut idx = XorDeltaBitmapIndex::new();
let rows: Vec<BitmapRow> = (0..5usize)
.map(|i| BitmapRow::from_positions(&[i, i + 5, i + 10]))
.collect();
for r in &rows {
idx.append(r.clone());
}
assert_eq!(idx.row_count(), 5);
for (i, expected) in rows.iter().enumerate() {
let got = idx.get_row(i).unwrap();
assert_eq!(&got, expected, "Row {i} mismatch after XOR delta");
}
}
#[test]
fn test_xor_delta_index_out_of_range() {
let idx = XorDeltaBitmapIndex::new();
let result = idx.get_row(0);
assert!(result.is_err());
}
#[test]
fn test_xor_delta_index_roundtrip() {
let mut idx = XorDeltaBitmapIndex::new();
for i in 0..10usize {
idx.append(BitmapRow::from_positions(&[i, i + 100]));
}
let mut buf = Vec::new();
idx.write(&mut buf).unwrap();
let restored = XorDeltaBitmapIndex::read(&mut buf.as_slice()).unwrap();
assert_eq!(restored.row_count(), 10);
for i in 0..10 {
let expected = idx.get_row(i).unwrap();
let got = restored.get_row(i).unwrap();
assert_eq!(got, expected, "Roundtrip mismatch at row {i}");
}
}
#[test]
fn test_xor_delta_compression_ratio() {
let mut idx = XorDeltaBitmapIndex::new();
let base = BitmapRow::from_positions(&(0..100usize).collect::<Vec<_>>());
for _ in 0..10 {
idx.append(base.clone());
}
let ratio = idx.compression_ratio();
assert!(
ratio > 1.0,
"Compression ratio should be > 1.0 for identical rows, got {ratio}"
);
}
#[test]
fn test_builder_add_and_write_basic() {
let mut builder = HdtStarV2Builder::new();
for i in 0..10 {
builder
.add_triple(&make_triple(
&format!("http://ex.org/s{i}"),
"http://ex.org/p",
&format!("http://ex.org/o{i}"),
))
.unwrap();
}
let mut buf = Vec::new();
builder.write(&mut buf).unwrap();
assert!(!buf.is_empty());
}
#[test]
fn test_builder_statistics() {
let mut builder = HdtStarV2Builder::new();
for i in 0..5 {
builder
.add_triple(&make_triple(
&format!("http://ex.org/s{i}"),
"http://ex.org/p",
&format!("http://ex.org/o{i}"),
))
.unwrap();
}
let stats = builder.statistics();
assert_eq!(stats.triple_count, 5);
assert!(stats.subject_count >= 1);
assert!(stats.predicate_count >= 1);
assert!(stats.object_count >= 1);
}
#[test]
fn test_roundtrip_basic_triples() {
let mut builder = HdtStarV2Builder::new();
for i in 0..5 {
builder
.add_triple(&make_triple(
&format!("http://ex.org/s{i}"),
"http://ex.org/p",
&format!("http://ex.org/o{i}"),
))
.unwrap();
}
let mut buf = Vec::new();
builder.write(&mut buf).unwrap();
let reader = HdtStarV2Reader::read(&mut buf.as_slice()).unwrap();
let stats = reader.statistics();
assert_eq!(stats.triple_count, 5);
assert!(stats.subject_count >= 1);
}
#[test]
fn test_roundtrip_quoted_triples() {
let mut builder = HdtStarV2Builder::new();
builder
.add_triple(&make_quoted_triple(
"http://ex.org/a",
"http://ex.org/b",
"http://ex.org/c",
"http://ex.org/certainty",
"http://ex.org/high",
))
.unwrap();
let stats = builder.statistics();
assert_eq!(stats.quoted_count, 1);
let mut buf = Vec::new();
builder.write(&mut buf).unwrap();
let reader = HdtStarV2Reader::read(&mut buf.as_slice()).unwrap();
assert_eq!(reader.statistics().quoted_count, 1);
}
#[test]
fn test_magic_bytes_verified() {
let mut builder = HdtStarV2Builder::new();
builder
.add_triple(&make_triple("http://a", "http://b", "http://c"))
.unwrap();
let mut buf = Vec::new();
builder.write(&mut buf).unwrap();
buf[0] ^= 0xFF; let result = HdtStarV2Reader::read(&mut buf.as_slice());
assert!(result.is_err(), "Corrupted magic should fail");
}
#[test]
fn test_header_flags() {
let h = HdtV2Header::new();
assert!(h.has_front_coding());
assert!(h.has_xor_delta());
}
#[test]
fn test_quoted_triple_dictionary_insert_dedup() {
let mut qd = QuotedTripleDictionaryV2::new();
let enc = EncodedQuotedTriple {
subject_id: 1,
predicate_id: 2,
object_id: 3,
quoted_subject_id: None,
};
let id1 = qd.insert(enc.clone());
let id2 = qd.insert(enc.clone());
assert_eq!(id1, id2);
assert_eq!(qd.len(), 1);
}
#[test]
fn test_large_dataset_compression() {
let mut builder = HdtStarV2Builder::new();
for i in 0..1000 {
builder
.add_triple(&make_triple(
&format!("http://example.org/subject/{i}"),
"http://example.org/predicate/knows",
&format!("http://example.org/object/{}", i % 100),
))
.unwrap();
}
let mut buf = Vec::new();
builder.write(&mut buf).unwrap();
let reader = HdtStarV2Reader::read(&mut buf.as_slice()).unwrap();
let stats = reader.statistics();
assert_eq!(stats.triple_count, 1000);
assert_eq!(stats.predicate_count, 1);
assert_eq!(stats.object_count, 100);
}
#[test]
fn test_spo_index_row_count() {
let mut builder = HdtStarV2Builder::new();
for s in 0..5 {
for o in 0..3 {
builder
.add_triple(&make_triple(
&format!("http://ex.org/s{s}"),
"http://ex.org/p",
&format!("http://ex.org/o{o}"),
))
.unwrap();
}
}
let mut buf = Vec::new();
builder.write(&mut buf).unwrap();
let reader = HdtStarV2Reader::read(&mut buf.as_slice()).unwrap();
assert_eq!(
reader.spo_row_count(),
5,
"SPO index should have 5 rows (one per subject)"
);
}
}