use nodedb_types::Surrogate;
use crate::codec::{bitpack, delta, smallfloat};
/// Maximum number of postings held by a single [`PostingBlock`].
pub const BLOCK_SIZE: usize = 128;
/// One term-occurrence record for a single document, before blocking.
#[derive(Debug, Clone)]
pub struct CompactPosting {
    /// Surrogate identifier of the document containing the term.
    pub doc_id: Surrogate,
    /// Number of times the term occurs in this document.
    pub term_freq: u32,
    /// Smallfloat-encoded document length (see [`encode_fieldnorm`]).
    pub fieldnorm: u8,
    /// Token positions of the term within the document; may be empty.
    pub positions: Vec<u32>,
}
/// Up to [`BLOCK_SIZE`] postings in columnar (struct-of-arrays) layout.
///
/// All four vectors have the same length; index `i` in each describes the
/// same document.
#[derive(Debug, Clone)]
pub struct PostingBlock {
    /// Document surrogates, in ascending order.
    pub doc_ids: Vec<Surrogate>,
    /// Per-document term frequency, parallel to `doc_ids`.
    pub term_freqs: Vec<u32>,
    /// Per-document encoded fieldnorm byte, parallel to `doc_ids`.
    pub fieldnorms: Vec<u8>,
    /// Per-document term positions (possibly empty), parallel to `doc_ids`.
    pub positions: Vec<Vec<u32>>,
}
/// Per-block summary statistics, as produced by [`PostingBlock::meta`].
///
/// Suitable for skip-list entries / block-max style pruning: a block can be
/// skipped based on `last_doc_id`, `block_max_tf` and `block_min_fieldnorm`
/// without decoding it.
#[derive(Debug, Clone, Copy)]
pub struct BlockMeta {
    /// Largest (last) doc id in the block; `Surrogate::ZERO` for an empty block.
    pub last_doc_id: Surrogate,
    /// Number of documents in the block (fits in u16: capped at BLOCK_SIZE).
    pub doc_count: u16,
    /// Maximum term frequency across the block (0 when empty).
    pub block_max_tf: u32,
    /// Minimum encoded fieldnorm across the block (0 when empty).
    pub block_min_fieldnorm: u8,
}
impl PostingBlock {
    /// Builds a block from the first [`BLOCK_SIZE`] entries of `postings`,
    /// transposing them into columnar layout. Any input beyond `BLOCK_SIZE`
    /// is ignored — callers split with [`into_blocks`].
    ///
    /// Debug builds assert the input is sorted by surrogate, which the delta
    /// encoding in [`Self::to_bytes`] relies on.
    pub fn from_postings(postings: &[CompactPosting]) -> Self {
        let n = postings.len().min(BLOCK_SIZE);
        let taken = &postings[..n];
        debug_assert!(
            taken.windows(2).all(|w| w[0].doc_id <= w[1].doc_id),
            "postings must be sorted by surrogate"
        );
        Self {
            doc_ids: taken.iter().map(|p| p.doc_id).collect(),
            term_freqs: taken.iter().map(|p| p.term_freq).collect(),
            fieldnorms: taken.iter().map(|p| p.fieldnorm).collect(),
            positions: taken.iter().map(|p| p.positions.clone()).collect(),
        }
    }

    /// Number of documents in the block.
    pub fn len(&self) -> usize {
        self.doc_ids.len()
    }

    /// `true` when the block holds no documents.
    pub fn is_empty(&self) -> bool {
        self.doc_ids.is_empty()
    }

    /// Summary statistics for skip lists / block-level pruning.
    ///
    /// Empty blocks yield `Surrogate::ZERO`, tf 0 and fieldnorm 0.
    pub fn meta(&self) -> BlockMeta {
        BlockMeta {
            last_doc_id: self.doc_ids.last().copied().unwrap_or(Surrogate::ZERO),
            // Cast is lossless: blocks are capped at BLOCK_SIZE (128) docs.
            doc_count: self.doc_ids.len() as u16,
            block_max_tf: self.term_freqs.iter().copied().max().unwrap_or(0),
            block_min_fieldnorm: self.fieldnorms.iter().copied().min().unwrap_or(0),
        }
    }

    /// Serializes the block. Inverse of [`Self::from_bytes`].
    ///
    /// Wire layout (all integers little-endian):
    /// ```text
    /// u16 doc_count
    /// u32 len, then len bytes of bitpacked delta-encoded doc ids
    /// u32 len, then len bytes of bitpacked term frequencies
    /// doc_count raw fieldnorm bytes
    /// per doc: u16 num_positions, u16 packed_len, packed_len bytes
    ///          (packed_len is written as 0, with no payload, when
    ///          num_positions == 0)
    /// ```
    pub fn to_bytes(&self) -> Vec<u8> {
        let raw_ids: Vec<u32> = self.doc_ids.iter().map(|s| s.0).collect();
        let packed_ids = bitpack::pack(&delta::encode(&raw_ids));
        let packed_freqs = bitpack::pack(&self.term_freqs);

        // Reserve the fixed-size portion up front; position payloads are
        // appended on top (their packed size is unknown until packed).
        let mut buf = Vec::with_capacity(
            2 + 4
                + packed_ids.len()
                + 4
                + packed_freqs.len()
                + self.fieldnorms.len()
                + 4 * self.positions.len(),
        );
        buf.extend_from_slice(&(self.doc_ids.len() as u16).to_le_bytes());
        buf.extend_from_slice(&(packed_ids.len() as u32).to_le_bytes());
        buf.extend_from_slice(&packed_ids);
        buf.extend_from_slice(&(packed_freqs.len() as u32).to_le_bytes());
        buf.extend_from_slice(&packed_freqs);
        buf.extend_from_slice(&self.fieldnorms);
        for positions in &self.positions {
            buf.extend_from_slice(&(positions.len() as u16).to_le_bytes());
            if positions.is_empty() {
                // Zero-length marker so the reader consumes a fixed 2 bytes.
                buf.extend_from_slice(&0u16.to_le_bytes());
            } else {
                let packed_pos = bitpack::pack(positions);
                buf.extend_from_slice(&(packed_pos.len() as u16).to_le_bytes());
                buf.extend_from_slice(&packed_pos);
            }
        }
        buf
    }

    /// Reads a little-endian `u16` at `*pos`, advancing `pos` on success.
    /// Returns `None` when fewer than 2 bytes remain.
    fn read_u16(buf: &[u8], pos: &mut usize) -> Option<u16> {
        let b = Self::read_slice(buf, pos, 2)?;
        Some(u16::from_le_bytes([b[0], b[1]]))
    }

    /// Reads a little-endian `u32` at `*pos`, advancing `pos` on success.
    fn read_u32(buf: &[u8], pos: &mut usize) -> Option<u32> {
        let b = Self::read_slice(buf, pos, 4)?;
        Some(u32::from_le_bytes([b[0], b[1], b[2], b[3]]))
    }

    /// Returns `len` bytes starting at `*pos`, advancing `pos` on success.
    /// `checked_add` guards against offset overflow on corrupt length fields.
    fn read_slice<'a>(buf: &'a [u8], pos: &mut usize, len: usize) -> Option<&'a [u8]> {
        let end = pos.checked_add(len)?;
        let b = buf.get(*pos..end)?;
        *pos = end;
        Some(b)
    }

    /// Deserializes a block previously produced by [`Self::to_bytes`].
    ///
    /// Returns `None` on truncated or internally inconsistent input (e.g.
    /// the decoded id/frequency columns disagree with the stored doc count).
    pub fn from_bytes(buf: &[u8]) -> Option<Self> {
        let mut pos = 0;
        let doc_count = Self::read_u16(buf, &mut pos)? as usize;
        if doc_count == 0 {
            return Some(Self {
                doc_ids: Vec::new(),
                term_freqs: Vec::new(),
                fieldnorms: Vec::new(),
                positions: Vec::new(),
            });
        }

        let ids_len = Self::read_u32(buf, &mut pos)? as usize;
        let packed_deltas = Self::read_slice(buf, &mut pos, ids_len)?;
        let raw_ids = delta::decode(&bitpack::unpack(packed_deltas));
        let doc_ids: Vec<Surrogate> = raw_ids.into_iter().map(Surrogate).collect();

        let freqs_len = Self::read_u32(buf, &mut pos)? as usize;
        let term_freqs = bitpack::unpack(Self::read_slice(buf, &mut pos, freqs_len)?);

        let fieldnorms = Self::read_slice(buf, &mut pos, doc_count)?.to_vec();

        let mut positions = Vec::with_capacity(doc_count);
        for _ in 0..doc_count {
            let num_positions = Self::read_u16(buf, &mut pos)? as usize;
            // Always present on the wire; its value is only meaningful when
            // num_positions > 0 (the writer emits 0 otherwise).
            let packed_pos_len = Self::read_u16(buf, &mut pos)? as usize;
            if num_positions == 0 {
                positions.push(Vec::new());
            } else {
                let packed = Self::read_slice(buf, &mut pos, packed_pos_len)?;
                positions.push(bitpack::unpack(packed));
            }
        }

        // The packed columns are self-describing; reject buffers whose decoded
        // lengths disagree with the stored doc count.
        if doc_ids.len() != doc_count || term_freqs.len() != doc_count {
            return None;
        }
        Some(Self {
            doc_ids,
            term_freqs,
            fieldnorms,
            positions,
        })
    }
}
/// Sorts `postings` by surrogate and packs them into blocks of at most
/// [`BLOCK_SIZE`] entries each. An empty input yields an empty vector.
pub fn into_blocks(mut postings: Vec<CompactPosting>) -> Vec<PostingBlock> {
    postings.sort_by_key(|p| p.doc_id);
    let block_count = (postings.len() + BLOCK_SIZE - 1) / BLOCK_SIZE;
    let mut blocks = Vec::with_capacity(block_count);
    for chunk in postings.chunks(BLOCK_SIZE) {
        blocks.push(PostingBlock::from_postings(chunk));
    }
    blocks
}
/// Compresses a document length into a single smallfloat byte.
///
/// Lossy for larger lengths; see the `fieldnorm_encode_decode` test for the
/// exactness/error expectations. Inverse (approximate) of [`decode_fieldnorm`].
pub fn encode_fieldnorm(doc_length: u32) -> u8 {
    smallfloat::encode(doc_length)
}
/// Expands a smallfloat fieldnorm byte back into an approximate document
/// length. Approximate inverse of [`encode_fieldnorm`].
pub fn decode_fieldnorm(byte: u8) -> u32 {
    smallfloat::decode(byte)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds `n` sorted postings with gapped doc ids 0, 3, 6, …,
    /// term freqs cycling 1..=5, and two positions per document.
    fn make_postings(n: usize) -> Vec<CompactPosting> {
        (0..n)
            .map(|i| CompactPosting {
                doc_id: Surrogate((i * 3) as u32),
                term_freq: (i % 5 + 1) as u32,
                fieldnorm: smallfloat::encode((i * 10 + 20) as u32),
                positions: vec![i as u32, (i + 5) as u32],
            })
            .collect()
    }

    // A partially-filled block survives serialize -> deserialize intact.
    #[test]
    fn block_roundtrip() {
        let postings = make_postings(50);
        let block = PostingBlock::from_postings(&postings);
        assert_eq!(block.len(), 50);
        let bytes = block.to_bytes();
        let restored = PostingBlock::from_bytes(&bytes).unwrap();
        assert_eq!(restored.doc_ids, block.doc_ids);
        assert_eq!(restored.term_freqs, block.term_freqs);
        assert_eq!(restored.fieldnorms, block.fieldnorms);
        assert_eq!(restored.positions, block.positions);
    }

    // Boundary case: a block filled exactly to BLOCK_SIZE.
    #[test]
    fn block_128_roundtrip() {
        let postings = make_postings(128);
        let block = PostingBlock::from_postings(&postings);
        assert_eq!(block.len(), 128);
        let bytes = block.to_bytes();
        let restored = PostingBlock::from_bytes(&bytes).unwrap();
        assert_eq!(restored.doc_ids, block.doc_ids);
    }

    // Meta reflects the block contents: last id = 9 * 3 = 27, max tf = 5.
    #[test]
    fn block_meta() {
        let postings = make_postings(10);
        let block = PostingBlock::from_postings(&postings);
        let meta = block.meta();
        assert_eq!(meta.doc_count, 10);
        assert_eq!(meta.last_doc_id, Surrogate(27)); assert_eq!(meta.block_max_tf, 5); }

    // 300 postings split as 128 + 128 + 44.
    #[test]
    fn into_blocks_splits() {
        let postings = make_postings(300);
        let blocks = into_blocks(postings);
        assert_eq!(blocks.len(), 3); assert_eq!(blocks[0].len(), 128);
        assert_eq!(blocks[1].len(), 128);
        assert_eq!(blocks[2].len(), 44);
    }

    // An empty block serializes (doc_count = 0) and deserializes to empty.
    #[test]
    fn empty_block() {
        let block = PostingBlock::from_postings(&[]);
        assert!(block.is_empty());
        let bytes = block.to_bytes();
        let restored = PostingBlock::from_bytes(&bytes).unwrap();
        assert!(restored.is_empty());
    }

    // Fieldnorm codec: exact for small lengths (<= 8), never over-estimates,
    // and stays within 25% relative error elsewhere.
    #[test]
    fn fieldnorm_encode_decode() {
        for len in [0, 1, 5, 8, 10, 20, 50, 100, 500, 1000, 10_000] {
            let encoded = encode_fieldnorm(len);
            let decoded = decode_fieldnorm(encoded);
            if len <= 8 {
                assert_eq!(decoded, len, "exact for small: len={len}");
            } else {
                assert!(decoded <= len, "decoded {decoded} > original {len}");
                let error = (len - decoded) as f64 / len as f64;
                assert!(
                    error < 0.25,
                    "len={len}, decoded={decoded}, error={error:.4}"
                );
            }
        }
    }

    // Highly regular input (dense ids, tf = 1, one position) must compress
    // well under the naive ~17 bytes/posting; 1500 bytes is the budget.
    #[test]
    fn compression_ratio() {
        let postings: Vec<CompactPosting> = (0..128)
            .map(|i| CompactPosting {
                doc_id: Surrogate(i),
                term_freq: 1,
                fieldnorm: smallfloat::encode(100),
                positions: vec![0],
            })
            .collect();
        let block = PostingBlock::from_postings(&postings);
        let bytes = block.to_bytes();
        assert!(
            bytes.len() < 1500,
            "compressed block should be < 1500 bytes, got {}",
            bytes.len()
        );
    }
}