use crate::{Result, StorageError};
pub fn encode_varint(mut value: u64) -> Vec<u8> {
let mut bytes = Vec::new();
loop {
let mut byte = (value & 0x7F) as u8;
value >>= 7;
if value != 0 {
byte |= 0x80; }
bytes.push(byte);
if value == 0 {
break;
}
}
bytes
}
pub fn decode_varint(bytes: &[u8]) -> Result<(u64, usize)> {
let mut value = 0u64;
let mut shift = 0;
let mut pos = 0;
loop {
if pos >= bytes.len() {
return Err(StorageError::InvalidData("Incomplete varint".into()));
}
let byte = bytes[pos];
pos += 1;
value |= ((byte & 0x7F) as u64) << shift;
shift += 7;
if (byte & 0x80) == 0 {
break;
}
if shift >= 64 {
return Err(StorageError::InvalidData("Varint overflow".into()));
}
}
Ok((value, pos))
}
pub fn encode_posting_list(
doc_ids: &[u64],
positions: Option<&Vec<Vec<u32>>>,
) -> Result<Vec<u8>> {
let mut bytes = Vec::new();
bytes.extend_from_slice(&encode_varint(doc_ids.len() as u64));
let mut prev_doc_id = 0u64;
for &doc_id in doc_ids {
if doc_id < prev_doc_id {
return Err(StorageError::InvalidData(
"Document IDs must be sorted".into()
));
}
let delta = doc_id - prev_doc_id;
bytes.extend_from_slice(&encode_varint(delta));
prev_doc_id = doc_id;
}
if let Some(pos_lists) = positions {
bytes.push(1);
for pos_list in pos_lists {
bytes.extend_from_slice(&encode_varint(pos_list.len() as u64));
let mut prev_pos = 0u32;
for &pos in pos_list {
let delta = pos - prev_pos;
bytes.extend_from_slice(&encode_varint(delta as u64));
prev_pos = pos;
}
}
} else {
bytes.push(0); }
Ok(bytes)
}
#[allow(clippy::type_complexity)]
pub fn decode_posting_list(bytes: &[u8]) -> Result<(Vec<u64>, Option<Vec<Vec<u32>>>)> {
let mut pos = 0;
let (num_docs, consumed) = decode_varint(&bytes[pos..])?;
pos += consumed;
let mut doc_ids = Vec::with_capacity(num_docs as usize);
let mut prev_doc_id = 0u64;
for _ in 0..num_docs {
let (delta, consumed) = decode_varint(&bytes[pos..])?;
pos += consumed;
let doc_id = prev_doc_id + delta;
doc_ids.push(doc_id);
prev_doc_id = doc_id;
}
if pos >= bytes.len() {
return Err(StorageError::InvalidData("Missing positions flag".into()));
}
let has_positions = bytes[pos] != 0;
pos += 1;
let positions = if has_positions {
let mut pos_lists = Vec::with_capacity(num_docs as usize);
for _ in 0..num_docs {
let (num_positions, consumed) = decode_varint(&bytes[pos..])?;
pos += consumed;
let mut position_list = Vec::with_capacity(num_positions as usize);
let mut prev_pos = 0u32;
for _ in 0..num_positions {
let (delta, consumed) = decode_varint(&bytes[pos..])?;
pos += consumed;
let position = prev_pos + delta as u32;
position_list.push(position);
prev_pos = position;
}
pos_lists.push(position_list);
}
Some(pos_lists)
} else {
None
};
Ok((doc_ids, positions))
}
pub const SEGMENT_SIZE: usize = 256;
pub fn encode_segmented_posting_list(
doc_ids: &[u64],
positions: Option<&Vec<Vec<u32>>>,
) -> Result<Vec<(u32, Vec<u8>)>> {
let mut segments = Vec::new();
let mut segment_id = 0u32;
let num_docs = doc_ids.len();
let mut offset = 0;
while offset < num_docs {
let end = (offset + SEGMENT_SIZE).min(num_docs);
let segment_doc_ids = &doc_ids[offset..end];
let segment_positions = positions.as_ref().map(|pos_lists| {
pos_lists[offset..end].to_vec()
});
let encoded = encode_posting_list(
segment_doc_ids,
segment_positions.as_ref(),
)?;
segments.push((segment_id, encoded));
segment_id += 1;
offset = end;
}
Ok(segments)
}
pub fn max_bits(values: &[u32]) -> u8 {
match values.iter().max() {
Some(&max_val) => {
if max_val == 0 { return 1; }
32 - max_val.leading_zeros() as u8
}
None => 0,
}
}
pub fn max_bits_u16(values: &[u16]) -> u8 {
match values.iter().max() {
Some(&max_val) => {
if max_val == 0 { return 1; }
16 - max_val.leading_zeros() as u8
}
None => 0,
}
}
pub fn bitpack_into(buf: &mut Vec<u8>, values: &[u32], bits: u8) {
if bits == 0 || values.is_empty() {
return;
}
let total_bits = values.len() * bits as usize;
let total_bytes = total_bits.div_ceil(8);
let start_len = buf.len();
buf.resize(start_len + total_bytes, 0);
let mut bit_offset = 0u64; for &val in values {
let byte_idx = start_len + (bit_offset / 8) as usize;
let bit_shift = (bit_offset % 8) as u8;
let mask = ((1u32 << bits) - 1) & val;
buf[byte_idx] |= (mask as u8) << bit_shift;
let remaining_bits = 8 - bit_shift;
if bits > remaining_bits {
let overflow = mask >> remaining_bits;
let overflow_bits = bits - remaining_bits;
let add_bytes = overflow_bits.div_ceil(8) as usize;
for i in 0..add_bytes {
if byte_idx + 1 + i < buf.len() {
buf[byte_idx + 1 + i] |= (overflow >> (i * 8)) as u8;
}
}
}
bit_offset += bits as u64;
}
}
pub fn bitpack_u16_into(buf: &mut Vec<u8>, values: &[u16], bits: u8) {
let u32_vals: Vec<u32> = values.iter().map(|&v| v as u32).collect();
bitpack_into(buf, &u32_vals, bits);
}
pub fn bitunpack(data: &[u8], offset: usize, count: usize, bits: u8) -> Vec<u32> {
if bits == 0 || count == 0 {
return vec![0; count];
}
let mut result = Vec::with_capacity(count);
let mask = (1u32 << bits) - 1;
let mut bit_offset = 0u64;
for _ in 0..count {
let byte_idx = offset + (bit_offset / 8) as usize;
let bit_shift = (bit_offset % 8) as u8;
let mut raw_val = 0u64;
let bytes_needed = ((bit_shift + bits) as usize).div_ceil(8);
for i in 0..bytes_needed {
if byte_idx + i < data.len() {
raw_val |= (data[byte_idx + i] as u64) << (i * 8);
}
}
let val = ((raw_val >> bit_shift) as u32) & mask;
result.push(val);
bit_offset += bits as u64;
}
result
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_varint_encoding() {
let test_cases = vec![
0u64,
127,
128,
16383,
16384,
u64::MAX,
];
for value in test_cases {
let encoded = encode_varint(value);
let (decoded, consumed) = decode_varint(&encoded).unwrap();
assert_eq!(decoded, value);
assert_eq!(consumed, encoded.len());
}
}
#[test]
fn test_posting_list_no_positions() {
let doc_ids = vec![10, 25, 30, 100, 250];
let encoded = encode_posting_list(&doc_ids, None).unwrap();
let (decoded_ids, decoded_pos) = decode_posting_list(&encoded).unwrap();
assert_eq!(decoded_ids, doc_ids);
assert!(decoded_pos.is_none());
}
#[test]
fn test_posting_list_with_positions() {
let doc_ids = vec![10, 25, 30];
let positions = vec![
vec![0, 5, 10],
vec![2, 8],
vec![1, 3, 7, 15],
];
let encoded = encode_posting_list(&doc_ids, Some(&positions)).unwrap();
let (decoded_ids, decoded_pos) = decode_posting_list(&encoded).unwrap();
assert_eq!(decoded_ids, doc_ids);
assert_eq!(decoded_pos.unwrap(), positions);
}
#[test]
fn test_segmented_posting_list() {
let mut doc_ids = Vec::new();
for i in 0..1000 {
doc_ids.push(i);
}
let segments = encode_segmented_posting_list(&doc_ids, None).unwrap();
assert!(segments.len() > 1);
for (i, (seg_id, _)) in segments.iter().enumerate() {
assert_eq!(*seg_id, i as u32);
}
}
#[test]
fn test_delta_compression_efficiency() {
let doc_ids = vec![100, 101, 102, 103, 104];
let encoded = encode_posting_list(&doc_ids, None).unwrap();
assert!(encoded.len() < 15);
}
#[test]
fn test_max_bits() {
assert_eq!(max_bits(&[]), 0);
assert_eq!(max_bits(&[0]), 1);
assert_eq!(max_bits(&[1]), 1);
assert_eq!(max_bits(&[2]), 2);
assert_eq!(max_bits(&[3]), 2);
assert_eq!(max_bits(&[4]), 3);
assert_eq!(max_bits(&[7]), 3);
assert_eq!(max_bits(&[255]), 8);
assert_eq!(max_bits(&[256]), 9);
assert_eq!(max_bits_u16(&[0, 1, 3]), 2);
}
#[test]
fn test_bitpack_roundtrip_1bit() {
let values = vec![0, 1, 1, 0, 1, 0, 0, 1];
let mut buf = Vec::new();
bitpack_into(&mut buf, &values, 1);
let decoded = bitunpack(&buf, 0, values.len(), 1);
assert_eq!(decoded, values);
}
#[test]
fn test_bitpack_roundtrip_5bit() {
let values: Vec<u32> = (0..128).map(|i| i % 31).collect();
let bits = max_bits(&values);
assert_eq!(bits, 5);
let mut buf = Vec::new();
bitpack_into(&mut buf, &values, bits);
let decoded = bitunpack(&buf, 0, values.len(), bits);
assert_eq!(decoded, values);
}
#[test]
fn test_bitpack_roundtrip_large() {
let values: Vec<u32> = (0..128).map(|i| (i * 8 + 3) % 1024).collect();
let bits = max_bits(&values);
let mut buf = Vec::new();
bitpack_into(&mut buf, &values, bits);
let decoded = bitunpack(&buf, 0, values.len(), bits);
assert_eq!(decoded, values);
assert!(buf.len() < 128 * 4);
}
#[test]
fn test_bitpack_at_offset() {
let values = vec![10, 20, 30, 40];
let bits = max_bits(&values);
let mut buf = vec![0xAA, 0xBB]; bitpack_into(&mut buf, &values, bits);
let decoded = bitunpack(&buf, 2, values.len(), bits);
assert_eq!(decoded, values);
}
}