pub mod roaring_util;
use roaring::RoaringBitmap;
pub const ROARING_THRESHOLD: usize = 8192;
pub fn varint_encode(ids: &[u32]) -> Result<Vec<u8>, &'static str> {
if !ids.windows(2).all(|w| w[0] < w[1]) {
return Err("varint_encode: ids must be strictly ascending (no duplicates)");
}
let mut out = Vec::with_capacity(ids.len() * 2);
let mut prev = 0u32;
for &id in ids {
let delta = id - prev;
write_varint(delta, &mut out);
prev = id;
}
Ok(out)
}
pub fn varint_decode(bytes: &[u8]) -> Result<Vec<u32>, &'static str> {
let mut ids = Vec::new();
let mut pos = 0usize;
let mut prev = 0u32;
while pos < bytes.len() {
let (delta, consumed) = read_varint(&bytes[pos..])?;
if !ids.is_empty() && delta == 0 {
return Err("delta-varint duplicate: zero delta produces non-ascending sequence");
}
prev = prev.checked_add(delta).ok_or("delta-varint overflow")?;
ids.push(prev);
pos += consumed;
}
Ok(ids)
}
#[inline]
fn write_varint(mut value: u32, out: &mut Vec<u8>) {
loop {
let byte = (value & 0x7F) as u8;
value >>= 7;
if value == 0 {
out.push(byte);
break;
} else {
out.push(byte | 0x80);
}
}
}
#[inline]
fn read_varint(bytes: &[u8]) -> Result<(u32, usize), &'static str> {
let mut value = 0u32;
let mut shift = 0u32;
for (i, &byte) in bytes.iter().enumerate() {
if shift >= 35 {
return Err("varint: too many continuation bytes");
}
let bits = (byte & 0x7F) as u32;
if shift == 28 && bits > 0x0F {
return Err("varint: u32 overflow");
}
value |= bits << shift;
shift += 7;
if byte & 0x80 == 0 {
return Ok((value, i + 1));
}
}
Err("varint: unexpected end of input")
}
#[derive(Debug, Clone)]
pub enum PostingList {
Small(Vec<u8>),
Large(RoaringBitmap),
}
impl PostingList {
pub fn to_vec(&self) -> Result<Vec<u32>, &'static str> {
match self {
PostingList::Small(bytes) => varint_decode(bytes),
PostingList::Large(bm) => Ok(bm.iter().collect()),
}
}
#[must_use = "O(n) for Small variant; use is_empty() or MmapSegment::gram_cardinality() for hot-path checks"]
pub fn len(&self) -> usize {
match self {
PostingList::Small(bytes) => {
varint_decode(bytes).map(|v| v.len()).unwrap_or(0)
}
PostingList::Large(bm) => bm.len() as usize,
}
}
pub fn is_empty(&self) -> bool {
match self {
PostingList::Small(bytes) => bytes.is_empty(),
PostingList::Large(bm) => bm.is_empty(),
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn varint_round_trip_empty() {
let ids: Vec<u32> = vec![];
assert_eq!(varint_decode(&varint_encode(&ids).unwrap()).unwrap(), ids);
}
#[test]
fn varint_round_trip_single() {
let ids = vec![42u32];
assert_eq!(varint_decode(&varint_encode(&ids).unwrap()).unwrap(), ids);
}
#[test]
fn varint_round_trip_sequential() {
let ids: Vec<u32> = (0u32..100).collect();
assert_eq!(varint_decode(&varint_encode(&ids).unwrap()).unwrap(), ids);
}
#[test]
fn varint_round_trip_large_deltas() {
let ids = vec![0u32, 1_000_000, 2_000_000, u32::MAX - 1, u32::MAX];
assert_eq!(varint_decode(&varint_encode(&ids).unwrap()).unwrap(), ids);
}
#[test]
fn varint_decode_rejects_zero_delta_duplicate() {
let bytes = [5u8, 0u8];
let result = varint_decode(&bytes);
assert!(
result.is_err(),
"zero delta (duplicate id) must be rejected: {result:?}"
);
}
#[test]
fn varint_decode_first_entry_zero_is_ok() {
let bytes = varint_encode(&[0u32, 1u32, 2u32]).unwrap();
let result = varint_decode(&bytes).unwrap();
assert_eq!(result, vec![0, 1, 2]);
}
#[test]
fn varint_encode_rejects_unsorted_input() {
let result = varint_encode(&[5, 3, 7]);
assert_eq!(
result,
Err("varint_encode: ids must be strictly ascending (no duplicates)")
);
}
#[test]
fn varint_encode_rejects_duplicate_ids() {
let result = varint_encode(&[0u32, 0u32]);
assert!(
result.is_err(),
"duplicate doc_ids must be rejected by encode: {result:?}"
);
let result2 = varint_encode(&[5u32, 5u32, 7u32]);
assert!(
result2.is_err(),
"mid-list duplicate must be rejected: {result2:?}"
);
}
#[test]
fn varint_round_trip_no_duplicates() {
let ids = vec![0u32, 1, 5, 100, u32::MAX - 1];
let encoded = varint_encode(&ids).unwrap();
let decoded = varint_decode(&encoded).unwrap();
assert_eq!(decoded, ids);
}
}