use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use std::io::{self, Read, Write};
use crate::DocId;
pub const BLOCK_SIZE: usize = 128;
#[inline]
pub fn write_vint<W: Write>(writer: &mut W, mut value: u64) -> io::Result<()> {
loop {
let byte = (value & 0x7F) as u8;
value >>= 7;
if value == 0 {
writer.write_u8(byte)?;
return Ok(());
} else {
writer.write_u8(byte | 0x80)?;
}
}
}
#[inline]
pub fn read_vint<R: Read>(reader: &mut R) -> io::Result<u64> {
let mut result = 0u64;
let mut shift = 0;
loop {
let byte = reader.read_u8()?;
result |= ((byte & 0x7F) as u64) << shift;
if byte & 0x80 == 0 {
return Ok(result);
}
shift += 7;
if shift >= 64 {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
"varint too long",
));
}
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct SkipEntry {
pub first_doc: DocId,
pub last_doc: DocId,
pub offset: u32,
}
impl SkipEntry {
pub fn new(first_doc: DocId, last_doc: DocId, offset: u32) -> Self {
Self {
first_doc,
last_doc,
offset,
}
}
pub fn write<W: Write>(&self, writer: &mut W) -> io::Result<()> {
writer.write_u32::<LittleEndian>(self.first_doc)?;
writer.write_u32::<LittleEndian>(self.last_doc)?;
writer.write_u32::<LittleEndian>(self.offset)?;
Ok(())
}
pub fn read<R: Read>(reader: &mut R) -> io::Result<Self> {
let first_doc = reader.read_u32::<LittleEndian>()?;
let last_doc = reader.read_u32::<LittleEndian>()?;
let offset = reader.read_u32::<LittleEndian>()?;
Ok(Self {
first_doc,
last_doc,
offset,
})
}
}
#[derive(Debug, Clone, Default)]
pub struct SkipList {
entries: Vec<SkipEntry>,
}
impl SkipList {
pub fn new() -> Self {
Self::default()
}
pub fn with_capacity(capacity: usize) -> Self {
Self {
entries: Vec::with_capacity(capacity),
}
}
pub fn push(&mut self, first_doc: DocId, last_doc: DocId, offset: u32) {
self.entries
.push(SkipEntry::new(first_doc, last_doc, offset));
}
pub fn len(&self) -> usize {
self.entries.len()
}
pub fn is_empty(&self) -> bool {
self.entries.is_empty()
}
pub fn get(&self, index: usize) -> Option<&SkipEntry> {
self.entries.get(index)
}
pub fn find_block(&self, target: DocId) -> Option<usize> {
self.entries.iter().position(|e| e.last_doc >= target)
}
pub fn iter(&self) -> impl Iterator<Item = &SkipEntry> {
self.entries.iter()
}
pub fn write<W: Write>(&self, writer: &mut W) -> io::Result<()> {
writer.write_u32::<LittleEndian>(self.entries.len() as u32)?;
for entry in &self.entries {
entry.write(writer)?;
}
Ok(())
}
pub fn read<R: Read>(reader: &mut R) -> io::Result<Self> {
let count = reader.read_u32::<LittleEndian>()? as usize;
let mut entries = Vec::with_capacity(count);
for _ in 0..count {
entries.push(SkipEntry::read(reader)?);
}
Ok(Self { entries })
}
pub fn from_tuples(tuples: &[(DocId, DocId, u32)]) -> Self {
Self {
entries: tuples
.iter()
.map(|(first, last, offset)| SkipEntry::new(*first, *last, *offset))
.collect(),
}
}
pub fn to_tuples(&self) -> Vec<(DocId, DocId, u32)> {
self.entries
.iter()
.map(|e| (e.first_doc, e.last_doc, e.offset))
.collect()
}
}
pub fn write_doc_id_block<W: Write>(writer: &mut W, doc_ids: &[DocId]) -> io::Result<DocId> {
if doc_ids.is_empty() {
return Ok(0);
}
write_vint(writer, doc_ids.len() as u64)?;
let mut prev = 0u32;
for (i, &doc_id) in doc_ids.iter().enumerate() {
if i == 0 {
write_vint(writer, doc_id as u64)?;
} else {
write_vint(writer, (doc_id - prev) as u64)?;
}
prev = doc_id;
}
Ok(*doc_ids.last().unwrap())
}
pub fn read_doc_id_block<R: Read>(reader: &mut R) -> io::Result<Vec<DocId>> {
let count = read_vint(reader)? as usize;
let mut doc_ids = Vec::with_capacity(count);
let mut prev = 0u32;
for i in 0..count {
let value = read_vint(reader)? as u32;
let doc_id = if i == 0 {
value } else {
prev + value };
doc_ids.push(doc_id);
prev = doc_id;
}
Ok(doc_ids)
}
use crate::structures::simd;
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u8)]
pub enum RoundedBitWidth {
Zero = 0,
Bits8 = 8,
Bits16 = 16,
Bits32 = 32,
}
impl RoundedBitWidth {
pub fn from_max_value(max_val: u32) -> Self {
if max_val == 0 {
RoundedBitWidth::Zero
} else if max_val <= 255 {
RoundedBitWidth::Bits8
} else if max_val <= 65535 {
RoundedBitWidth::Bits16
} else {
RoundedBitWidth::Bits32
}
}
pub fn bytes_per_value(&self) -> usize {
match self {
RoundedBitWidth::Zero => 0,
RoundedBitWidth::Bits8 => 1,
RoundedBitWidth::Bits16 => 2,
RoundedBitWidth::Bits32 => 4,
}
}
pub fn from_u8(v: u8) -> Option<Self> {
match v {
0 => Some(RoundedBitWidth::Zero),
8 => Some(RoundedBitWidth::Bits8),
16 => Some(RoundedBitWidth::Bits16),
32 => Some(RoundedBitWidth::Bits32),
_ => None,
}
}
}
pub fn pack_deltas_fixed(doc_ids: &[DocId]) -> (RoundedBitWidth, Vec<u8>) {
if doc_ids.len() <= 1 {
return (RoundedBitWidth::Zero, Vec::new());
}
let mut max_delta = 0u32;
let mut deltas = Vec::with_capacity(doc_ids.len() - 1);
for i in 1..doc_ids.len() {
let delta = doc_ids[i] - doc_ids[i - 1] - 1; deltas.push(delta);
max_delta = max_delta.max(delta);
}
let bit_width = RoundedBitWidth::from_max_value(max_delta);
let bytes_per_val = bit_width.bytes_per_value();
if bytes_per_val == 0 {
return (bit_width, Vec::new());
}
let mut packed = Vec::with_capacity(deltas.len() * bytes_per_val);
match bit_width {
RoundedBitWidth::Zero => {}
RoundedBitWidth::Bits8 => {
for delta in deltas {
packed.push(delta as u8);
}
}
RoundedBitWidth::Bits16 => {
for delta in deltas {
packed.extend_from_slice(&(delta as u16).to_le_bytes());
}
}
RoundedBitWidth::Bits32 => {
for delta in deltas {
packed.extend_from_slice(&delta.to_le_bytes());
}
}
}
(bit_width, packed)
}
pub fn unpack_deltas_fixed(
packed: &[u8],
bit_width: RoundedBitWidth,
first_doc_id: DocId,
count: usize,
output: &mut [DocId],
) {
if count == 0 {
return;
}
output[0] = first_doc_id;
if count == 1 {
return;
}
match bit_width {
RoundedBitWidth::Zero => {
for (i, out) in output.iter_mut().enumerate().skip(1).take(count - 1) {
*out = first_doc_id + i as u32;
}
}
RoundedBitWidth::Bits8 => {
simd::unpack_8bit_delta_decode(packed, output, first_doc_id, count);
}
RoundedBitWidth::Bits16 => {
simd::unpack_16bit_delta_decode(packed, output, first_doc_id, count);
}
RoundedBitWidth::Bits32 => {
let mut carry = first_doc_id;
for i in 0..count - 1 {
let idx = i * 4;
let delta = u32::from_le_bytes([
packed[idx],
packed[idx + 1],
packed[idx + 2],
packed[idx + 3],
]);
carry = carry.wrapping_add(delta).wrapping_add(1);
output[i + 1] = carry;
}
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_vint_roundtrip() {
let values = [
0u64,
1,
127,
128,
255,
256,
16383,
16384,
u32::MAX as u64,
u64::MAX,
];
for &value in &values {
let mut buf = Vec::new();
write_vint(&mut buf, value).unwrap();
let read_value = read_vint(&mut buf.as_slice()).unwrap();
assert_eq!(value, read_value, "Failed for value {}", value);
}
}
#[test]
fn test_skip_list_roundtrip() {
let mut skip_list = SkipList::new();
skip_list.push(0, 127, 0);
skip_list.push(128, 255, 100);
skip_list.push(256, 500, 200);
let mut buf = Vec::new();
skip_list.write(&mut buf).unwrap();
let restored = SkipList::read(&mut buf.as_slice()).unwrap();
assert_eq!(skip_list.len(), restored.len());
for (a, b) in skip_list.iter().zip(restored.iter()) {
assert_eq!(a, b);
}
}
#[test]
fn test_skip_list_find_block() {
let mut skip_list = SkipList::new();
skip_list.push(0, 99, 0);
skip_list.push(100, 199, 100);
skip_list.push(200, 299, 200);
assert_eq!(skip_list.find_block(0), Some(0));
assert_eq!(skip_list.find_block(50), Some(0));
assert_eq!(skip_list.find_block(99), Some(0));
assert_eq!(skip_list.find_block(100), Some(1));
assert_eq!(skip_list.find_block(150), Some(1));
assert_eq!(skip_list.find_block(250), Some(2));
assert_eq!(skip_list.find_block(300), None);
}
#[test]
fn test_doc_id_block_roundtrip() {
let doc_ids: Vec<DocId> = vec![0, 5, 10, 100, 1000, 10000];
let mut buf = Vec::new();
let last = write_doc_id_block(&mut buf, &doc_ids).unwrap();
assert_eq!(last, 10000);
let restored = read_doc_id_block(&mut buf.as_slice()).unwrap();
assert_eq!(doc_ids, restored);
}
#[test]
fn test_doc_id_block_single() {
let doc_ids: Vec<DocId> = vec![42];
let mut buf = Vec::new();
write_doc_id_block(&mut buf, &doc_ids).unwrap();
let restored = read_doc_id_block(&mut buf.as_slice()).unwrap();
assert_eq!(doc_ids, restored);
}
}