use crate::core::DocId;
/// Appends `value` to `out` as a variable-length integer.
///
/// The value is emitted least-significant group first, seven bits per byte.
/// Every byte except the last has its high bit (`0x80`) set to signal that
/// more bytes follow, so a `u32` occupies between one and five bytes.
fn encode_vbyte(mut value: u32, out: &mut Vec<u8>) {
    // While more than seven significant bits remain, emit a continuation byte.
    while value >= 0x80 {
        out.push((value as u8) | 0x80);
        value >>= 7;
    }
    // The final group fits in seven bits, so the high bit stays clear.
    out.push(value as u8);
}
/// Decodes one variable-length integer from `data`, starting at byte `pos`.
///
/// Returns `(value, consumed)` where `consumed` is the number of bytes read.
/// Bytes are interpreted as seven-bit groups, least-significant first; a byte
/// with its high bit clear terminates the value.
///
/// # Panics
///
/// Panics if the input ends before a terminating byte is found, because the
/// bytes are read with plain indexing.
fn decode_vbyte(data: &[u8], pos: usize) -> (u32, usize) {
    let mut value = 0u32;
    let mut shift = 0u32;
    let mut cursor = pos;
    loop {
        let byte = data[cursor];
        cursor += 1;
        value |= u32::from(byte & 0x7F) << shift;
        if byte & 0x80 == 0 {
            return (value, cursor - pos);
        }
        shift += 7;
    }
}
/// Incrementally builds a posting list of `(doc_id, tf)` pairs.
///
/// Document IDs are stored as gaps from the previous ID and both the gap and
/// the term frequency are vbyte-encoded.
pub struct PostingListWriter {
    /// Encoded `(gap, tf)` pairs, in insertion order.
    buf: Vec<u8>,
    /// Number of postings added so far.
    count: u32,
    /// Document ID of the most recently added posting.
    last_doc_id: u32,
}

impl PostingListWriter {
    /// Creates an empty writer.
    pub fn new() -> Self {
        PostingListWriter {
            buf: Vec::new(),
            count: 0,
            last_doc_id: 0,
        }
    }

    /// Appends a posting for `doc_id` with term frequency `tf`.
    ///
    /// Debug builds assert that every ID after the first is strictly greater
    /// than the previous one.
    pub fn add(&mut self, doc_id: DocId, tf: u32) {
        let id = doc_id.as_u32();
        let gap = match self.count {
            // The first posting stores its ID verbatim.
            0 => id,
            _ => {
                debug_assert!(id > self.last_doc_id, "doc IDs must be strictly increasing");
                id - self.last_doc_id
            }
        };
        let out = &mut self.buf;
        encode_vbyte(gap, out);
        encode_vbyte(tf, out);
        self.count += 1;
        self.last_doc_id = id;
    }

    /// Consumes the writer and returns the serialized list.
    ///
    /// Layout: 4-byte little-endian posting count, one flag byte (`0x00`),
    /// then the encoded postings.
    pub fn finish(self) -> Vec<u8> {
        let PostingListWriter { buf, count, .. } = self;
        let mut out = Vec::with_capacity(5 + buf.len());
        out.extend_from_slice(&count.to_le_bytes());
        out.push(0x00);
        out.extend_from_slice(&buf);
        out
    }
}

impl Default for PostingListWriter {
    /// Equivalent to [`PostingListWriter::new`].
    fn default() -> Self {
        Self::new()
    }
}
pub struct PostingListReader<'a> {
data: &'a [u8],
pos: usize,
remaining: u32,
current_doc_id: u32,
has_positions: bool,
}
impl<'a> PostingListReader<'a> {
pub fn new(data: &'a [u8]) -> Self {
let count = if data.len() >= 5 {
u32::from_le_bytes([data[0], data[1], data[2], data[3]])
} else {
0
};
let has_pos = has_positions(data);
let pos = if data.len() >= 5 && data[4] == FLAG_BLOCK_MAX {
let num_blocks = u16::from_le_bytes(data[5..7].try_into().unwrap()) as usize;
7 + num_blocks * 8 } else {
5 };
Self {
data,
pos,
remaining: count,
current_doc_id: 0,
has_positions: has_pos,
}
}
pub fn len(&self) -> u32 {
if self.data.len() >= 5 {
u32::from_le_bytes([self.data[0], self.data[1], self.data[2], self.data[3]])
} else {
0
}
}
pub fn is_empty(&self) -> bool {
self.len() == 0
}
pub fn next(&mut self) -> Option<(DocId, u32)> {
if self.remaining == 0 {
return None;
}
let (delta, consumed) = decode_vbyte(self.data, self.pos);
self.pos += consumed;
let (tf, consumed) = decode_vbyte(self.data, self.pos);
self.pos += consumed;
if self.has_positions {
for _ in 0..tf {
let (_, consumed) = decode_vbyte(self.data, self.pos);
self.pos += consumed;
}
}
self.current_doc_id += delta;
self.remaining -= 1;
Some((DocId(self.current_doc_id), tf))
}
}
/// Incrementally builds a posting list that stores the term positions of
/// every document.
///
/// Each posting is written as vbytes: the doc-ID gap, the number of
/// positions, then each position as a gap from the previous one.
pub struct PositionPostingListWriter {
    /// Encoded postings, in insertion order.
    buf: Vec<u8>,
    /// Number of postings added so far.
    count: u32,
    /// Document ID of the most recently added posting.
    last_doc_id: u32,
}

/// Flag byte value written by [`PositionPostingListWriter::finish`].
const FLAG_HAS_POSITIONS: u8 = 0x01;
/// Flag byte value marking a block-max posting list.
const FLAG_BLOCK_MAX: u8 = 0x02;
/// Number of postings grouped into one block by the block-max format.
const BLOCK_SIZE: usize = 128;

impl PositionPostingListWriter {
    /// Creates an empty writer.
    pub fn new() -> Self {
        PositionPostingListWriter {
            buf: Vec::new(),
            count: 0,
            last_doc_id: 0,
        }
    }

    /// Appends a posting for `doc_id` with the given term positions.
    ///
    /// The term frequency written is `positions.len()`. Positions are stored
    /// as gaps, so they are expected to be in non-decreasing order. Debug
    /// builds assert that every ID after the first is strictly greater than
    /// the previous one.
    pub fn add(&mut self, doc_id: DocId, positions: &[u32]) {
        let id = doc_id.as_u32();
        let gap = match self.count {
            // The first posting stores its ID verbatim.
            0 => id,
            _ => {
                debug_assert!(id > self.last_doc_id);
                id - self.last_doc_id
            }
        };
        let out = &mut self.buf;
        encode_vbyte(gap, out);
        encode_vbyte(positions.len() as u32, out);
        let mut previous = 0u32;
        for &position in positions {
            encode_vbyte(position - previous, out);
            previous = position;
        }
        self.count += 1;
        self.last_doc_id = id;
    }

    /// Consumes the writer and returns the serialized list.
    ///
    /// Layout: 4-byte little-endian posting count, the
    /// `FLAG_HAS_POSITIONS` byte, then the encoded postings.
    pub fn finish(self) -> Vec<u8> {
        let PositionPostingListWriter { buf, count, .. } = self;
        let mut out = Vec::with_capacity(5 + buf.len());
        out.extend_from_slice(&count.to_le_bytes());
        out.push(FLAG_HAS_POSITIONS);
        out.extend_from_slice(&buf);
        out
    }
}

impl Default for PositionPostingListWriter {
    /// Equivalent to [`PositionPostingListWriter::new`].
    fn default() -> Self {
        Self::new()
    }
}
/// Forward-only cursor over a position-bearing posting list.
///
/// After each successful [`next`](Self::next), [`next_doc`](Self::next_doc)
/// or [`advance`](Self::advance), the positions of the current document are
/// available through [`positions`](Self::positions),
/// [`first_position`](Self::first_position) and
/// [`current_tf`](Self::current_tf).
pub struct PositionPostingListReader<'a> {
    /// The complete serialized list, header included.
    data: &'a [u8],
    /// Byte offset of the next undecoded posting.
    pos: usize,
    /// Postings not yet consumed.
    remaining: u32,
    /// Document ID of the most recently decoded posting.
    current_doc_id: u32,
    /// Absolute positions of the current document (reused across documents).
    position_buf: Vec<u32>,
    /// First position of the current document, or 0 if it has none.
    cached_first_pos: u32,
    /// Term frequency of the current document.
    cached_tf: u32,
}

impl<'a> PositionPostingListReader<'a> {
    /// Creates a reader positioned before the first posting.
    ///
    /// Input shorter than the 5-byte header is treated as an empty list.
    pub fn new(data: &'a [u8]) -> Self {
        let count = if data.len() >= 5 {
            u32::from_le_bytes([data[0], data[1], data[2], data[3]])
        } else {
            0
        };
        Self {
            data,
            pos: 5,
            remaining: count,
            current_doc_id: 0,
            position_buf: Vec::new(),
            cached_first_pos: 0,
            cached_tf: 0,
        }
    }

    /// Total number of postings recorded in the header (not the number left).
    ///
    /// Uses the same 5-byte header requirement as [`new`](Self::new), so a
    /// buffer the reader treats as empty also reports a length of zero.
    pub fn len(&self) -> u32 {
        if self.data.len() >= 5 {
            u32::from_le_bytes([self.data[0], self.data[1], self.data[2], self.data[3]])
        } else {
            0
        }
    }

    /// Returns `true` when the header records zero postings.
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Absolute positions of the current document.
    pub fn positions(&self) -> &[u32] {
        &self.position_buf
    }

    /// First position of the current document (0 when it has no positions).
    #[inline(always)]
    pub fn first_position(&self) -> u32 {
        self.cached_first_pos
    }

    /// Term frequency of the current document.
    #[inline(always)]
    pub fn current_tf(&self) -> u32 {
        self.cached_tf
    }

    /// Decodes one vbyte at the cursor and moves the cursor past it.
    fn read_vbyte(&mut self) -> u32 {
        let (value, consumed) = decode_vbyte(self.data, self.pos);
        self.pos += consumed;
        value
    }

    /// Decodes the `tf` position gaps of the current posting into
    /// `position_buf` as absolute positions and refreshes the cached values.
    ///
    /// Exactly `tf` vbytes are consumed. For `tf == 0` nothing is read, so
    /// the cursor stays aligned with the next posting.
    fn decode_positions(&mut self, tf: u32) {
        self.cached_tf = tf;
        self.position_buf.clear();
        let mut last_pos = 0u32;
        for _ in 0..tf {
            last_pos += self.read_vbyte();
            self.position_buf.push(last_pos);
        }
        self.cached_first_pos = self.position_buf.first().copied().unwrap_or(0);
    }

    /// Steps over the `tf` position gaps of a posting without storing them.
    fn skip_positions(&mut self, tf: u32) {
        for _ in 0..tf {
            self.read_vbyte();
        }
    }

    /// Returns the next document and a copy of its positions, or `None` once
    /// the list is exhausted.
    pub fn next(&mut self) -> Option<(DocId, Vec<u32>)> {
        let doc = self.next_doc()?;
        Some((doc, self.position_buf.clone()))
    }

    /// Moves forward to the first remaining document whose ID is at least
    /// `target` and decodes its positions.
    ///
    /// Returns `None` (leaving the reader exhausted) if no such document
    /// remains.
    pub fn advance(&mut self, target: DocId) -> Option<DocId> {
        let target_val = target.as_u32();
        while self.remaining > 0 {
            let delta = self.read_vbyte();
            let tf = self.read_vbyte();
            self.current_doc_id += delta;
            self.remaining -= 1;
            if self.current_doc_id >= target_val {
                self.decode_positions(tf);
                return Some(DocId(self.current_doc_id));
            }
            self.skip_positions(tf);
        }
        None
    }

    /// Moves to the next document without copying its positions.
    ///
    /// The positions are decoded into the internal buffer for every term
    /// frequency, so `positions()`, `first_position()` and `current_tf()`
    /// always describe the returned document.
    #[inline(always)]
    pub fn next_doc(&mut self) -> Option<DocId> {
        if self.remaining == 0 {
            return None;
        }
        let delta = self.read_vbyte();
        let tf = self.read_vbyte();
        self.current_doc_id += delta;
        self.decode_positions(tf);
        self.remaining -= 1;
        Some(DocId(self.current_doc_id))
    }

    /// Document ID of the most recently decoded posting (0 before the first).
    pub fn current_doc_id(&self) -> u32 {
        self.current_doc_id
    }
}
/// Buffers `(doc_id, tf)` pairs and serializes them as a block-max posting
/// list: postings are grouped into blocks of `BLOCK_SIZE`, and a header per
/// block records its last doc ID, its maximum term frequency and its encoded
/// length.
pub struct BlockMaxPostingListWriter {
    /// Buffered `(doc_id, tf)` pairs, in insertion order.
    entries: Vec<(u32, u32)>,
}

impl BlockMaxPostingListWriter {
    /// Creates an empty writer.
    pub fn new() -> Self {
        BlockMaxPostingListWriter {
            entries: Vec::new(),
        }
    }

    /// Buffers a posting; nothing is encoded until [`finish`](Self::finish).
    pub fn add(&mut self, doc_id: DocId, tf: u32) {
        let id = doc_id.as_u32();
        self.entries.push((id, tf));
    }

    /// Consumes the writer and returns the serialized list.
    ///
    /// Layout:
    /// 1. 4-byte little-endian posting count,
    /// 2. the `FLAG_BLOCK_MAX` byte,
    /// 3. 2-byte little-endian block count,
    /// 4. one 8-byte header per block: last doc ID (`u32`), maximum tf
    ///    clamped to `u16::MAX` (`u16`), encoded length in bytes (`u16`),
    /// 5. the encoded blocks, back to back.
    ///
    /// Doc-ID gaps continue across block boundaries: the first gap of a block
    /// is relative to the last doc ID of the previous block (0 for the first).
    ///
    /// The block count is narrowed to `u16` with `as`, so only that many
    /// blocks are emitted.
    pub fn finish(self) -> Vec<u8> {
        let total = self.entries.len();
        let num_docs = total as u32;
        // Ceiling division; yields 0 for an empty writer.
        let num_blocks = ((total + BLOCK_SIZE - 1) / BLOCK_SIZE) as u16;
        let block_count = usize::from(num_blocks);

        let mut headers: Vec<u8> = Vec::with_capacity(block_count * 8);
        let mut payload: Vec<u8> = Vec::new();
        let mut prev_doc = 0u32;

        for block in self.entries.chunks(BLOCK_SIZE).take(block_count) {
            let payload_start = payload.len();
            let mut max_tf = 0u32;
            for &(doc_id, tf) in block {
                encode_vbyte(doc_id - prev_doc, &mut payload);
                encode_vbyte(tf, &mut payload);
                prev_doc = doc_id;
                max_tf = max_tf.max(tf);
            }
            // The header field is only 16 bits wide, so saturate.
            let max_tf_u16 = if max_tf > u32::from(u16::MAX) {
                u16::MAX
            } else {
                max_tf as u16
            };
            let data_len = (payload.len() - payload_start) as u16;
            // After the inner loop `prev_doc` is the block's last doc ID.
            headers.extend_from_slice(&prev_doc.to_le_bytes());
            headers.extend_from_slice(&max_tf_u16.to_le_bytes());
            headers.extend_from_slice(&data_len.to_le_bytes());
        }

        let mut out = Vec::with_capacity(4 + 1 + 2 + headers.len() + payload.len());
        out.extend_from_slice(&num_docs.to_le_bytes());
        out.push(FLAG_BLOCK_MAX);
        out.extend_from_slice(&num_blocks.to_le_bytes());
        out.extend_from_slice(&headers);
        out.extend_from_slice(&payload);
        out
    }
}

impl Default for BlockMaxPostingListWriter {
    /// Equivalent to [`BlockMaxPostingListWriter::new`].
    fn default() -> Self {
        Self::new()
    }
}
pub struct BlockMaxPostingListReader<'a> {
data: &'a [u8],
num_docs: u32,
num_blocks: u16,
headers_start: usize,
block_data_offsets: Vec<usize>,
current_block: u16,
pos_in_data: usize,
remaining_in_block: u16,
current_doc_id: u32,
total_remaining: u32,
}
impl<'a> BlockMaxPostingListReader<'a> {
pub fn new(data: &'a [u8]) -> Self {
let num_docs = u32::from_le_bytes(data[0..4].try_into().unwrap());
debug_assert_eq!(data[4], FLAG_BLOCK_MAX);
let num_blocks = u16::from_le_bytes(data[5..7].try_into().unwrap());
let headers_start = 7;
let block_data_start = headers_start + num_blocks as usize * 8;
let mut block_data_offsets = Vec::with_capacity(num_blocks as usize + 1);
let mut offset = block_data_start;
for i in 0..num_blocks as usize {
block_data_offsets.push(offset);
let hdr_pos = headers_start + i * 8;
let data_len =
u16::from_le_bytes(data[hdr_pos + 6..hdr_pos + 8].try_into().unwrap()) as usize;
offset += data_len;
}
block_data_offsets.push(offset);
let first_block_docs = if num_blocks > 0 {
let total = num_docs as usize;
let _full_blocks = if num_blocks > 1 {
(num_blocks as usize - 1) * BLOCK_SIZE
} else {
0
};
if num_blocks == 1 {
total as u16
} else {
BLOCK_SIZE as u16
}
} else {
0
};
Self {
data,
num_docs,
num_blocks,
headers_start,
block_data_offsets,
current_block: 0,
pos_in_data: block_data_start,
remaining_in_block: first_block_docs,
current_doc_id: 0,
total_remaining: num_docs,
}
}
pub fn len(&self) -> u32 {
self.num_docs
}
pub fn is_empty(&self) -> bool {
self.num_docs == 0
}
pub fn num_blocks(&self) -> u16 {
self.num_blocks
}
pub fn block_max_tf(&self, block: u16) -> u16 {
let hdr_pos = self.headers_start + block as usize * 8;
u16::from_le_bytes(self.data[hdr_pos + 4..hdr_pos + 6].try_into().unwrap())
}
pub fn block_last_doc(&self, block: u16) -> u32 {
let hdr_pos = self.headers_start + block as usize * 8;
u32::from_le_bytes(self.data[hdr_pos..hdr_pos + 4].try_into().unwrap())
}
fn block_doc_count(&self, block: u16) -> u16 {
if (block as usize) < self.num_blocks as usize - 1 {
BLOCK_SIZE as u16
} else {
let full_blocks = (self.num_blocks as usize - 1) * BLOCK_SIZE;
(self.num_docs as usize - full_blocks) as u16
}
}
pub fn advance_shallow(&mut self, target: DocId) {
let target_val = target.as_u32();
let mut lo = self.current_block as usize;
let mut hi = self.num_blocks as usize;
while lo < hi {
let mid = lo + (hi - lo) / 2;
if self.block_last_doc(mid as u16) < target_val {
lo = mid + 1;
} else {
hi = mid;
}
}
if lo < self.num_blocks as usize {
self.current_block = lo as u16;
}
}
pub fn advance_to_block(&mut self, target: DocId) {
let target_val = target.as_u32();
let mut lo = self.current_block as usize;
let mut hi = self.num_blocks as usize;
while lo < hi {
let mid = lo + (hi - lo) / 2;
if self.block_last_doc(mid as u16) < target_val {
lo = mid + 1;
} else {
hi = mid;
}
}
if lo >= self.num_blocks as usize {
self.total_remaining = 0;
self.remaining_in_block = 0;
return;
}
self.seek_to_block(lo as u16);
}
fn seek_to_block(&mut self, block: u16) {
if block >= self.num_blocks {
self.total_remaining = 0;
self.remaining_in_block = 0;
return;
}
let docs_before_block = block as usize * BLOCK_SIZE;
self.total_remaining = self.num_docs.saturating_sub(docs_before_block as u32);
self.remaining_in_block = self.block_doc_count(block);
self.current_block = block;
self.pos_in_data = self.block_data_offsets[block as usize];
if block == 0 {
self.current_doc_id = 0;
} else {
self.current_doc_id = self.block_last_doc(block - 1);
}
}
pub fn next(&mut self) -> Option<(DocId, u32)> {
if self.total_remaining == 0 {
return None;
}
if self.remaining_in_block == 0 {
let next_block = self.current_block + 1;
if next_block >= self.num_blocks {
self.total_remaining = 0;
return None;
}
self.current_block = next_block;
self.remaining_in_block = self.block_doc_count(next_block);
self.pos_in_data = self.block_data_offsets[next_block as usize];
}
let (delta, consumed) = decode_vbyte(self.data, self.pos_in_data);
self.pos_in_data += consumed;
let (tf, consumed) = decode_vbyte(self.data, self.pos_in_data);
self.pos_in_data += consumed;
self.current_doc_id += delta;
self.remaining_in_block -= 1;
self.total_remaining -= 1;
Some((DocId(self.current_doc_id), tf))
}
pub fn current_block_idx(&self) -> u16 {
self.current_block
}
}
/// Returns `true` when `data` has a flag byte (the fifth byte) equal to
/// `FLAG_BLOCK_MAX`. Input shorter than five bytes yields `false`.
pub fn has_block_max(data: &[u8]) -> bool {
    match data.get(4) {
        Some(&flag) => flag == FLAG_BLOCK_MAX,
        None => false,
    }
}
/// Returns `true` when `data` has a flag byte (the fifth byte) equal to
/// `FLAG_HAS_POSITIONS`. Input shorter than five bytes yields `false`.
pub fn has_positions(data: &[u8]) -> bool {
    match data.get(4) {
        Some(&flag) => flag == FLAG_HAS_POSITIONS,
        None => false,
    }
}
// Unit tests for the vbyte codec and the three posting-list formats
// (plain, position-bearing, block-max).
#[cfg(test)]
mod tests {
use super::*;
// --- vbyte codec ---
// Zero encodes to a single byte and decodes back, consuming one byte.
#[test]
fn vbyte_single_byte() {
let mut buf = Vec::new();
encode_vbyte(0, &mut buf);
assert_eq!(buf.len(), 1);
let (val, consumed) = decode_vbyte(&buf, 0);
assert_eq!(val, 0);
assert_eq!(consumed, 1);
}
// 127 is the largest value that still fits in one byte.
#[test]
fn vbyte_boundary_127() {
let mut buf = Vec::new();
encode_vbyte(127, &mut buf);
assert_eq!(buf.len(), 1);
let (val, _) = decode_vbyte(&buf, 0);
assert_eq!(val, 127);
}
// 128 is the smallest value that needs two bytes.
#[test]
fn vbyte_boundary_128() {
let mut buf = Vec::new();
encode_vbyte(128, &mut buf);
assert_eq!(buf.len(), 2);
let (val, consumed) = decode_vbyte(&buf, 0);
assert_eq!(val, 128);
assert_eq!(consumed, 2);
}
// 16383 is the largest value that fits in two bytes.
#[test]
fn vbyte_16383() {
let mut buf = Vec::new();
encode_vbyte(16383, &mut buf);
assert_eq!(buf.len(), 2);
let (val, _) = decode_vbyte(&buf, 0);
assert_eq!(val, 16383);
}
// 16384 is the smallest value that needs three bytes.
#[test]
fn vbyte_16384() {
let mut buf = Vec::new();
encode_vbyte(16384, &mut buf);
assert_eq!(buf.len(), 3);
let (val, _) = decode_vbyte(&buf, 0);
assert_eq!(val, 16384);
}
// A value near the top of the u32 range takes five bytes and round-trips.
#[test]
fn vbyte_large_value() {
let mut buf = Vec::new();
let val = u32::MAX - 1;
encode_vbyte(val, &mut buf);
assert_eq!(buf.len(), 5);
let (decoded, _) = decode_vbyte(&buf, 0);
assert_eq!(decoded, val);
}
// u32::MAX round-trips.
#[test]
fn vbyte_max_value() {
let mut buf = Vec::new();
encode_vbyte(u32::MAX, &mut buf);
let (decoded, _) = decode_vbyte(&buf, 0);
assert_eq!(decoded, u32::MAX);
}
// --- plain posting list ---
// One posting written and read back.
#[test]
fn round_trip_single_doc() {
let mut writer = PostingListWriter::new();
writer.add(DocId(5), 3);
let data = writer.finish();
let mut reader = PostingListReader::new(&data);
assert_eq!(reader.len(), 1);
assert_eq!(reader.next(), Some((DocId(5), 3)));
assert_eq!(reader.next(), None);
}
// Several postings with uneven gaps come back in order.
#[test]
fn round_trip_multiple_docs() {
let mut writer = PostingListWriter::new();
writer.add(DocId(1), 2);
writer.add(DocId(5), 1);
writer.add(DocId(100), 4);
writer.add(DocId(1000), 1);
let data = writer.finish();
let mut reader = PostingListReader::new(&data);
assert_eq!(reader.len(), 4);
assert_eq!(reader.next(), Some((DocId(1), 2)));
assert_eq!(reader.next(), Some((DocId(5), 1)));
assert_eq!(reader.next(), Some((DocId(100), 4)));
assert_eq!(reader.next(), Some((DocId(1000), 1)));
assert_eq!(reader.next(), None);
}
// Consecutive doc IDs (gap of 1) starting at 0.
#[test]
fn round_trip_consecutive_doc_ids() {
let mut writer = PostingListWriter::new();
for i in 0..10 {
writer.add(DocId(i), 1);
}
let data = writer.finish();
let mut reader = PostingListReader::new(&data);
assert_eq!(reader.len(), 10);
for i in 0..10 {
assert_eq!(reader.next(), Some((DocId(i), 1)));
}
assert_eq!(reader.next(), None);
}
// Term frequencies on both sides of the one-byte vbyte boundary, including 0.
#[test]
fn round_trip_varying_tf() {
let mut writer = PostingListWriter::new();
writer.add(DocId(0), 0);
writer.add(DocId(1), 1);
writer.add(DocId(2), 127);
writer.add(DocId(3), 128);
writer.add(DocId(4), 10000);
let data = writer.finish();
let mut reader = PostingListReader::new(&data);
assert_eq!(reader.next(), Some((DocId(0), 0)));
assert_eq!(reader.next(), Some((DocId(1), 1)));
assert_eq!(reader.next(), Some((DocId(2), 127)));
assert_eq!(reader.next(), Some((DocId(3), 128)));
assert_eq!(reader.next(), Some((DocId(4), 10000)));
}
// A writer with no postings yields a list that reads as empty.
#[test]
fn empty_posting_list() {
let writer = PostingListWriter::new();
let data = writer.finish();
let mut reader = PostingListReader::new(&data);
assert_eq!(reader.len(), 0);
assert!(reader.is_empty());
assert_eq!(reader.next(), None);
}
// 10,000 postings round-trip with the expected IDs and frequencies.
#[test]
fn large_posting_list() {
let count = 10_000;
let mut writer = PostingListWriter::new();
for i in 0..count {
writer.add(DocId(i * 3), (i % 10) + 1);
}
let data = writer.finish();
let mut reader = PostingListReader::new(&data);
assert_eq!(reader.len(), count);
for i in 0..count {
let (doc_id, tf) = reader.next().unwrap();
assert_eq!(doc_id, DocId(i * 3));
assert_eq!(tf, (i % 10) + 1);
}
assert_eq!(reader.next(), None);
}
// 1000 consecutive postings must serialize to fewer than 3000 bytes.
#[test]
fn delta_encoding_compresses() {
let mut writer = PostingListWriter::new();
for i in 0..1000 {
writer.add(DocId(i), 1);
}
let data = writer.finish();
assert!(data.len() < 3000);
}
// Gaps of one million between doc IDs round-trip.
#[test]
fn wide_gap_doc_ids() {
let mut writer = PostingListWriter::new();
writer.add(DocId(0), 1);
writer.add(DocId(1_000_000), 1);
writer.add(DocId(2_000_000), 1);
let data = writer.finish();
let mut reader = PostingListReader::new(&data);
assert_eq!(reader.next(), Some((DocId(0), 1)));
assert_eq!(reader.next(), Some((DocId(1_000_000), 1)));
assert_eq!(reader.next(), Some((DocId(2_000_000), 1)));
assert_eq!(reader.next(), None);
}
// Calling next() repeatedly after exhaustion keeps returning None.
#[test]
fn iterator_exhaustion_is_stable() {
let mut writer = PostingListWriter::new();
writer.add(DocId(1), 1);
let data = writer.finish();
let mut reader = PostingListReader::new(&data);
assert!(reader.next().is_some());
assert_eq!(reader.next(), None);
assert_eq!(reader.next(), None); assert_eq!(reader.next(), None);
}
// Doc ID 0 as the first (and only) posting.
#[test]
fn doc_id_starting_at_zero() {
let mut writer = PostingListWriter::new();
writer.add(DocId(0), 5);
let data = writer.finish();
let mut reader = PostingListReader::new(&data);
assert_eq!(reader.next(), Some((DocId(0), 5)));
}
// --- position-bearing posting list ---
// One document with three positions; the positions flag is set.
#[test]
fn position_round_trip_single_doc() {
let mut writer = PositionPostingListWriter::new();
writer.add(DocId(0), &[0, 3, 7]);
let data = writer.finish();
assert!(has_positions(&data));
let mut reader = PositionPostingListReader::new(&data);
assert_eq!(reader.len(), 1);
let (doc_id, positions) = reader.next().unwrap();
assert_eq!(doc_id, DocId(0));
assert_eq!(positions, vec![0, 3, 7]);
assert!(reader.next().is_none());
}
// Several documents with different numbers of positions.
#[test]
fn position_round_trip_multiple_docs() {
let mut writer = PositionPostingListWriter::new();
writer.add(DocId(1), &[0, 1]);
writer.add(DocId(5), &[2, 5, 8]);
writer.add(DocId(10), &[0]);
let data = writer.finish();
let mut reader = PositionPostingListReader::new(&data);
assert_eq!(reader.len(), 3);
let (id, pos) = reader.next().unwrap();
assert_eq!(id, DocId(1));
assert_eq!(pos, vec![0, 1]);
let (id, pos) = reader.next().unwrap();
assert_eq!(id, DocId(5));
assert_eq!(pos, vec![2, 5, 8]);
let (id, pos) = reader.next().unwrap();
assert_eq!(id, DocId(10));
assert_eq!(pos, vec![0]);
assert!(reader.next().is_none());
}
// Adjacent positions (gap of 1) decode back to absolute values.
#[test]
fn position_consecutive_positions() {
let mut writer = PositionPostingListWriter::new();
writer.add(DocId(0), &[0, 1, 2, 3, 4]);
let data = writer.finish();
let mut reader = PositionPostingListReader::new(&data);
let (_, pos) = reader.next().unwrap();
assert_eq!(pos, vec![0, 1, 2, 3, 4]);
}
// Positions with large gaps between them.
#[test]
fn position_gapped_positions() {
let mut writer = PositionPostingListWriter::new();
writer.add(DocId(0), &[0, 100, 200, 5000]);
let data = writer.finish();
let mut reader = PositionPostingListReader::new(&data);
let (_, pos) = reader.next().unwrap();
assert_eq!(pos, vec![0, 100, 200, 5000]);
}
// A writer with no postings yields a list that reads as empty.
#[test]
fn position_empty_list() {
let writer = PositionPostingListWriter::new();
let data = writer.finish();
let mut reader = PositionPostingListReader::new(&data);
assert_eq!(reader.len(), 0);
assert!(reader.is_empty());
assert!(reader.next().is_none());
}
// A document with exactly one position.
#[test]
fn position_single_position() {
let mut writer = PositionPostingListWriter::new();
writer.add(DocId(42), &[7]);
let data = writer.finish();
let mut reader = PositionPostingListReader::new(&data);
let (id, pos) = reader.next().unwrap();
assert_eq!(id, DocId(42));
assert_eq!(pos, vec![7]);
}
// 1000 documents with two positions each round-trip in order.
#[test]
fn position_many_docs() {
let mut writer = PositionPostingListWriter::new();
for i in 0..1000u32 {
writer.add(DocId(i), &[i * 2, i * 2 + 1]);
}
let data = writer.finish();
let mut reader = PositionPostingListReader::new(&data);
assert_eq!(reader.len(), 1000);
for i in 0..1000u32 {
let (id, pos) = reader.next().unwrap();
assert_eq!(id, DocId(i));
assert_eq!(pos, vec![i * 2, i * 2 + 1]);
}
assert!(reader.next().is_none());
}
// has_positions is true for position lists and false for plain lists.
#[test]
fn has_positions_flag() {
let mut pw = PositionPostingListWriter::new();
pw.add(DocId(0), &[0]);
let pdata = pw.finish();
assert!(has_positions(&pdata));
let mut w = PostingListWriter::new();
w.add(DocId(0), 1);
let data = w.finish();
assert!(!has_positions(&data));
}
// Calling next() repeatedly after exhaustion keeps returning None.
#[test]
fn position_exhaustion_stable() {
let mut writer = PositionPostingListWriter::new();
writer.add(DocId(0), &[0]);
let data = writer.finish();
let mut reader = PositionPostingListReader::new(&data);
assert!(reader.next().is_some());
assert!(reader.next().is_none());
assert!(reader.next().is_none());
}
// --- block-max posting list ---
// One posting: one block whose header mirrors that posting.
#[test]
fn block_max_single_doc() {
let mut writer = BlockMaxPostingListWriter::new();
writer.add(DocId(5), 3);
let data = writer.finish();
assert!(has_block_max(&data));
let mut reader = BlockMaxPostingListReader::new(&data);
assert_eq!(reader.len(), 1);
assert_eq!(reader.num_blocks(), 1);
assert_eq!(reader.block_last_doc(0), 5);
assert_eq!(reader.block_max_tf(0), 3);
assert_eq!(reader.next(), Some((DocId(5), 3)));
assert_eq!(reader.next(), None);
}
// 50 postings fit in a single, partially filled block.
#[test]
fn block_max_under_block_size() {
let mut writer = BlockMaxPostingListWriter::new();
for i in 0..50 {
writer.add(DocId(i * 2), (i % 5) + 1);
}
let data = writer.finish();
let mut reader = BlockMaxPostingListReader::new(&data);
assert_eq!(reader.len(), 50);
assert_eq!(reader.num_blocks(), 1);
assert_eq!(reader.block_last_doc(0), 98);
assert_eq!(reader.block_max_tf(0), 5);
for i in 0..50 {
assert_eq!(reader.next(), Some((DocId(i * 2), (i % 5) + 1)));
}
assert_eq!(reader.next(), None);
}
// Exactly 128 postings fill one block and do not start a second.
#[test]
fn block_max_exact_block_size() {
let mut writer = BlockMaxPostingListWriter::new();
for i in 0..128 {
writer.add(DocId(i), 1);
}
let data = writer.finish();
let mut reader = BlockMaxPostingListReader::new(&data);
assert_eq!(reader.len(), 128);
assert_eq!(reader.num_blocks(), 1);
assert_eq!(reader.block_last_doc(0), 127);
for i in 0..128 {
assert_eq!(reader.next(), Some((DocId(i), 1)));
}
assert_eq!(reader.next(), None);
}
// 300 postings span three blocks; iteration crosses block boundaries.
#[test]
fn block_max_multi_block() {
let mut writer = BlockMaxPostingListWriter::new();
for i in 0..300u32 {
writer.add(DocId(i), (i % 10) + 1);
}
let data = writer.finish();
let mut reader = BlockMaxPostingListReader::new(&data);
assert_eq!(reader.len(), 300);
assert_eq!(reader.num_blocks(), 3);
assert_eq!(reader.block_last_doc(0), 127);
assert_eq!(reader.block_max_tf(0), 10);
assert_eq!(reader.block_last_doc(1), 255);
assert_eq!(reader.block_last_doc(2), 299);
for i in 0..300u32 {
assert_eq!(reader.next(), Some((DocId(i), (i % 10) + 1)));
}
assert_eq!(reader.next(), None);
}
// advance_to_block lands on the first posting of the block that may hold
// the target: 384 = 128 * 3 (start of block 1), 768 = 256 * 3 (start of block 2).
#[test]
fn block_max_advance_to_block() {
let mut writer = BlockMaxPostingListWriter::new();
for i in 0..300u32 {
writer.add(DocId(i * 3), 1);
}
let data = writer.finish();
let mut reader = BlockMaxPostingListReader::new(&data);
reader.advance_to_block(DocId(400));
let (doc, _) = reader.next().unwrap();
assert_eq!(doc, DocId(384));
reader.advance_to_block(DocId(800));
let (doc, _) = reader.next().unwrap();
assert_eq!(doc, DocId(768)); }
// A target beyond the last doc ID exhausts the reader.
#[test]
fn block_max_advance_past_end() {
let mut writer = BlockMaxPostingListWriter::new();
for i in 0..10 {
writer.add(DocId(i), 1);
}
let data = writer.finish();
let mut reader = BlockMaxPostingListReader::new(&data);
reader.advance_to_block(DocId(100));
assert_eq!(reader.next(), None);
}
// A writer with no postings yields zero blocks and an empty reader.
#[test]
fn block_max_empty() {
let writer = BlockMaxPostingListWriter::new();
let data = writer.finish();
let mut reader = BlockMaxPostingListReader::new(&data);
assert_eq!(reader.len(), 0);
assert!(reader.is_empty());
assert_eq!(reader.num_blocks(), 0);
assert_eq!(reader.next(), None);
}
// The block header clamps max tf to u16::MAX; the posting keeps the full value.
#[test]
fn block_max_large_tf_clamped() {
let mut writer = BlockMaxPostingListWriter::new();
writer.add(DocId(0), 70000); let data = writer.finish();
let mut reader = BlockMaxPostingListReader::new(&data);
assert_eq!(reader.block_max_tf(0), u16::MAX); assert_eq!(reader.next(), Some((DocId(0), 70000)));
}
// has_block_max is true only for block-max lists, and such lists do not
// report the positions flag.
#[test]
fn block_max_flag_detected() {
let mut bm = BlockMaxPostingListWriter::new();
bm.add(DocId(0), 1);
let bm_data = bm.finish();
assert!(has_block_max(&bm_data));
assert!(!has_positions(&bm_data));
let mut basic = PostingListWriter::new();
basic.add(DocId(0), 1);
let basic_data = basic.finish();
assert!(!has_block_max(&basic_data));
}
}