use std::io;
use log::debug;
use crate::codecs::codec_util;
use crate::codecs::competitive_impact::{CompetitiveImpactAccumulator, Impact, NormsLookup};
use crate::document::IndexOptions;
use crate::encoding::zigzag;
use crate::index::FieldInfo;
use crate::index::index_file_names::segment_file_name;
use crate::store::{DataOutput, DataOutputWriter, IndexOutput, SharedDirectory, VecOutput};
use super::postings_format::{
self, DOC_CODEC, DOC_EXTENSION, IntBlockTermState, META_CODEC, META_EXTENSION, POS_CODEC,
POS_EXTENSION, VERSION_CURRENT,
};
use crate::encoding::pfor::{self, BLOCK_SIZE};
struct PositionEncoder {
buffer: [i32; BLOCK_SIZE],
buffer_upto: usize,
}
impl PositionEncoder {
fn new() -> Self {
Self {
buffer: [0i32; BLOCK_SIZE],
buffer_upto: 0,
}
}
fn buffer_upto(&self) -> usize {
self.buffer_upto
}
fn add_position(&mut self, pos_delta: i32, pos_out: &mut dyn DataOutput) -> io::Result<()> {
self.buffer[self.buffer_upto] = pos_delta;
self.buffer_upto += 1;
if self.buffer_upto == BLOCK_SIZE {
let mut longs = [0i64; BLOCK_SIZE];
for (i, &val) in self.buffer.iter().enumerate().take(BLOCK_SIZE) {
longs[i] = val as i64;
}
pfor::pfor_encode(&mut longs, &mut DataOutputWriter(pos_out))?;
self.buffer_upto = 0;
}
Ok(())
}
fn finish(
&self,
pos_out: &mut dyn IndexOutput,
total_term_freq: i64,
pos_start_fp: i64,
) -> io::Result<i64> {
let last_pos_block_offset = if total_term_freq > BLOCK_SIZE as i64 {
pos_out.file_pointer() as i64 - pos_start_fp
} else {
-1
};
for &delta in &self.buffer[..self.buffer_upto] {
pos_out.write_vint(delta)?;
}
Ok(last_pos_block_offset)
}
}
struct BlockFlushState {
level0_buf: Vec<u8>,
level1_buf: Vec<u8>,
scratch_buf: Vec<u8>,
bitset_buf: [u64; BLOCK_SIZE / 2],
level0_last_doc_id: i32,
level0_last_pos_fp: i64,
write_freqs: bool,
write_positions: bool,
max_num_impacts_at_level0: i32,
max_impact_num_bytes_at_level0: i32,
max_num_impacts_at_level1: i32,
max_impact_num_bytes_at_level1: i32,
level0_accumulator: CompetitiveImpactAccumulator,
level1_accumulator: CompetitiveImpactAccumulator,
}
impl BlockFlushState {
fn new(pos_start_fp: i64, write_freqs: bool, write_positions: bool) -> Self {
Self {
level0_buf: Vec::new(),
level1_buf: Vec::new(),
scratch_buf: Vec::new(),
bitset_buf: [0u64; BLOCK_SIZE / 2],
level0_last_doc_id: -1,
level0_last_pos_fp: pos_start_fp,
write_freqs,
write_positions,
max_num_impacts_at_level0: 0,
max_impact_num_bytes_at_level0: 0,
max_num_impacts_at_level1: 0,
max_impact_num_bytes_at_level1: 0,
level0_accumulator: CompetitiveImpactAccumulator::new(),
level1_accumulator: CompetitiveImpactAccumulator::new(),
}
}
fn flush_doc_block(
&mut self,
doc_delta_buffer: &[i32; BLOCK_SIZE],
freq_buffer: &mut [i32; BLOCK_SIZE],
last_doc_id: i32,
pos_buffer_upto: usize,
pos_fp: i64,
) -> io::Result<()> {
self.level0_buf.clear();
self.scratch_buf.clear();
if self.write_freqs {
let impacts = self.level0_accumulator.get_competitive_freq_norm_pairs();
let impact_start = self.scratch_buf.len();
let impact_bytes_len = write_impacts(&impacts, &mut self.scratch_buf)?;
if impacts.len() as i32 > self.max_num_impacts_at_level0 {
self.max_num_impacts_at_level0 = impacts.len() as i32;
}
if impact_bytes_len as i32 > self.max_impact_num_bytes_at_level0 {
self.max_impact_num_bytes_at_level0 = impact_bytes_len as i32;
}
{
let mut enc = VecOutput(&mut self.level0_buf);
enc.write_vlong(impact_bytes_len as i64)?;
}
let impact_end = impact_start + impact_bytes_len;
self.level0_buf
.extend_from_slice(&self.scratch_buf[impact_start..impact_end]);
self.scratch_buf.clear();
if self.write_positions {
let mut enc = VecOutput(&mut self.level0_buf);
enc.write_vlong(pos_fp - self.level0_last_pos_fp)?;
enc.write_byte(pos_buffer_upto as u8)?;
}
}
let num_skip_bytes_before_doc = self.level0_buf.len();
let doc_range = (last_doc_id - self.level0_last_doc_id) as usize;
let bpv = pfor::for_delta_bits_required(doc_delta_buffer);
if doc_range == BLOCK_SIZE {
self.level0_buf.push(0u8);
} else {
let num_bitset_longs = bits2words(doc_range);
let num_bits_next_bpv = (bpv + 1).min(32) as usize * BLOCK_SIZE;
if num_bits_next_bpv <= doc_range {
self.level0_buf.push(bpv as u8);
pfor::for_delta_encode(bpv, doc_delta_buffer, &mut self.level0_buf)?;
} else {
assert!(num_bitset_longs <= BLOCK_SIZE / 2);
self.level0_buf.push((-(num_bitset_longs as i8)) as u8);
self.bitset_buf[..num_bitset_longs].fill(0);
let mut s: i32 = -1;
for &delta in doc_delta_buffer.iter() {
s += delta;
let word = s as usize / 64;
let bit = s as usize % 64;
self.bitset_buf[word] |= 1u64 << bit;
}
let mut enc = VecOutput(&mut self.level0_buf);
for &word in &self.bitset_buf[..num_bitset_longs] {
enc.write_le_long(word as i64)?;
}
}
}
if self.write_freqs {
let mut longs = [0i64; BLOCK_SIZE];
for i in 0..BLOCK_SIZE {
longs[i] = freq_buffer[i] as i64;
}
pfor::pfor_encode(&mut longs, &mut self.level0_buf)?;
}
let doc_delta = last_doc_id - self.level0_last_doc_id;
write_vint15(&mut VecOutput(&mut self.scratch_buf), doc_delta)?;
write_vlong15(
&mut VecOutput(&mut self.scratch_buf),
self.level0_buf.len() as i64,
)?;
let num_skip_bytes = num_skip_bytes_before_doc + self.scratch_buf.len();
{
let mut enc = VecOutput(&mut self.level1_buf);
enc.write_vlong(num_skip_bytes as i64)?;
}
self.level1_buf.extend_from_slice(&self.scratch_buf);
self.level1_buf.extend_from_slice(&self.level0_buf);
self.level0_last_doc_id = last_doc_id;
self.level0_last_pos_fp = pos_fp;
if self.write_freqs {
self.level1_accumulator.add_all(&self.level0_accumulator);
}
self.level0_accumulator.clear();
Ok(())
}
}
pub struct PostingsWriter {
doc_out: Box<dyn IndexOutput>,
pos_out: Option<Box<dyn IndexOutput>>,
meta_out: Box<dyn IndexOutput>,
max_num_impacts_at_level0: i32,
max_impact_num_bytes_at_level0: i32,
max_num_impacts_at_level1: i32,
max_impact_num_bytes_at_level1: i32,
}
impl PostingsWriter {
pub fn new(
directory: &SharedDirectory,
segment: &str,
suffix: &str,
id: &[u8; 16],
has_positions: bool,
) -> io::Result<Self> {
let doc_name = segment_file_name(segment, suffix, DOC_EXTENSION);
let meta_name = segment_file_name(segment, suffix, META_EXTENSION);
let (mut doc_out, mut meta_out, mut pos_out) = {
let mut dir = directory.lock().unwrap();
let doc_out = dir.create_output(&doc_name)?;
let meta_out = dir.create_output(&meta_name)?;
let pos_out = if has_positions {
let pos_name = segment_file_name(segment, suffix, POS_EXTENSION);
Some(dir.create_output(&pos_name)?)
} else {
None
};
(doc_out, meta_out, pos_out)
};
codec_util::write_index_header(&mut *doc_out, DOC_CODEC, VERSION_CURRENT, id, suffix)?;
codec_util::write_index_header(&mut *meta_out, META_CODEC, VERSION_CURRENT, id, suffix)?;
if let Some(ref mut pos_out) = pos_out {
codec_util::write_index_header(&mut **pos_out, POS_CODEC, VERSION_CURRENT, id, suffix)?;
}
Ok(Self {
doc_out,
pos_out,
meta_out,
max_num_impacts_at_level0: 0,
max_impact_num_bytes_at_level0: 0,
max_num_impacts_at_level1: 0,
max_impact_num_bytes_at_level1: 0,
})
}
pub fn write_term(
&mut self,
postings: &[(i32, i32, &[i32])], index_options: IndexOptions,
norms: &NormsLookup,
) -> io::Result<IntBlockTermState> {
let doc_freq = postings.len() as i32;
let write_freqs = index_options.has_freqs();
let write_positions = index_options.has_positions();
let total_term_freq: i64 = if write_freqs {
postings.iter().map(|&(_, freq, _)| freq as i64).sum()
} else {
-1
};
let doc_start_fp = self.doc_out.file_pointer() as i64;
let pos_start_fp = self
.pos_out
.as_ref()
.map(|p| p.file_pointer() as i64)
.unwrap_or(0);
if doc_freq == 1 {
let singleton_doc_id = postings[0].0;
debug!(
"postings_writer: singleton term, doc_id={}",
singleton_doc_id
);
let mut last_pos_block_offset = -1i64;
if write_positions && let Some(ref mut pos_out) = self.pos_out {
let (_, _, positions) = postings[0];
let mut pe = PositionEncoder::new();
let mut last_pos = 0i32;
for &pos in positions {
pe.add_position(pos - last_pos, &mut **pos_out)?;
last_pos = pos;
}
last_pos_block_offset = pe.finish(&mut **pos_out, total_term_freq, pos_start_fp)?;
}
Ok(IntBlockTermState {
doc_freq,
total_term_freq,
doc_start_fp,
pos_start_fp,
last_pos_block_offset,
singleton_doc_id,
})
} else if doc_freq >= BLOCK_SIZE as i32 {
debug!("postings_writer: block encoding, doc_freq={}", doc_freq);
self.write_term_blocks(
postings,
doc_freq,
total_term_freq,
doc_start_fp,
pos_start_fp,
write_freqs,
write_positions,
norms,
)
} else {
let mut doc_deltas: Vec<i32> = Vec::with_capacity(doc_freq as usize);
let mut freqs: Vec<i32> = Vec::with_capacity(doc_freq as usize);
let mut last_doc_id = -1i32;
for &(doc_id, freq, _) in postings {
let delta = doc_id - last_doc_id;
doc_deltas.push(delta);
freqs.push(freq);
last_doc_id = doc_id;
}
write_vint_block(
&mut *self.doc_out,
&mut doc_deltas,
&freqs,
doc_freq as usize,
write_freqs,
)?;
debug!(
"postings_writer: VInt block, doc_freq={}, bytes_written={}",
doc_freq,
self.doc_out.file_pointer() as i64 - doc_start_fp
);
let mut last_pos_block_offset = -1i64;
if write_positions && let Some(ref mut pos_out) = self.pos_out {
let mut pe = PositionEncoder::new();
for &(_, _, positions) in postings {
let mut last_pos = 0i32;
for &pos in positions {
pe.add_position(pos - last_pos, &mut **pos_out)?;
last_pos = pos;
}
}
last_pos_block_offset = pe.finish(&mut **pos_out, total_term_freq, pos_start_fp)?;
}
Ok(IntBlockTermState {
doc_freq,
total_term_freq,
doc_start_fp,
pos_start_fp,
last_pos_block_offset,
singleton_doc_id: -1,
})
}
}
#[allow(clippy::too_many_arguments)]
fn write_term_blocks(
&mut self,
postings: &[(i32, i32, &[i32])],
doc_freq: i32,
total_term_freq: i64,
doc_start_fp: i64,
pos_start_fp: i64,
write_freqs: bool,
write_positions: bool,
norms: &NormsLookup,
) -> io::Result<IntBlockTermState> {
let mut doc_delta_buffer = [0i32; BLOCK_SIZE];
let mut freq_buffer = [0i32; BLOCK_SIZE];
let mut doc_buffer_upto = 0usize;
let mut pe = PositionEncoder::new();
let mut bfs = BlockFlushState::new(pos_start_fp, write_freqs, write_positions);
let mut last_doc_id = -1i32;
let mut doc_count = 0usize;
for &(doc_id, freq, positions) in postings {
let doc_delta = doc_id - last_doc_id;
doc_delta_buffer[doc_buffer_upto] = doc_delta;
if write_freqs {
freq_buffer[doc_buffer_upto] = freq;
bfs.level0_accumulator.add(freq, norms.get(doc_id));
}
last_doc_id = doc_id;
if write_positions && let Some(ref mut pos_out) = self.pos_out {
let mut last_pos = 0i32;
for &pos in positions {
pe.add_position(pos - last_pos, &mut **pos_out)?;
last_pos = pos;
}
}
doc_buffer_upto += 1;
doc_count += 1;
if doc_buffer_upto == BLOCK_SIZE {
bfs.flush_doc_block(
&doc_delta_buffer,
&mut freq_buffer,
doc_id,
pe.buffer_upto(),
self.pos_out
.as_ref()
.map(|p| p.file_pointer() as i64)
.unwrap_or(0),
)?;
doc_buffer_upto = 0;
}
}
if doc_buffer_upto > 0 {
let mut tail_deltas = doc_delta_buffer[..doc_buffer_upto].to_vec();
write_vint_block_to_buf(
&mut bfs.level1_buf,
&mut tail_deltas,
&freq_buffer[..doc_buffer_upto],
doc_buffer_upto,
write_freqs,
)?;
}
assert!(
doc_count <= postings_format::LEVEL1_NUM_DOCS,
"doc_count {} exceeds LEVEL1_NUM_DOCS {} — level1 skip not yet implemented",
doc_count,
postings_format::LEVEL1_NUM_DOCS,
);
self.doc_out.write_bytes(&bfs.level1_buf)?;
let mut last_pos_block_offset = -1i64;
if write_positions && let Some(ref mut pos_out) = self.pos_out {
last_pos_block_offset = pe.finish(&mut **pos_out, total_term_freq, pos_start_fp)?;
}
self.max_num_impacts_at_level0 = self
.max_num_impacts_at_level0
.max(bfs.max_num_impacts_at_level0);
self.max_impact_num_bytes_at_level0 = self
.max_impact_num_bytes_at_level0
.max(bfs.max_impact_num_bytes_at_level0);
self.max_num_impacts_at_level1 = self
.max_num_impacts_at_level1
.max(bfs.max_num_impacts_at_level1);
self.max_impact_num_bytes_at_level1 = self
.max_impact_num_bytes_at_level1
.max(bfs.max_impact_num_bytes_at_level1);
Ok(IntBlockTermState {
doc_freq,
total_term_freq,
doc_start_fp,
pos_start_fp,
last_pos_block_offset,
singleton_doc_id: -1,
})
}
pub fn encode_term(
&self,
out: &mut Vec<u8>,
_field_info: &FieldInfo,
state: &IntBlockTermState,
last_state: &IntBlockTermState,
write_positions: bool,
) -> io::Result<()> {
let mut buf = VecOutput(out);
if last_state.singleton_doc_id != -1
&& state.singleton_doc_id != -1
&& state.doc_start_fp == last_state.doc_start_fp
{
let delta = state.singleton_doc_id as i64 - last_state.singleton_doc_id as i64;
let encoded = zigzag::encode_i64(delta);
buf.write_vlong((encoded << 1) | 0x01)?;
} else {
buf.write_vlong((state.doc_start_fp - last_state.doc_start_fp) << 1)?;
if state.singleton_doc_id != -1 {
buf.write_vint(state.singleton_doc_id)?;
}
}
if write_positions {
buf.write_vlong(state.pos_start_fp - last_state.pos_start_fp)?;
}
if write_positions && state.last_pos_block_offset != -1 {
buf.write_vlong(state.last_pos_block_offset)?;
}
Ok(())
}
pub fn finish(mut self) -> io::Result<Vec<String>> {
let mut names = Vec::new();
codec_util::write_footer(&mut *self.doc_out)?;
if let Some(ref mut pos_out) = self.pos_out {
codec_util::write_footer(&mut **pos_out)?;
}
self.meta_out.write_le_int(self.max_num_impacts_at_level0)?;
self.meta_out
.write_le_int(self.max_impact_num_bytes_at_level0)?;
self.meta_out.write_le_int(self.max_num_impacts_at_level1)?;
self.meta_out
.write_le_int(self.max_impact_num_bytes_at_level1)?;
self.meta_out
.write_le_long(self.doc_out.file_pointer() as i64)?;
if let Some(ref pos_out) = self.pos_out {
self.meta_out.write_le_long(pos_out.file_pointer() as i64)?;
}
codec_util::write_footer(&mut *self.meta_out)?;
names.push(self.doc_out.name().to_string());
if let Some(ref pos_out) = self.pos_out {
names.push(pos_out.name().to_string());
}
names.push(self.meta_out.name().to_string());
Ok(names)
}
}
fn write_vint_block(
out: &mut dyn DataOutput,
doc_deltas: &mut [i32],
freqs: &[i32],
num: usize,
write_freqs: bool,
) -> io::Result<()> {
write_vint_block_impl(out, doc_deltas, freqs, num, write_freqs)
}
fn write_vlong15(out: &mut dyn DataOutput, v: i64) -> io::Result<()> {
debug_assert!(v >= 0);
if (v & !0x7FFF_i64) == 0 {
out.write_le_short(v as i16)?;
} else {
out.write_le_short((0x8000 | (v & 0x7FFF)) as i16)?;
out.write_vlong(v >> 15)?;
}
Ok(())
}
fn write_vint15(out: &mut dyn DataOutput, v: i32) -> io::Result<()> {
debug_assert!(v >= 0);
write_vlong15(out, v as i64)
}
fn write_impacts(impacts: &[Impact], buf: &mut Vec<u8>) -> io::Result<usize> {
let start = buf.len();
let mut enc = VecOutput(buf);
let mut prev = Impact { freq: 0, norm: 0 };
for &impact in impacts {
let freq_delta = impact.freq - prev.freq - 1;
let norm_delta = impact.norm - prev.norm - 1;
if norm_delta == 0 {
enc.write_vint(freq_delta << 1)?;
} else {
enc.write_vint((freq_delta << 1) | 1)?;
enc.write_zlong(norm_delta)?;
}
prev = impact;
}
let bytes_written = enc.0.len() - start;
Ok(bytes_written)
}
fn bits2words(num_bits: usize) -> usize {
num_bits.div_ceil(64)
}
fn write_vint_block_to_buf(
buf: &mut Vec<u8>,
doc_deltas: &mut [i32],
freqs: &[i32],
num: usize,
write_freqs: bool,
) -> io::Result<()> {
let mut enc = VecOutput(buf);
write_vint_block_impl(&mut enc, doc_deltas, freqs, num, write_freqs)
}
fn write_vint_block_impl(
out: &mut dyn DataOutput,
doc_deltas: &mut [i32],
freqs: &[i32],
num: usize,
write_freqs: bool,
) -> io::Result<()> {
if write_freqs {
for i in 0..num {
doc_deltas[i] = (doc_deltas[i] << 1) | if freqs[i] == 1 { 1 } else { 0 };
}
}
out.write_group_vints(doc_deltas, num)?;
if write_freqs {
for &freq in &freqs[..num] {
if freq != 1 {
out.write_vint(freq)?;
}
}
}
Ok(())
}
#[cfg(test)]
mod tests {
use super::*;
use crate::index::PointDimensionConfig;
use crate::store::{MemoryDirectory, SharedDirectory};
fn test_directory() -> SharedDirectory {
SharedDirectory::new(Box::new(MemoryDirectory::new()))
}
#[test]
fn test_singleton_term() {
let dir = test_directory();
let mut pw = PostingsWriter::new(&dir, "_0", "", &[0u8; 16], false).unwrap();
let postings = vec![(5, 1, [].as_slice())];
let state = pw
.write_term(
&postings,
IndexOptions::DocsAndFreqs,
&NormsLookup::no_norms(),
)
.unwrap();
assert_eq!(state.doc_freq, 1);
assert_eq!(state.singleton_doc_id, 5);
let header_size = codec_util::index_header_length(DOC_CODEC, "");
assert_eq!(pw.doc_out.file_pointer() as usize, header_size);
}
#[test]
fn test_multi_doc_term_with_freqs() {
let dir = test_directory();
let mut pw = PostingsWriter::new(&dir, "_0", "", &[0u8; 16], false).unwrap();
let postings = vec![
(0, 1, [].as_slice()),
(5, 3, [].as_slice()),
(10, 1, [].as_slice()),
];
let header_size = pw.doc_out.file_pointer();
let state = pw
.write_term(
&postings,
IndexOptions::DocsAndFreqs,
&NormsLookup::no_norms(),
)
.unwrap();
assert_eq!(state.doc_freq, 3);
assert_eq!(state.singleton_doc_id, -1);
assert_eq!(state.total_term_freq, 5); assert_eq!(state.doc_start_fp, header_size as i64);
assert_gt!(pw.doc_out.file_pointer(), header_size);
}
#[test]
fn test_multi_doc_term_docs_only() {
let dir = test_directory();
let mut pw = PostingsWriter::new(&dir, "_0", "", &[0u8; 16], false).unwrap();
let postings = vec![(7, 1, [].as_slice()), (11, 1, [].as_slice())];
let header_size = pw.doc_out.file_pointer();
let state = pw
.write_term(&postings, IndexOptions::Docs, &NormsLookup::no_norms())
.unwrap();
assert_eq!(state.doc_freq, 2);
assert_eq!(state.singleton_doc_id, -1);
assert_eq!(state.total_term_freq, -1); assert_gt!(pw.doc_out.file_pointer(), header_size);
}
#[test]
fn test_term_with_positions_vint_tail() {
let dir = test_directory();
let mut pw = PostingsWriter::new(&dir, "_0", "", &[0u8; 16], true).unwrap();
let positions0 = vec![0, 5, 10];
let positions1 = vec![3];
let postings = vec![(0, 3, positions0.as_slice()), (2, 1, positions1.as_slice())];
let state = pw
.write_term(
&postings,
IndexOptions::DocsAndFreqsAndPositions,
&NormsLookup::no_norms(),
)
.unwrap();
assert_eq!(state.doc_freq, 2);
assert_eq!(state.total_term_freq, 4);
assert_ge!(state.pos_start_fp, 0);
assert_eq!(state.last_pos_block_offset, -1);
let names = pw.finish().unwrap();
let pos_name = names.iter().find(|n| n.ends_with(".pos")).unwrap();
let pos_data = dir.lock().unwrap().read_file(pos_name).unwrap();
let pos_header_size = codec_util::index_header_length(POS_CODEC, "");
let footer_size = codec_util::FOOTER_LENGTH;
let pos_bytes = &pos_data[pos_header_size..pos_data.len() - footer_size];
assert_len_eq_x!(&pos_bytes, 4);
assert_eq!(pos_bytes[0], 0); assert_eq!(pos_bytes[1], 5); assert_eq!(pos_bytes[2], 5); assert_eq!(pos_bytes[3], 3); }
#[test]
fn test_term_with_positions_pfor_one_block() {
let dir = test_directory();
let mut pw = PostingsWriter::new(&dir, "_0", "", &[0u8; 16], true).unwrap();
let positions: Vec<i32> = (0..postings_format::BLOCK_SIZE as i32).collect();
let postings = vec![(0, postings_format::BLOCK_SIZE as i32, positions.as_slice())];
let state = pw
.write_term(
&postings,
IndexOptions::DocsAndFreqsAndPositions,
&NormsLookup::no_norms(),
)
.unwrap();
assert_eq!(state.doc_freq, 1);
assert_eq!(state.total_term_freq, postings_format::BLOCK_SIZE as i64);
assert_eq!(state.singleton_doc_id, 0);
assert_eq!(state.last_pos_block_offset, -1);
let pos_out = pw.pos_out.as_ref().unwrap();
let pos_header_size = codec_util::index_header_length(POS_CODEC, "");
let pos_data_len = pos_out.file_pointer() as usize - pos_header_size;
assert_gt!(pos_data_len, 0);
}
#[test]
fn test_term_with_positions_pfor_plus_tail() {
let dir = test_directory();
let mut pw = PostingsWriter::new(&dir, "_0", "", &[0u8; 16], true).unwrap();
let positions0: Vec<i32> = (0..128).collect();
let positions1: Vec<i32> = vec![0, 5];
let postings = vec![
(0, 128, positions0.as_slice()),
(1, 2, positions1.as_slice()),
];
let state = pw
.write_term(
&postings,
IndexOptions::DocsAndFreqsAndPositions,
&NormsLookup::no_norms(),
)
.unwrap();
assert_eq!(state.doc_freq, 2);
assert_eq!(state.total_term_freq, 130);
assert_ne!(state.last_pos_block_offset, -1);
assert_gt!(state.last_pos_block_offset, 0);
let pos_out = pw.pos_out.as_ref().unwrap();
let pos_header_size = codec_util::index_header_length(POS_CODEC, "");
let pos_data_len = pos_out.file_pointer() as usize - pos_header_size;
assert_gt!(
pos_data_len,
0,
"PFOR block + VInt tail should produce output"
);
assert_lt!(
state.last_pos_block_offset as usize,
pos_data_len,
"last_pos_block_offset ({}) should be < total pos data ({})",
state.last_pos_block_offset,
pos_data_len
);
}
#[test]
fn test_term_with_positions_pfor_multiple_blocks() {
let dir = test_directory();
let mut pw = PostingsWriter::new(&dir, "_0", "", &[0u8; 16], true).unwrap();
let positions: Vec<i32> = (0..300).collect();
let postings = vec![(0, 300, positions.as_slice())];
let state = pw
.write_term(
&postings,
IndexOptions::DocsAndFreqsAndPositions,
&NormsLookup::no_norms(),
)
.unwrap();
assert_eq!(state.doc_freq, 1);
assert_eq!(state.total_term_freq, 300);
assert_ne!(state.last_pos_block_offset, -1);
let pos_out = pw.pos_out.as_ref().unwrap();
let pos_header_size = codec_util::index_header_length(POS_CODEC, "");
let pos_data_len = pos_out.file_pointer() as usize - pos_header_size;
assert_gt!(
pos_data_len,
44,
"2 PFOR blocks + tail should be larger than tail alone"
);
assert_lt!(
state.last_pos_block_offset as usize,
pos_data_len,
"last_pos_block_offset ({}) should be < total pos data ({})",
state.last_pos_block_offset,
pos_data_len
);
}
#[test]
fn test_term_with_positions_exactly_two_blocks() {
let dir = test_directory();
let mut pw = PostingsWriter::new(&dir, "_0", "", &[0u8; 16], true).unwrap();
let positions: Vec<i32> = (0..256).collect();
let postings = vec![(0, 256, positions.as_slice())];
let state = pw
.write_term(
&postings,
IndexOptions::DocsAndFreqsAndPositions,
&NormsLookup::no_norms(),
)
.unwrap();
assert_eq!(state.total_term_freq, 256);
assert_ne!(state.last_pos_block_offset, -1);
let pos_out = pw.pos_out.as_ref().unwrap();
let pos_header_size = codec_util::index_header_length(POS_CODEC, "");
let pos_data_len = pos_out.file_pointer() as usize - pos_header_size;
assert_gt!(pos_data_len, 0);
assert_eq!(
state.last_pos_block_offset as usize, pos_data_len,
"With no VInt tail, last_pos_block_offset should equal total pos data length"
);
}
#[test]
fn test_term_with_positions_multi_doc_block_boundary() {
let dir = test_directory();
let mut pw = PostingsWriter::new(&dir, "_0", "", &[0u8; 16], true).unwrap();
let positions0: Vec<i32> = (0..100).collect();
let positions1: Vec<i32> = (0..50).collect();
let postings = vec![
(0, 100, positions0.as_slice()),
(1, 50, positions1.as_slice()),
];
let state = pw
.write_term(
&postings,
IndexOptions::DocsAndFreqsAndPositions,
&NormsLookup::no_norms(),
)
.unwrap();
assert_eq!(state.total_term_freq, 150);
assert_ne!(state.last_pos_block_offset, -1);
let pos_out = pw.pos_out.as_ref().unwrap();
let pos_header_size = codec_util::index_header_length(POS_CODEC, "");
assert_gt!(pos_out.file_pointer() as usize, pos_header_size);
}
#[test]
fn test_term_with_positions_large_deltas() {
let dir = test_directory();
let mut pw = PostingsWriter::new(&dir, "_0", "", &[0u8; 16], true).unwrap();
let positions: Vec<i32> = (0..128).map(|i| i * 1000).collect();
let postings = vec![(0, 128, positions.as_slice())];
let state = pw
.write_term(
&postings,
IndexOptions::DocsAndFreqsAndPositions,
&NormsLookup::no_norms(),
)
.unwrap();
assert_eq!(state.total_term_freq, 128);
assert_eq!(state.last_pos_block_offset, -1);
let pos_out = pw.pos_out.as_ref().unwrap();
let pos_header_size = codec_util::index_header_length(POS_CODEC, "");
let pos_data_len = pos_out.file_pointer() as usize - pos_header_size;
assert!(
pos_data_len > 16 && pos_data_len < 200,
"PFOR block with 10bpv should be ~161 bytes, got {}",
pos_data_len
);
}
#[test]
fn test_term_with_positions_pfor_compresses_real_data() {
let dir = test_directory();
let mut pw = PostingsWriter::new(&dir, "_0", "", &[0u8; 16], true).unwrap();
let mut positions = Vec::with_capacity(256);
let mut pos = 0i32;
for i in 0..256 {
positions.push(pos);
pos += match i % 10 {
0 => 15,
1 => 8,
_ => 1 + (i % 3),
};
}
let postings = vec![(0, 256, positions.as_slice())];
let state = pw
.write_term(
&postings,
IndexOptions::DocsAndFreqsAndPositions,
&NormsLookup::no_norms(),
)
.unwrap();
assert_eq!(state.total_term_freq, 256);
assert_ne!(state.last_pos_block_offset, -1);
let pos_out = pw.pos_out.as_ref().unwrap();
let pos_header_size = codec_util::index_header_length(POS_CODEC, "");
let pos_data_len = pos_out.file_pointer() as usize - pos_header_size;
assert_lt!(
pos_data_len,
256,
"PFOR should compress 256 varied position deltas below 256 bytes, got {}",
pos_data_len
);
}
#[test]
fn test_encode_term_singleton() {
let dir = test_directory();
let pw = PostingsWriter::new(&dir, "_0", "", &[0u8; 16], false).unwrap();
let fi = FieldInfo::new(
"test".to_string(),
0,
false,
false,
IndexOptions::DocsAndFreqs,
crate::document::DocValuesType::None,
PointDimensionConfig::default(),
);
let empty = IntBlockTermState::new();
let state = IntBlockTermState {
doc_freq: 1,
total_term_freq: 1,
doc_start_fp: 0,
pos_start_fp: 0,
last_pos_block_offset: -1,
singleton_doc_id: 5,
};
let mut buf = Vec::new();
pw.encode_term(&mut buf, &fi, &state, &empty, false)
.unwrap();
assert_eq!(buf[0], 0);
assert_eq!(buf[1], 5);
}
#[test]
fn test_finish_produces_files() {
let dir = test_directory();
let pw = PostingsWriter::new(&dir, "_0", "", &[0u8; 16], true).unwrap();
let names = pw.finish().unwrap();
assert_len_eq_x!(&names, 3);
assert_ends_with!(names[0], ".doc");
assert_ends_with!(names[1], ".pos");
assert_ends_with!(names[2], ".psm");
for name in &names {
let data = dir.lock().unwrap().read_file(name).unwrap();
assert_ge!(
data.len(),
codec_util::FOOTER_LENGTH,
"file {name} too small: {} bytes",
data.len()
);
}
}
#[test]
fn test_vint15_small_value() {
let mut buf = Vec::new();
write_vint15(&mut VecOutput(&mut buf), 0).unwrap();
assert_len_eq_x!(&buf, 2);
let mut buf = Vec::new();
write_vint15(&mut VecOutput(&mut buf), 0x7FFF).unwrap();
assert_len_eq_x!(&buf, 2);
}
#[test]
fn test_vint15_large_value() {
let mut buf = Vec::new();
write_vint15(&mut VecOutput(&mut buf), 0x8000).unwrap();
assert_gt!(buf.len(), 2); assert_ne!(buf[1] & 0x80, 0, "high bit of short should be set");
}
#[test]
fn test_vlong15_roundtrip() {
for v in [0i64, 1, 127, 128, 0x7FFF, 0x8000, 0xFFFF, 100000] {
let mut buf = Vec::new();
write_vlong15(&mut VecOutput(&mut buf), v).unwrap();
assert_not_empty!(buf);
}
}
#[test]
fn test_write_impacts_single_norm_one() {
let impacts = [Impact { freq: 1, norm: 1 }];
let mut buf = Vec::new();
let len = write_impacts(&impacts, &mut buf).unwrap();
assert_eq!(len, 1);
assert_eq!(buf[0], 0);
let impacts = [Impact { freq: 5, norm: 1 }];
let mut buf = Vec::new();
let len = write_impacts(&impacts, &mut buf).unwrap();
assert_eq!(len, 1);
assert_eq!(buf[0], 8);
let impacts = [Impact { freq: 100, norm: 1 }];
let mut buf = Vec::new();
let len = write_impacts(&impacts, &mut buf).unwrap();
assert_eq!(len, 2);
}
#[test]
fn test_write_impacts_with_norm() {
let impacts = [Impact { freq: 3, norm: 5 }];
let mut buf = Vec::new();
let len = write_impacts(&impacts, &mut buf).unwrap();
assert_ge!(len, 2); assert_eq!(buf[0], 5);
let impacts = [Impact { freq: 3, norm: 5 }, Impact { freq: 10, norm: 13 }];
let mut buf = Vec::new();
let len = write_impacts(&impacts, &mut buf).unwrap();
assert_gt!(len, 2); }
#[test]
fn test_block_encoding_128_docs() {
let dir = test_directory();
let mut pw = PostingsWriter::new(&dir, "_0", "", &[0u8; 16], false).unwrap();
let postings: Vec<(i32, i32, &[i32])> =
(0..128).map(|i| (i, 1i32, [].as_slice())).collect();
let header_size = pw.doc_out.file_pointer();
let state = pw
.write_term(
&postings,
IndexOptions::DocsAndFreqs,
&NormsLookup::no_norms(),
)
.unwrap();
assert_eq!(state.doc_freq, 128);
assert_eq!(state.singleton_doc_id, -1);
assert_gt!(pw.doc_out.file_pointer(), header_size);
let written = pw.doc_out.file_pointer() - header_size;
assert_ge!(
written,
8,
"block encoding should produce output, got {written}"
);
}
#[test]
fn test_block_encoding_130_docs() {
let dir = test_directory();
let mut pw = PostingsWriter::new(&dir, "_0", "", &[0u8; 16], false).unwrap();
let postings: Vec<(i32, i32, &[i32])> =
(0..130).map(|i| (i, 1i32, [].as_slice())).collect();
let header_size = pw.doc_out.file_pointer();
let state = pw
.write_term(
&postings,
IndexOptions::DocsAndFreqs,
&NormsLookup::no_norms(),
)
.unwrap();
assert_eq!(state.doc_freq, 130);
assert_eq!(state.singleton_doc_id, -1);
let written = pw.doc_out.file_pointer() - header_size;
assert_gt!(
written,
10,
"block + tail should produce output, got {written}"
);
}
#[test]
fn test_block_encoding_256_docs() {
let dir = test_directory();
let mut pw = PostingsWriter::new(&dir, "_0", "", &[0u8; 16], false).unwrap();
let postings: Vec<(i32, i32, &[i32])> =
(0..256).map(|i| (i, 1i32, [].as_slice())).collect();
let header_size = pw.doc_out.file_pointer();
let state = pw
.write_term(
&postings,
IndexOptions::DocsAndFreqs,
&NormsLookup::no_norms(),
)
.unwrap();
assert_eq!(state.doc_freq, 256);
let written = pw.doc_out.file_pointer() - header_size;
assert_ge!(written, 16);
}
#[test]
fn test_block_encoding_with_positions() {
let dir = test_directory();
let mut pw = PostingsWriter::new(&dir, "_0", "", &[0u8; 16], true).unwrap();
let positions = vec![0i32, 1];
let postings: Vec<(i32, i32, &[i32])> =
(0..128).map(|i| (i, 2i32, positions.as_slice())).collect();
let state = pw
.write_term(
&postings,
IndexOptions::DocsAndFreqsAndPositions,
&NormsLookup::no_norms(),
)
.unwrap();
assert_eq!(state.doc_freq, 128);
assert_eq!(state.total_term_freq, 256); assert_ne!(state.last_pos_block_offset, -1);
let pos_out = pw.pos_out.as_ref().unwrap();
let pos_header_size = codec_util::index_header_length(POS_CODEC, "");
assert_gt!(pos_out.file_pointer() as usize, pos_header_size);
}
#[test]
fn test_block_encoding_consecutive_docs() {
let dir = test_directory();
let mut pw = PostingsWriter::new(&dir, "_0", "", &[0u8; 16], false).unwrap();
let postings: Vec<(i32, i32, &[i32])> =
(0..128).map(|i| (i, 1i32, [].as_slice())).collect();
let state = pw
.write_term(
&postings,
IndexOptions::DocsAndFreqs,
&NormsLookup::no_norms(),
)
.unwrap();
assert_eq!(state.doc_freq, 128);
}
#[test]
fn test_block_encoding_sparse_docs() {
let dir = test_directory();
let mut pw = PostingsWriter::new(&dir, "_0", "", &[0u8; 16], false).unwrap();
let postings: Vec<(i32, i32, &[i32])> =
(0..128).map(|i| (i * 100, 1i32, [].as_slice())).collect();
let state = pw
.write_term(
&postings,
IndexOptions::DocsAndFreqs,
&NormsLookup::no_norms(),
)
.unwrap();
assert_eq!(state.doc_freq, 128);
assert_eq!(state.singleton_doc_id, -1);
}
#[test]
fn test_block_encoding_docs_only() {
let dir = test_directory();
let mut pw = PostingsWriter::new(&dir, "_0", "", &[0u8; 16], false).unwrap();
let postings: Vec<(i32, i32, &[i32])> =
(0..128).map(|i| (i, 1i32, [].as_slice())).collect();
let state = pw
.write_term(&postings, IndexOptions::Docs, &NormsLookup::no_norms())
.unwrap();
assert_eq!(state.doc_freq, 128);
assert_eq!(state.total_term_freq, -1);
assert_eq!(state.singleton_doc_id, -1);
}
#[test]
fn test_block_encoding_varied_freqs() {
let dir = test_directory();
let mut pw = PostingsWriter::new(&dir, "_0", "", &[0u8; 16], false).unwrap();
let postings: Vec<(i32, i32, &[i32])> =
(0..128).map(|i| (i, (i % 10) + 1, [].as_slice())).collect();
let state = pw
.write_term(
&postings,
IndexOptions::DocsAndFreqs,
&NormsLookup::no_norms(),
)
.unwrap();
assert_eq!(state.doc_freq, 128);
let expected_ttf: i64 = (0..128).map(|i: i64| (i % 10) + 1).sum();
assert_eq!(state.total_term_freq, expected_ttf);
}
#[test]
fn test_psm_level1_impact_metadata_zero_below_threshold() {
let dir = test_directory();
let mut pw = PostingsWriter::new(&dir, "_0", "", &[0u8; 16], false).unwrap();
let postings: Vec<(i32, i32, &[i32])> =
(0..128).map(|i| (i, (i % 10) + 1, [].as_slice())).collect();
pw.write_term(
&postings,
IndexOptions::DocsAndFreqs,
&NormsLookup::no_norms(),
)
.unwrap();
let names = pw.finish().unwrap();
let psm_name = names.iter().find(|n| n.ends_with(".psm")).unwrap();
let psm_data = dir.lock().unwrap().read_file(psm_name).unwrap();
let header_size = codec_util::index_header_length(META_CODEC, "");
let meta = &psm_data[header_size..];
let max_num_impacts_l0 = i32::from_le_bytes(meta[0..4].try_into().unwrap());
let max_impact_bytes_l0 = i32::from_le_bytes(meta[4..8].try_into().unwrap());
let max_num_impacts_l1 = i32::from_le_bytes(meta[8..12].try_into().unwrap());
let max_impact_bytes_l1 = i32::from_le_bytes(meta[12..16].try_into().unwrap());
assert_gt!(
max_num_impacts_l0,
0,
"level 0 impact count should be > 0, got {max_num_impacts_l0}"
);
assert_gt!(
max_impact_bytes_l0,
0,
"level 0 impact bytes should be > 0, got {max_impact_bytes_l0}"
);
assert_eq!(
max_num_impacts_l1, 0,
"level 1 impact count should be 0, got {max_num_impacts_l1}"
);
assert_eq!(
max_impact_bytes_l1, 0,
"level 1 impact bytes should be 0, got {max_impact_bytes_l1}"
);
}
}