use std::fmt;
use std::io;
use std::mem;
use crate::codecs::fields_producer::{NO_MORE_DOCS, PostingsEnumProducer};
use crate::document::{IndexOptions, TermOffset};
use crate::util::byte_block_pool::{ByteBlockPool, ByteSlicePool, FIRST_LEVEL_SIZE};
use crate::util::bytes_ref_hash::BytesRefHash;
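/// Each term consumes one slot in each of the three parallel `i32` arrays of
/// `ParallelPostingsArray` (`text_starts`, `address_offset`, `byte_starts`).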
pub(crate) const BYTES_PER_POSTING: usize = 3 * mem::size_of::<i32>();
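/// Grows `min_size` by roughly 12.5% (`min_size >> 3`) and rounds the result
/// up to a multiple of `bytes_per_posting`, so capacity always corresponds to
/// a whole number of postings. For example, `oversize(100, 12)` computes
/// `100 + 12 = 112` and rounds up to `120`.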
pub(crate) fn oversize(min_size: usize, bytes_per_posting: usize) -> usize {
let extra = min_size >> 3;
let new_size = min_size + extra;
let remainder = new_size % bytes_per_posting;
if remainder != 0 {
new_size + bytes_per_posting - remainder
} else {
new_size
}
}
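/// Per-term bookkeeping kept as parallel arrays indexed by term id, in the
/// style of Lucene's ParallelPostingsArray: `text_starts` locates the term's
/// bytes in the shared term pool, `address_offset` locates its per-stream
/// write addresses in the int pool, and `byte_starts` marks the start of its
/// first byte slice.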
#[derive(Debug)]
pub(crate) struct ParallelPostingsArray {
pub text_starts: Vec<i32>,
pub address_offset: Vec<i32>,
pub byte_starts: Vec<i32>,
}
impl ParallelPostingsArray {
pub(crate) fn new(size: usize) -> Self {
Self {
text_starts: vec![0; size],
address_offset: vec![0; size],
byte_starts: vec![0; size],
}
}
pub(crate) fn size(&self) -> usize {
self.text_starts.len()
}
pub(crate) fn bytes_per_posting(&self) -> usize {
BYTES_PER_POSTING
}
#[expect(dead_code)]
pub(crate) fn grow(&self) -> Self {
let new_size = oversize(self.size() + 1, self.bytes_per_posting());
let mut new_array = Self::new(new_size);
self.copy_to(&mut new_array, self.size());
new_array
}
pub(crate) fn copy_to(&self, to_array: &mut ParallelPostingsArray, num_to_copy: usize) {
to_array.text_starts[..num_to_copy].copy_from_slice(&self.text_starts[..num_to_copy]);
to_array.address_offset[..num_to_copy].copy_from_slice(&self.address_offset[..num_to_copy]);
to_array.byte_starts[..num_to_copy].copy_from_slice(&self.byte_starts[..num_to_copy]);
}
}
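/// Parallel arrays extended with the per-term "last seen" state that drives
/// delta encoding: the last doc id and its pending doc code, plus optional
/// term frequencies, last positions, and last start offsets. Offsets require
/// positions (asserted in `new`), and the optional arrays exist only when the
/// index options call for them, which keeps `bytes_per_posting` accurate for
/// `oversize`.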
#[derive(Debug)]
pub(crate) struct FreqProxPostingsArray {
pub base: ParallelPostingsArray,
pub term_freqs: Option<Vec<i32>>,
pub last_doc_ids: Vec<i32>,
pub last_doc_codes: Vec<i32>,
pub last_positions: Option<Vec<i32>>,
pub last_offsets: Option<Vec<i32>>,
}
impl FreqProxPostingsArray {
pub(crate) fn new(
size: usize,
write_freqs: bool,
write_prox: bool,
write_offsets: bool,
) -> Self {
let term_freqs = if write_freqs {
Some(vec![0; size])
} else {
None
};
let last_positions = if write_prox {
Some(vec![0; size])
} else {
assert!(!write_offsets);
None
};
let last_offsets = if write_offsets {
Some(vec![0; size])
} else {
None
};
Self {
base: ParallelPostingsArray::new(size),
term_freqs,
last_doc_ids: vec![0; size],
last_doc_codes: vec![0; size],
last_positions,
last_offsets,
}
}
pub(crate) fn size(&self) -> usize {
self.base.size()
}
pub(crate) fn bytes_per_posting(&self) -> usize {
let mut bytes = self.base.bytes_per_posting();
bytes += 2 * mem::size_of::<i32>();
if self.term_freqs.is_some() {
bytes += mem::size_of::<i32>();
}
if self.last_positions.is_some() {
bytes += mem::size_of::<i32>();
}
if self.last_offsets.is_some() {
bytes += mem::size_of::<i32>();
}
bytes
}
pub(crate) fn grow(&self) -> Self {
let new_size = oversize(self.size() + 1, self.bytes_per_posting());
let mut new_array = Self::new(
new_size,
self.term_freqs.is_some(),
self.last_positions.is_some(),
self.last_offsets.is_some(),
);
self.copy_to(&mut new_array, self.size());
new_array
}
pub(crate) fn copy_to(&self, to_array: &mut FreqProxPostingsArray, num_to_copy: usize) {
self.base.copy_to(&mut to_array.base, num_to_copy);
to_array.last_doc_ids[..num_to_copy].copy_from_slice(&self.last_doc_ids[..num_to_copy]);
to_array.last_doc_codes[..num_to_copy].copy_from_slice(&self.last_doc_codes[..num_to_copy]);
if let (Some(from), Some(to)) = (&self.last_positions, &mut to_array.last_positions) {
to[..num_to_copy].copy_from_slice(&from[..num_to_copy]);
}
if let (Some(from), Some(to)) = (&self.last_offsets, &mut to_array.last_offsets) {
to[..num_to_copy].copy_from_slice(&from[..num_to_copy]);
}
if let (Some(from), Some(to)) = (&self.term_freqs, &mut to_array.term_freqs) {
to[..num_to_copy].copy_from_slice(&from[..num_to_copy]);
}
}
}
const HASH_INIT_SIZE: usize = 4;
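/// Backing pools shared by the per-field writers: `int_pool` stores, for each
/// (term, stream) pair, the current write address into `byte_pool`, which
/// holds the postings bytes as chained slices. `reset` clears both pools and
/// replaces any that grew past a fixed threshold.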
pub(crate) struct TermsHash {
pub(crate) int_pool: Vec<i32>,
pub(crate) byte_pool: ByteBlockPool,
}
impl TermsHash {
pub(crate) fn new() -> Self {
Self {
int_pool: Vec::with_capacity(8192),
byte_pool: ByteBlockPool::new(32 * 1024),
}
}
pub(crate) fn reset(&mut self) {
const BYTE_POOL_SHRINK_THRESHOLD: usize = 64 * 1024;
const INT_POOL_SHRINK_THRESHOLD: usize = BYTE_POOL_SHRINK_THRESHOLD / mem::size_of::<i32>();
if self.byte_pool.data.capacity() > BYTE_POOL_SHRINK_THRESHOLD {
self.byte_pool.data = Vec::with_capacity(BYTE_POOL_SHRINK_THRESHOLD);
} else {
self.byte_pool.reset();
}
if self.int_pool.capacity() > INT_POOL_SHRINK_THRESHOLD {
self.int_pool = Vec::with_capacity(INT_POOL_SHRINK_THRESHOLD);
} else {
self.int_pool.clear();
}
}
}
impl Default for TermsHash {
fn default() -> Self {
Self::new()
}
}
impl fmt::Debug for TermsHash {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("TermsHash")
.field("int_pool_len", &self.int_pool.len())
.field("byte_pool_len", &self.byte_pool.data.len())
.finish()
}
}
impl mem_dbg::MemSize for TermsHash {
fn mem_size_rec(
&self,
flags: mem_dbg::SizeFlags,
refs: &mut mem_dbg::HashMap<usize, usize>,
) -> usize {
self.int_pool.len() * mem::size_of::<i32>() + self.byte_pool.mem_size_rec(flags, refs)
}
}
pub(crate) struct TermsHashPerField {
pub(crate) stream_address_offset: usize,
pub(crate) stream_count: usize,
field_name: String,
pub(crate) bytes_hash: BytesRefHash,
sorted_term_ids: Option<Vec<i32>>,
    pub(crate) last_doc_id: i32,
}
impl TermsHashPerField {
pub(crate) fn new(
stream_count: usize,
field_name: String,
index_options: IndexOptions,
) -> Self {
assert!(index_options != IndexOptions::None);
let bytes_hash = BytesRefHash::new(HASH_INIT_SIZE);
Self {
stream_address_offset: 0,
stream_count,
field_name,
bytes_hash,
sorted_term_ids: None,
last_doc_id: -1,
}
}
pub(crate) fn reset(&mut self) {
self.bytes_hash.clear();
self.sorted_term_ids = None;
}
pub(crate) fn sort_terms(&mut self, byte_pool: &ByteBlockPool) {
assert!(self.sorted_term_ids.is_none());
self.sorted_term_ids = Some(self.bytes_hash.sort(byte_pool));
}
pub(crate) fn sorted_term_ids(&self) -> &[i32] {
self.sorted_term_ids
.as_ref()
.expect("sort_terms not called")
}
pub(crate) fn num_terms(&self) -> usize {
self.bytes_hash.size()
}
pub(crate) fn field_name(&self) -> &str {
&self.field_name
}
pub(crate) fn term_bytes<'a>(&self, byte_pool: &'a ByteBlockPool, term_id: usize) -> &'a [u8] {
self.bytes_hash.get(byte_pool, term_id)
}
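    /// Appends one byte to the given stream. Slices in the byte pool are
    /// terminated by a non-zero sentinel byte, so reading a non-zero byte at
    /// the write address means the current slice is full and `alloc_slice`
    /// must chain a larger one; the returned offset becomes the new write
    /// address.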
pub(crate) fn write_byte(&mut self, terms_hash: &mut TermsHash, stream: usize, b: u8) {
let stream_address = self.stream_address_offset + stream;
let upto = terms_hash.int_pool[stream_address] as usize;
if terms_hash.byte_pool.data[upto] != 0 {
let new_offset = ByteSlicePool::alloc_slice(&mut terms_hash.byte_pool, upto);
terms_hash.int_pool[stream_address] = new_offset as i32;
terms_hash.byte_pool.data[new_offset] = b;
terms_hash.int_pool[stream_address] += 1;
} else {
terms_hash.byte_pool.data[upto] = b;
terms_hash.int_pool[stream_address] += 1;
}
}
#[expect(dead_code)]
pub(crate) fn write_bytes(&mut self, terms_hash: &mut TermsHash, stream: usize, data: &[u8]) {
let end = data.len();
let stream_address = self.stream_address_offset + stream;
let mut upto = terms_hash.int_pool[stream_address] as usize;
let mut offset = 0;
while terms_hash.byte_pool.data[upto] == 0 && offset < end {
terms_hash.byte_pool.data[upto] = data[offset];
upto += 1;
offset += 1;
terms_hash.int_pool[stream_address] += 1;
}
while offset < end {
let (new_slice_offset, slice_length) =
ByteSlicePool::alloc_known_size_slice(&mut terms_hash.byte_pool, upto);
let write_length = (slice_length - 1).min(end - offset);
terms_hash.byte_pool.data[new_slice_offset..new_slice_offset + write_length]
.copy_from_slice(&data[offset..offset + write_length]);
upto = new_slice_offset + write_length;
offset += write_length;
terms_hash.int_pool[stream_address] = upto as i32;
}
}
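    /// Writes `i` with the variable-length encoding used by all postings
    /// streams: seven payload bits per byte, high bit set on every byte but
    /// the last. For example, 300 (0b1_0010_1100) is written as the two bytes
    /// [0xAC, 0x02].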
pub(crate) fn write_v_int(&mut self, terms_hash: &mut TermsHash, stream: usize, mut i: i32) {
assert!(stream < self.stream_count);
while (i & !0x7F) != 0 {
self.write_byte(terms_hash, stream, ((i & 0x7F) | 0x80) as u8);
i = ((i as u32) >> 7) as i32;
}
self.write_byte(terms_hash, stream, i as u8);
}
}
impl fmt::Debug for TermsHashPerField {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("TermsHashPerField")
.field("field_name", &self.field_name)
.field("stream_count", &self.stream_count)
.field("num_terms", &self.bytes_hash.size())
.finish()
}
}
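/// Shared term-add plumbing for per-field writers. `add` interns the term in
/// the hash: a non-negative id means a brand-new term, so fresh stream slices
/// are carved out of the pools (`init_stream_slices`), while a negative value
/// encodes an existing id as `-(id + 1)` and the writer simply repositions
/// onto that term's streams (`position_stream_slice`) before appending.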
pub(crate) trait TermsHashPerFieldTrait {
fn base(&self) -> &TermsHashPerField;
fn base_mut(&mut self) -> &mut TermsHashPerField;
fn postings_array_base(&self) -> &ParallelPostingsArray;
fn postings_array_base_mut(&mut self) -> &mut ParallelPostingsArray;
fn ensure_postings_capacity(&mut self, term_id: usize);
fn new_term(&mut self, terms_hash: &mut TermsHash, term_id: usize, doc_id: i32);
fn add_term(&mut self, terms_hash: &mut TermsHash, term_id: usize, doc_id: i32);
fn add(
&mut self,
term_byte_pool: &mut ByteBlockPool,
terms_hash: &mut TermsHash,
term_bytes: &[u8],
doc_id: i32,
) -> usize {
{
let base = self.base_mut();
debug_assert!(doc_id >= base.last_doc_id);
base.last_doc_id = doc_id;
}
let term_id = self.base_mut().bytes_hash.add(term_byte_pool, term_bytes);
if term_id >= 0 {
let tid = term_id as usize;
self.init_stream_slices(terms_hash, tid, doc_id);
tid
} else {
let tid = ((-term_id) - 1) as usize;
self.position_stream_slice(terms_hash, tid, doc_id);
tid
}
}
fn add_by_text_start(&mut self, terms_hash: &mut TermsHash, text_start: i32, doc_id: i32) {
let term_id = self.base_mut().bytes_hash.add_by_pool_offset(text_start);
if term_id >= 0 {
let tid = term_id as usize;
self.init_stream_slices(terms_hash, tid, doc_id);
} else {
let tid = ((-term_id) - 1) as usize;
self.position_stream_slice(terms_hash, tid, doc_id);
}
}
fn init_stream_slices(&mut self, terms_hash: &mut TermsHash, term_id: usize, doc_id: i32) {
self.ensure_postings_capacity(term_id);
let byte_start = self.base().bytes_hash.byte_start(term_id);
self.postings_array_base_mut().text_starts[term_id] = byte_start;
let stream_count = self.base().stream_count;
let stream_address_offset = terms_hash.int_pool.len();
terms_hash
.int_pool
.resize(stream_address_offset + stream_count, 0);
self.base_mut().stream_address_offset = stream_address_offset;
self.postings_array_base_mut().address_offset[term_id] = stream_address_offset as i32;
for i in 0..stream_count {
let upto = ByteSlicePool::new_slice(&mut terms_hash.byte_pool, FIRST_LEVEL_SIZE);
terms_hash.int_pool[stream_address_offset + i] = upto as i32;
}
let byte_starts = terms_hash.int_pool[stream_address_offset];
self.postings_array_base_mut().byte_starts[term_id] = byte_starts;
self.new_term(terms_hash, term_id, doc_id);
}
fn position_stream_slice(&mut self, terms_hash: &mut TermsHash, term_id: usize, doc_id: i32) {
let int_start = self.postings_array_base().address_offset[term_id] as usize;
self.base_mut().stream_address_offset = int_start;
self.add_term(terms_hash, term_id, doc_id);
}
}
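/// Postings decoded out of the in-memory streams into flat buffers. Each
/// `DecodedDoc` records `pos_start`, the index of its first entry in the
/// shared `positions` (and, when offsets are indexed, `offsets`) vectors;
/// its `freq` consecutive entries follow from there.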
#[derive(Debug, Clone, Copy)]
pub(crate) struct DecodedDoc {
pub doc_id: i32,
pub freq: i32,
pub pos_start: u32,
}
pub(crate) struct DecodedPostings {
pub docs: Vec<DecodedDoc>,
pub positions: Vec<i32>,
pub offsets: Vec<TermOffset>,
}
const DECODED_SHRINK_THRESHOLD: usize = 4096;
impl DecodedPostings {
pub fn new() -> Self {
Self {
docs: Vec::new(),
positions: Vec::new(),
offsets: Vec::new(),
}
}
pub fn clear(&mut self) {
self.docs.clear();
if self.docs.capacity() * mem::size_of::<DecodedDoc>() > DECODED_SHRINK_THRESHOLD {
self.docs = Vec::new();
}
self.positions.clear();
if self.positions.capacity() * mem::size_of::<i32>() > DECODED_SHRINK_THRESHOLD {
self.positions = Vec::new();
}
self.offsets.clear();
if self.offsets.capacity() * mem::size_of::<TermOffset>() > DECODED_SHRINK_THRESHOLD {
self.offsets = Vec::new();
}
}
}
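/// A `PostingsEnumProducer` over an owned `DecodedPostings` buffer. `doc_idx`
/// points one past the current document and `pos_idx` counts the positions
/// consumed within it, so `offset` returns the entry matching the most recent
/// `next_position` call.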
pub(crate) struct BufferedPostingsEnum {
decoded: DecodedPostings,
doc_freq: i32,
total_term_freq: i64,
doc_idx: usize,
pos_idx: usize,
}
impl BufferedPostingsEnum {
pub fn new(decoded: DecodedPostings, has_freqs: bool) -> Self {
let doc_freq = decoded.docs.len() as i32;
let total_term_freq = if has_freqs {
decoded.docs.iter().map(|d| d.freq as i64).sum()
} else {
-1
};
Self {
decoded,
doc_freq,
total_term_freq,
doc_idx: 0,
pos_idx: 0,
}
}
    fn current_doc(&self) -> &DecodedDoc {
        // `doc_idx` is one past the current document, so callers must invoke
        // `next_doc` before `freq`, `next_position`, or `offset`.
        &self.decoded.docs[self.doc_idx - 1]
    }
}
impl PostingsEnumProducer for BufferedPostingsEnum {
fn doc_freq(&self) -> i32 {
self.doc_freq
}
fn total_term_freq(&self) -> i64 {
self.total_term_freq
}
fn next_doc(&mut self) -> io::Result<i32> {
if self.doc_idx >= self.decoded.docs.len() {
return Ok(NO_MORE_DOCS);
}
let doc_id = self.decoded.docs[self.doc_idx].doc_id;
self.pos_idx = 0;
self.doc_idx += 1;
Ok(doc_id)
}
fn freq(&self) -> i32 {
self.current_doc().freq
}
fn next_position(&mut self) -> io::Result<i32> {
let doc = self.current_doc();
let abs_idx = doc.pos_start as usize + self.pos_idx;
let pos = self.decoded.positions[abs_idx];
self.pos_idx += 1;
Ok(pos)
}
fn offset(&self) -> Option<TermOffset> {
if self.decoded.offsets.is_empty() {
return None;
}
let doc = self.current_doc();
let abs_idx = doc.pos_start as usize + self.pos_idx - 1;
Some(self.decoded.offsets[abs_idx])
}
fn payload(&self) -> Option<&[u8]> {
None
}
}
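/// Per-field in-memory postings writer modeled on Lucene's
/// FreqProxTermsWriterPerField. Stream 0 carries doc deltas and term
/// frequencies; stream 1 exists only when positions are indexed and carries
/// position deltas plus, optionally, offsets. Callers set `current_position`
/// and `current_offset` before each `add`.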
pub(crate) struct FreqProxTermsWriterPerField {
pub base: TermsHashPerField,
pub postings_array: FreqProxPostingsArray,
pub index_options: IndexOptions,
pub has_freq: bool,
pub has_prox: bool,
pub has_offsets: bool,
pub saw_payloads: bool,
pub max_term_frequency: i32,
pub unique_term_count: i32,
pub(crate) current_position: i32,
pub(crate) current_offset: TermOffset,
}
impl FreqProxTermsWriterPerField {
pub(crate) fn new(field_name: String, index_options: IndexOptions) -> Self {
let has_freq = index_options >= IndexOptions::DocsAndFreqs;
let has_prox = index_options >= IndexOptions::DocsAndFreqsAndPositions;
let has_offsets = index_options >= IndexOptions::DocsAndFreqsAndPositionsAndOffsets;
let stream_count = if has_prox { 2 } else { 1 };
let base = TermsHashPerField::new(stream_count, field_name, index_options);
let postings_array = FreqProxPostingsArray::new(2, has_freq, has_prox, has_offsets);
Self {
base,
postings_array,
index_options,
has_freq,
has_prox,
has_offsets,
saw_payloads: false,
max_term_frequency: 0,
unique_term_count: 0,
current_position: 0,
current_offset: TermOffset::default(),
}
}
pub(crate) fn add(
&mut self,
term_byte_pool: &mut ByteBlockPool,
terms_hash: &mut TermsHash,
term_bytes: &[u8],
doc_id: i32,
) -> io::Result<usize> {
let tid = TermsHashPerFieldTrait::add(self, term_byte_pool, terms_hash, term_bytes, doc_id);
Ok(tid)
}
#[cfg(test)]
pub(crate) fn add_at(
&mut self,
term_byte_pool: &mut ByteBlockPool,
terms_hash: &mut TermsHash,
term_bytes: &[u8],
doc_id: i32,
pos: TermPosition,
) -> io::Result<usize> {
self.current_position = pos.position;
self.current_offset = pos.offset;
self.add(term_byte_pool, terms_hash, term_bytes, doc_id)
}
#[expect(dead_code)]
pub(crate) fn reset(&mut self) {
self.base.reset();
}
    #[expect(dead_code)]
    pub(crate) fn finish(&self) {}
pub(crate) fn num_terms(&self) -> usize {
self.base.num_terms()
}
pub(crate) fn sort_terms(&mut self, byte_pool: &ByteBlockPool) {
self.base.sort_terms(byte_pool);
}
pub(crate) fn sorted_term_ids(&self) -> &[i32] {
self.base.sorted_term_ids()
}
pub(crate) fn term_bytes<'a>(&self, byte_pool: &'a ByteBlockPool, term_id: usize) -> &'a [u8] {
self.base.term_bytes(byte_pool, term_id)
}
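    /// Writes each term's pending last-document entry into stream 0. Doc
    /// codes pack the doc-id delta with a freq-is-one flag: a delta of 5 with
    /// frequency 1 is written as the single vint `(5 << 1) | 1 == 11`, while
    /// frequency 3 is written as `10` followed by `3`. Decoding a term before
    /// this runs would silently drop its final document.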
pub(crate) fn flush_pending_docs(&mut self, terms_hash: &mut TermsHash) {
let num_terms = self.base.num_terms();
for term_id in 0..num_terms {
let int_start = self.postings_array.base.address_offset[term_id] as usize;
self.base.stream_address_offset = int_start;
if !self.has_freq {
let code = self.postings_array.last_doc_codes[term_id];
self.base.write_v_int(terms_hash, 0, code);
} else {
let freq = self.postings_array.term_freqs.as_ref().unwrap()[term_id];
let code = self.postings_array.last_doc_codes[term_id];
if freq == 1 {
self.base.write_v_int(terms_hash, 0, code | 1);
} else {
self.base.write_v_int(terms_hash, 0, code);
self.base.write_v_int(terms_hash, 0, freq);
}
}
}
}
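    /// Decodes one term's streams into `buf`, reversing the encoding produced
    /// by `new_term`/`add_term`: doc-id deltas (with the freq-is-one flag when
    /// frequencies are enabled) from stream 0, and position deltas plus
    /// optional offset deltas from stream 1. Assumes `flush_pending_docs` has
    /// already run for this term.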
pub(crate) fn decode_term_into(
&self,
terms_hash: &TermsHash,
term_id: usize,
buf: &mut DecodedPostings,
) -> io::Result<()> {
use crate::util::byte_block_pool::ByteSliceReader;
buf.clear();
let (start, end) = self.get_stream_range(&terms_hash.int_pool, term_id, 0);
let mut reader = ByteSliceReader::new(&terms_hash.byte_pool, start, end);
let mut pos_reader = if self.has_prox {
let (ps, pe) = self.get_stream_range(&terms_hash.int_pool, term_id, 1);
Some(ByteSliceReader::new(&terms_hash.byte_pool, ps, pe))
} else {
None
};
let mut last_doc_id = 0;
while !reader.eof() {
let code = reader.read_vint()?;
let (doc_delta, freq);
if !self.has_freq {
doc_delta = code;
freq = 1;
} else {
doc_delta = code >> 1;
if (code & 1) != 0 {
freq = 1;
} else {
freq = reader.read_vint()?;
}
}
let doc_id = last_doc_id + doc_delta;
last_doc_id = doc_id;
let pos_start = buf.positions.len() as u32;
if let Some(ref mut pr) = pos_reader {
let mut last_pos = 0;
let mut last_start_offset = 0u32;
for _ in 0..freq {
let prox_code = pr.read_vint()?;
let pos_delta = prox_code >> 1;
let pos = last_pos + pos_delta;
buf.positions.push(pos);
last_pos = pos;
if self.has_offsets {
let start_offset_delta = pr.read_vint()? as u32;
let length = pr.read_vint()? as u16;
let start = last_start_offset + start_offset_delta;
buf.offsets.push(TermOffset { start, length });
last_start_offset = start;
}
}
}
buf.docs.push(DecodedDoc {
doc_id,
freq,
pos_start,
});
}
Ok(())
}
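    /// Returns the readable byte range of one stream. A term's first slices
    /// are allocated back to back, `FIRST_LEVEL_SIZE` bytes each, so stream
    /// `s` begins at `byte_starts + s * FIRST_LEVEL_SIZE`; the end is the
    /// stream's current write address taken from the int pool.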
pub(crate) fn get_stream_range(
&self,
int_pool: &[i32],
term_id: usize,
stream: usize,
) -> (usize, usize) {
assert!(stream < self.base.stream_count);
let address_offset = self.postings_array.base.address_offset[term_id] as usize;
let end = int_pool[address_offset + stream] as usize;
let start =
self.postings_array.base.byte_starts[term_id] as usize + stream * FIRST_LEVEL_SIZE;
(start, end)
}
}
impl fmt::Debug for FreqProxTermsWriterPerField {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("FreqProxTermsWriterPerField")
.field("field_name", &self.base.field_name)
.field("num_terms", &self.base.bytes_hash.size())
.field("has_freq", &self.has_freq)
.field("has_prox", &self.has_prox)
.field("has_offsets", &self.has_offsets)
.finish()
}
}
impl mem_dbg::MemSize for FreqProxTermsWriterPerField {
fn mem_size_rec(
&self,
flags: mem_dbg::SizeFlags,
refs: &mut mem_dbg::HashMap<usize, usize>,
) -> usize {
mem::size_of::<Self>()
+ self.base.bytes_hash.mem_size_rec(flags, refs)
+ self.postings_array.size() * self.postings_array.bytes_per_posting()
}
}
impl TermsHashPerFieldTrait for FreqProxTermsWriterPerField {
fn base(&self) -> &TermsHashPerField {
&self.base
}
fn base_mut(&mut self) -> &mut TermsHashPerField {
&mut self.base
}
fn postings_array_base(&self) -> &ParallelPostingsArray {
&self.postings_array.base
}
fn postings_array_base_mut(&mut self) -> &mut ParallelPostingsArray {
&mut self.postings_array.base
}
fn ensure_postings_capacity(&mut self, term_id: usize) {
while term_id >= self.postings_array.size() {
let grown = self.postings_array.grow();
self.postings_array = grown;
}
}
fn new_term(&mut self, terms_hash: &mut TermsHash, term_id: usize, doc_id: i32) {
let position = self.current_position;
let start_offset = self.current_offset.start as i32;
let offset_length = self.current_offset.length;
let postings = &mut self.postings_array;
postings.last_doc_ids[term_id] = doc_id;
if !self.has_freq {
postings.last_doc_codes[term_id] = doc_id;
self.max_term_frequency = self.max_term_frequency.max(1);
} else {
postings.last_doc_codes[term_id] = doc_id << 1;
postings.term_freqs.as_mut().unwrap()[term_id] = 1;
            if self.has_prox {
                // The low bit of each position code mirrors Lucene's payload
                // flag; payloads are never written here, so it stays zero.
                self.base.write_v_int(terms_hash, 1, position << 1);
                postings.last_positions.as_mut().unwrap()[term_id] = position;
                if self.has_offsets {
                    // The first offset of a document is written as an
                    // absolute value; later occurrences in the same document
                    // are encoded as deltas against `last_offsets`.
                    self.base.write_v_int(terms_hash, 1, start_offset);
                    self.base.write_v_int(terms_hash, 1, offset_length as i32);
                    postings.last_offsets.as_mut().unwrap()[term_id] = start_offset;
                }
}
self.max_term_frequency = self
.max_term_frequency
.max(postings.term_freqs.as_ref().unwrap()[term_id]);
}
self.unique_term_count += 1;
}
fn add_term(&mut self, terms_hash: &mut TermsHash, term_id: usize, doc_id: i32) {
let position = self.current_position;
let start_offset = self.current_offset.start as i32;
let offset_length = self.current_offset.length;
let postings = &mut self.postings_array;
if !self.has_freq {
if doc_id != postings.last_doc_ids[term_id] {
assert!(doc_id > postings.last_doc_ids[term_id]);
self.base
.write_v_int(terms_hash, 0, postings.last_doc_codes[term_id]);
postings.last_doc_codes[term_id] = doc_id - postings.last_doc_ids[term_id];
postings.last_doc_ids[term_id] = doc_id;
self.unique_term_count += 1;
}
} else if doc_id != postings.last_doc_ids[term_id] {
assert!(doc_id > postings.last_doc_ids[term_id]);
if postings.term_freqs.as_ref().unwrap()[term_id] == 1 {
self.base
.write_v_int(terms_hash, 0, postings.last_doc_codes[term_id] | 1);
} else {
self.base
.write_v_int(terms_hash, 0, postings.last_doc_codes[term_id]);
self.base.write_v_int(
terms_hash,
0,
postings.term_freqs.as_ref().unwrap()[term_id],
);
}
postings.term_freqs.as_mut().unwrap()[term_id] = 1;
self.max_term_frequency = self.max_term_frequency.max(1);
postings.last_doc_codes[term_id] = (doc_id - postings.last_doc_ids[term_id]) << 1;
postings.last_doc_ids[term_id] = doc_id;
if self.has_prox {
self.base.write_v_int(terms_hash, 1, position << 1);
postings.last_positions.as_mut().unwrap()[term_id] = position;
                if self.has_offsets {
                    // A new document resets the offset base: the start offset
                    // is written as an absolute value rather than a delta, so
                    // the stale `last_offsets` entry is never consulted.
                    self.base.write_v_int(terms_hash, 1, start_offset);
                    self.base.write_v_int(terms_hash, 1, offset_length as i32);
                    postings.last_offsets.as_mut().unwrap()[term_id] = start_offset;
                }
}
self.unique_term_count += 1;
} else {
let freq = postings.term_freqs.as_mut().unwrap();
freq[term_id] = freq[term_id]
.checked_add(1)
.expect("term frequency overflow");
self.max_term_frequency = self.max_term_frequency.max(freq[term_id]);
if self.has_prox {
let last_pos = postings.last_positions.as_ref().unwrap()[term_id];
self.base
.write_v_int(terms_hash, 1, (position - last_pos) << 1);
postings.last_positions.as_mut().unwrap()[term_id] = position;
if self.has_offsets {
let last_offset = postings.last_offsets.as_ref().unwrap()[term_id];
self.base
.write_v_int(terms_hash, 1, start_offset - last_offset);
self.base.write_v_int(terms_hash, 1, offset_length as i32);
postings.last_offsets.as_mut().unwrap()[term_id] = start_offset;
}
}
}
}
}
#[cfg(test)]
pub(crate) struct TermPosition {
position: i32,
offset: TermOffset,
}
#[cfg(test)]
impl TermPosition {
fn new(position: i32, start: u32, length: u16) -> Self {
Self {
position,
offset: TermOffset { start, length },
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::util::byte_block_pool::ByteSliceReader;
use assertables::*;
fn read_vint(reader: &mut ByteSliceReader<'_>) -> i32 {
reader.read_vint().unwrap()
}
fn new_term_pool() -> ByteBlockPool {
ByteBlockPool::new(32 * 1024)
}
#[test]
fn test_single_term_single_doc() {
let mut term_pool = new_term_pool();
let mut th = TermsHash::new();
let mut field =
FreqProxTermsWriterPerField::new("body".to_string(), IndexOptions::DocsAndFreqs);
field
.add_at(
&mut term_pool,
&mut th,
b"hello",
0,
TermPosition::new(0, 0, 5u16),
)
.unwrap();
assert_eq!(field.num_terms(), 1);
assert_eq!(field.term_bytes(&term_pool, 0), b"hello");
}
#[test]
fn test_duplicate_term_same_doc() {
let mut term_pool = new_term_pool();
let mut th = TermsHash::new();
let mut field = FreqProxTermsWriterPerField::new(
"body".to_string(),
IndexOptions::DocsAndFreqsAndPositions,
);
field
.add_at(
&mut term_pool,
&mut th,
b"hello",
0,
TermPosition::new(0, 0, 5u16),
)
.unwrap();
field
.add_at(
&mut term_pool,
&mut th,
b"hello",
0,
TermPosition::new(1, 6, 5),
)
.unwrap();
assert_eq!(field.num_terms(), 1);
assert_eq!(field.postings_array.term_freqs.as_ref().unwrap()[0], 2);
}
#[test]
fn test_multiple_terms() {
let mut term_pool = new_term_pool();
let mut th = TermsHash::new();
let mut field =
FreqProxTermsWriterPerField::new("body".to_string(), IndexOptions::DocsAndFreqs);
field
.add_at(
&mut term_pool,
&mut th,
b"hello",
0,
TermPosition::new(0, 0, 5u16),
)
.unwrap();
field
.add_at(
&mut term_pool,
&mut th,
b"world",
0,
TermPosition::new(1, 6, 5),
)
.unwrap();
field
.add_at(
&mut term_pool,
&mut th,
b"hello",
0,
TermPosition::new(2, 12, 5),
)
.unwrap();
assert_eq!(field.num_terms(), 2);
let hello_id = field.base.bytes_hash.find(&term_pool, b"hello");
assert_ge!(hello_id, 0);
assert_eq!(
field.postings_array.term_freqs.as_ref().unwrap()[hello_id as usize],
2
);
}
#[test]
fn test_term_across_documents() {
let mut term_pool = new_term_pool();
let mut th = TermsHash::new();
let mut field = FreqProxTermsWriterPerField::new(
"body".to_string(),
IndexOptions::DocsAndFreqsAndPositions,
);
field
.add_at(
&mut term_pool,
&mut th,
b"hello",
0,
TermPosition::new(0, 0, 5u16),
)
.unwrap();
field
.add_at(
&mut term_pool,
&mut th,
b"world",
0,
TermPosition::new(1, 6, 5),
)
.unwrap();
field
.add_at(
&mut term_pool,
&mut th,
b"hello",
1,
TermPosition::new(0, 0, 5),
)
.unwrap();
assert_eq!(field.num_terms(), 2);
let hello_id = field.base.bytes_hash.find(&term_pool, b"hello") as usize;
let (start, end) = field.get_stream_range(&th.int_pool, hello_id, 0);
let mut reader = ByteSliceReader::new(&th.byte_pool, start, end);
let code = read_vint(&mut reader);
assert_eq!(code, 1);
assert!(reader.eof());
}
#[test]
fn test_sort_terms_lexicographic() {
let mut term_pool = new_term_pool();
let mut th = TermsHash::new();
let mut field =
FreqProxTermsWriterPerField::new("body".to_string(), IndexOptions::DocsAndFreqs);
field
.add_at(
&mut term_pool,
&mut th,
b"cherry",
0,
TermPosition::new(0, 0, 6u16),
)
.unwrap();
field
.add_at(
&mut term_pool,
&mut th,
b"apple",
0,
TermPosition::new(1, 7, 5),
)
.unwrap();
field
.add_at(
&mut term_pool,
&mut th,
b"banana",
0,
TermPosition::new(2, 13, 6),
)
.unwrap();
field.sort_terms(&term_pool);
let sorted = field.sorted_term_ids();
assert_len_eq_x!(sorted, 3);
assert_eq!(field.term_bytes(&term_pool, sorted[0] as usize), b"apple");
assert_eq!(field.term_bytes(&term_pool, sorted[1] as usize), b"banana");
assert_eq!(field.term_bytes(&term_pool, sorted[2] as usize), b"cherry");
}
#[test]
fn test_docs_only_no_freq() {
let mut term_pool = new_term_pool();
let mut th = TermsHash::new();
let mut field = FreqProxTermsWriterPerField::new("tags".to_string(), IndexOptions::Docs);
field
.add_at(
&mut term_pool,
&mut th,
b"tag1",
0,
TermPosition::new(0, 0, 4u16),
)
.unwrap();
field
.add_at(
&mut term_pool,
&mut th,
b"tag1",
1,
TermPosition::new(0, 0, 4u16),
)
.unwrap();
field
.add_at(
&mut term_pool,
&mut th,
b"tag1",
2,
TermPosition::new(0, 0, 4u16),
)
.unwrap();
assert_eq!(field.num_terms(), 1);
assert!(!field.has_freq);
let tid = field.base.bytes_hash.find(&term_pool, b"tag1") as usize;
let (start, end) = field.get_stream_range(&th.int_pool, tid, 0);
let mut reader = ByteSliceReader::new(&th.byte_pool, start, end);
let code0 = read_vint(&mut reader);
        assert_eq!(code0, 0);
        let code1 = read_vint(&mut reader);
        assert_eq!(code1, 1);
    }
#[test]
fn test_positions_stream() {
let mut term_pool = new_term_pool();
let mut th = TermsHash::new();
let mut field = FreqProxTermsWriterPerField::new(
"body".to_string(),
IndexOptions::DocsAndFreqsAndPositions,
);
field
.add_at(
&mut term_pool,
&mut th,
b"hello",
0,
TermPosition::new(0, 0, 5u16),
)
.unwrap();
field
.add_at(
&mut term_pool,
&mut th,
b"other",
0,
TermPosition::new(1, 6, 5),
)
.unwrap();
field
.add_at(
&mut term_pool,
&mut th,
b"stuff",
0,
TermPosition::new(2, 12, 5),
)
.unwrap();
field
.add_at(
&mut term_pool,
&mut th,
b"hello",
0,
TermPosition::new(3, 18, 5),
)
.unwrap();
let hello_id = field.base.bytes_hash.find(&term_pool, b"hello") as usize;
let (start, end) = field.get_stream_range(&th.int_pool, hello_id, 1);
let mut reader = ByteSliceReader::new(&th.byte_pool, start, end);
let pos0 = read_vint(&mut reader);
assert_eq!(pos0, 0);
let pos1 = read_vint(&mut reader);
        assert_eq!(pos1, 6);
    }
#[test]
fn test_multi_doc_freq_encoding() {
let mut term_pool = new_term_pool();
let mut th = TermsHash::new();
let mut field =
FreqProxTermsWriterPerField::new("body".to_string(), IndexOptions::DocsAndFreqs);
field
.add_at(
&mut term_pool,
&mut th,
b"hello",
0,
TermPosition::new(0, 0, 5u16),
)
.unwrap();
field
.add_at(
&mut term_pool,
&mut th,
b"hello",
0,
TermPosition::new(1, 6, 5),
)
.unwrap();
field
.add_at(
&mut term_pool,
&mut th,
b"hello",
0,
TermPosition::new(2, 12, 5),
)
.unwrap();
field
.add_at(
&mut term_pool,
&mut th,
b"hello",
1,
TermPosition::new(0, 0, 5),
)
.unwrap();
field
.add_at(
&mut term_pool,
&mut th,
b"hello",
2,
TermPosition::new(0, 0, 5),
)
.unwrap();
field
.add_at(
&mut term_pool,
&mut th,
b"hello",
2,
TermPosition::new(1, 6, 5),
)
.unwrap();
let hello_id = field.base.bytes_hash.find(&term_pool, b"hello") as usize;
let (start, end) = field.get_stream_range(&th.int_pool, hello_id, 0);
let mut reader = ByteSliceReader::new(&th.byte_pool, start, end);
let doc0_code = read_vint(&mut reader);
        assert_eq!(doc0_code, 0);
        let doc0_freq = read_vint(&mut reader);
assert_eq!(doc0_freq, 3);
let doc1_code = read_vint(&mut reader);
assert_eq!(doc1_code, 3);
assert!(reader.eof());
}
#[test]
fn test_max_term_frequency_tracking() {
let mut term_pool = new_term_pool();
let mut th = TermsHash::new();
let mut field =
FreqProxTermsWriterPerField::new("body".to_string(), IndexOptions::DocsAndFreqs);
field
.add_at(
&mut term_pool,
&mut th,
b"a",
0,
TermPosition::new(0, 0, 1u16),
)
.unwrap();
field
.add_at(&mut term_pool, &mut th, b"b", 0, TermPosition::new(1, 2, 1))
.unwrap();
field
.add_at(&mut term_pool, &mut th, b"a", 0, TermPosition::new(2, 4, 1))
.unwrap();
field
.add_at(&mut term_pool, &mut th, b"a", 0, TermPosition::new(3, 6, 1))
.unwrap();
        assert_eq!(field.max_term_frequency, 3);
    }
#[test]
fn test_unique_term_count_tracking() {
let mut term_pool = new_term_pool();
let mut th = TermsHash::new();
let mut field =
FreqProxTermsWriterPerField::new("body".to_string(), IndexOptions::DocsAndFreqs);
field
.add_at(
&mut term_pool,
&mut th,
b"a",
0,
TermPosition::new(0, 0, 1u16),
)
.unwrap();
field
.add_at(&mut term_pool, &mut th, b"b", 0, TermPosition::new(1, 2, 1))
.unwrap();
field
.add_at(&mut term_pool, &mut th, b"a", 0, TermPosition::new(2, 4, 1))
.unwrap();
assert_eq!(field.unique_term_count, 2);
}
#[test]
fn test_oversize_aligns_to_bytes_per_posting() {
let result = oversize(3, 12);
assert_eq!(result % 12, 0);
assert_ge!(result, 3);
}
#[test]
fn test_terms_hash_reset() {
let mut term_pool = new_term_pool();
let mut th = TermsHash::new();
let mut field =
FreqProxTermsWriterPerField::new("body".to_string(), IndexOptions::DocsAndFreqs);
field
.add_at(
&mut term_pool,
&mut th,
b"hello",
0,
TermPosition::new(0, 0, 5u16),
)
.unwrap();
th.reset();
let mut field2 =
FreqProxTermsWriterPerField::new("body".to_string(), IndexOptions::DocsAndFreqs);
field2
.add_at(
&mut term_pool,
&mut th,
b"world",
0,
TermPosition::new(0, 0, 5),
)
.unwrap();
assert_eq!(field2.num_terms(), 1);
}
#[test]
fn test_terms_hash_default() {
let th = TermsHash::default();
assert_eq!(th.int_pool.len(), 0);
}
#[test]
fn test_terms_hash_debug() {
let th = TermsHash::new();
let debug = format!("{th:?}");
assert_contains!(debug, "TermsHash");
assert_contains!(debug, "int_pool_len");
}
#[test]
fn test_terms_hash_per_field_debug() {
let thpf = TermsHashPerField::new(1, "body".to_string(), IndexOptions::DocsAndFreqs);
let debug = format!("{thpf:?}");
assert_contains!(debug, "TermsHashPerField");
assert_contains!(debug, "body");
}
#[test]
fn test_freq_prox_debug() {
let field =
FreqProxTermsWriterPerField::new("body".to_string(), IndexOptions::DocsAndFreqs);
let debug = format!("{field:?}");
assert_contains!(debug, "FreqProxTermsWriterPerField");
assert_contains!(debug, "body");
assert_contains!(debug, "has_freq");
}
#[test]
fn test_freq_prox_mem_size() {
use mem_dbg::{MemSize, SizeFlags};
let field =
FreqProxTermsWriterPerField::new("body".to_string(), IndexOptions::DocsAndFreqs);
let size = field.mem_size(SizeFlags::CAPACITY);
assert_gt!(size, 0);
}
#[test]
fn test_positions_and_offsets() {
let mut term_pool = new_term_pool();
let mut th = TermsHash::new();
let mut field = FreqProxTermsWriterPerField::new(
"body".to_string(),
IndexOptions::DocsAndFreqsAndPositionsAndOffsets,
);
field
.add_at(
&mut term_pool,
&mut th,
b"hello",
0,
TermPosition::new(0, 0, 5u16),
)
.unwrap();
field
.add_at(
&mut term_pool,
&mut th,
b"hello",
0,
TermPosition::new(3, 18, 5),
)
.unwrap();
assert_eq!(field.num_terms(), 1);
assert!(field.has_offsets);
let hello_id = field.base.bytes_hash.find(&term_pool, b"hello") as usize;
let (start, end) = field.get_stream_range(&th.int_pool, hello_id, 1);
let mut reader = ByteSliceReader::new(&th.byte_pool, start, end);
let pos0 = read_vint(&mut reader);
assert_eq!(pos0, 0);
let off0_start = read_vint(&mut reader);
assert_eq!(off0_start, 0);
let off0_len = read_vint(&mut reader);
assert_eq!(off0_len, 5);
let pos1 = read_vint(&mut reader);
assert_eq!(pos1, 6);
let off1_start_delta = read_vint(&mut reader);
assert_eq!(off1_start_delta, 18);
let off1_len = read_vint(&mut reader);
assert_eq!(off1_len, 5);
}
#[test]
fn test_offsets_across_documents() {
let mut term_pool = new_term_pool();
let mut th = TermsHash::new();
let mut field = FreqProxTermsWriterPerField::new(
"body".to_string(),
IndexOptions::DocsAndFreqsAndPositionsAndOffsets,
);
field
.add_at(
&mut term_pool,
&mut th,
b"hello",
0,
TermPosition::new(0, 0, 5u16),
)
.unwrap();
field
.add_at(
&mut term_pool,
&mut th,
b"hello",
1,
TermPosition::new(0, 0, 5),
)
.unwrap();
let hello_id = field.base.bytes_hash.find(&term_pool, b"hello") as usize;
let (start, end) = field.get_stream_range(&th.int_pool, hello_id, 1);
let mut reader = ByteSliceReader::new(&th.byte_pool, start, end);
let pos0 = read_vint(&mut reader);
assert_eq!(pos0, 0);
let off_start = read_vint(&mut reader);
assert_eq!(off_start, 0);
let off_len = read_vint(&mut reader);
assert_eq!(off_len, 5);
let pos1 = read_vint(&mut reader);
assert_eq!(pos1, 0);
let off1_start = read_vint(&mut reader);
assert_eq!(off1_start, 0);
let off1_len = read_vint(&mut reader);
assert_eq!(off1_len, 5);
}
#[test]
fn test_buffered_postings_enum_returns_offsets() {
let mut decoded = DecodedPostings::new();
decoded.docs.push(DecodedDoc {
doc_id: 0,
freq: 2,
pos_start: 0,
});
decoded.positions.extend_from_slice(&[0, 5]);
decoded.offsets.extend_from_slice(&[
TermOffset {
start: 0,
length: 5,
},
TermOffset {
start: 6,
length: 5,
},
]);
let mut pe = BufferedPostingsEnum::new(decoded, true);
assert_eq!(pe.next_doc().unwrap(), 0);
assert_eq!(pe.freq(), 2);
assert_eq!(pe.next_position().unwrap(), 0);
assert_eq!(
pe.offset(),
Some(TermOffset {
start: 0,
length: 5
})
);
assert_eq!(pe.next_position().unwrap(), 5);
assert_eq!(
pe.offset(),
Some(TermOffset {
start: 6,
length: 5
})
);
}
#[test]
fn test_buffered_postings_enum_no_offsets() {
let mut decoded = DecodedPostings::new();
decoded.docs.push(DecodedDoc {
doc_id: 0,
freq: 1,
pos_start: 0,
});
decoded.positions.push(0);
let mut pe = BufferedPostingsEnum::new(decoded, true);
assert_eq!(pe.next_doc().unwrap(), 0);
assert_eq!(pe.next_position().unwrap(), 0);
assert_none!(pe.offset());
}
}