use common::read_u32_vint;
use stacker::{ExpUnrolledLinkedList, MemoryArena};
use crate::postings::FieldSerializer;
use crate::DocId;
/// Sentinel appended to a recorder's stack by `close_doc` to mark the end of
/// one document's list of positions.
///
/// Positions are stored as `position.wrapping_add(1)`, which keeps `0` free
/// for this terminator.
/// NOTE(review): a position equal to `u32::MAX` would wrap to `0` and be read
/// back as this sentinel — confirm that callers never record such a position.
const POSITION_END: u32 = 0;
/// Owner of a handful of scratch vectors that are handed out over and over.
///
/// Each `lend_*` method clears the vectors it returns before lending them, so
/// the borrower always starts from an empty vector. `Vec::clear` keeps the
/// allocation, which means capacity grown by an earlier borrower is reused.
#[derive(Default)]
pub(crate) struct BufferLender {
    /// Scratch space for raw bytes.
    buffer_u8: Vec<u8>,
    /// Scratch space for `u32` values.
    buffer_u32: Vec<u32>,
    /// Third vector returned by `lend_all_with_offsets`.
    buffer_offsets_from: Vec<u32>,
    /// Fourth vector returned by `lend_all_with_offsets`.
    buffer_offsets_to: Vec<u32>,
}

impl BufferLender {
    /// Lends the byte buffer after emptying it.
    pub fn lend_u8(&mut self) -> &mut Vec<u8> {
        let bytes = &mut self.buffer_u8;
        bytes.clear();
        bytes
    }

    /// Lends the byte buffer and the `u32` buffer, both emptied.
    ///
    /// The two offset buffers are left untouched.
    pub fn lend_all(&mut self) -> (&mut Vec<u8>, &mut Vec<u32>) {
        let Self {
            buffer_u8,
            buffer_u32,
            ..
        } = self;
        buffer_u8.clear();
        buffer_u32.clear();
        (buffer_u8, buffer_u32)
    }

    /// Lends all four buffers, each emptied, in the order
    /// `(bytes, u32 values, offsets-from, offsets-to)`.
    pub fn lend_all_with_offsets(
        &mut self,
    ) -> (&mut Vec<u8>, &mut Vec<u32>, &mut Vec<u32>, &mut Vec<u32>) {
        let Self {
            buffer_u8,
            buffer_u32,
            buffer_offsets_from,
            buffer_offsets_to,
        } = self;
        buffer_u8.clear();
        buffer_u32.clear();
        buffer_offsets_from.clear();
        buffer_offsets_to.clear();
        (buffer_u8, buffer_u32, buffer_offsets_from, buffer_offsets_to)
    }
}
/// Iterator that decodes consecutive variable-length `u32` values out of a
/// byte slice until the slice is empty.
pub struct VInt32Reader<'a> {
    /// Bytes that have not been decoded yet.
    data: &'a [u8],
}

impl<'a> VInt32Reader<'a> {
    /// Creates a reader positioned at the start of `data`.
    fn new(data: &'a [u8]) -> VInt32Reader<'a> {
        Self { data }
    }
}

impl Iterator for VInt32Reader<'_> {
    type Item = u32;

    /// Decodes the next value, or returns `None` once no bytes remain.
    ///
    /// `read_u32_vint` receives the slice by mutable reference; it is
    /// expected to advance it past the bytes it consumed.
    fn next(&mut self) -> Option<u32> {
        if self.data.is_empty() {
            return None;
        }
        let value = read_u32_vint(&mut self.data);
        Some(value)
    }
}
/// Interface shared by the per-term accumulators in this file.
///
/// A recorder is fed documents through `new_doc`, zero or more
/// `record_position*` calls, and `close_doc`. Its accumulated data is later
/// written out with `serialize`.
pub(crate) trait Recorder: Copy + Default + Send + Sync + 'static {
/// Returns the current document id. The implementations in this file return
/// the id most recently passed to `new_doc`.
fn current_doc(&self) -> u32;
/// Starts recording a new document `doc`, storing data in `arena`.
fn new_doc(&mut self, doc: DocId, arena: &mut MemoryArena);
/// Records one occurrence of the term at `position` in the current document.
fn record_position(&mut self, position: u32, arena: &mut MemoryArena);
/// Records one occurrence together with its offsets.
///
/// The default implementation ignores both offsets and forwards to
/// `record_position`.
fn record_position_with_offsets(
&mut self,
position: u32,
_offset_from: u32,
_offset_to: u32,
arena: &mut MemoryArena,
) {
self.record_position(position, arena);
}
/// Signals that no more positions will be recorded for the current document.
fn close_doc(&mut self, arena: &mut MemoryArena);
/// Writes everything recorded so far to `serializer`. `arena` is the memory
/// the recorder wrote into, and `buffer_lender` provides scratch buffers.
fn serialize(
&self,
arena: &MemoryArena,
serializer: &mut FieldSerializer<'_>,
buffer_lender: &mut BufferLender,
);
/// Returns the number of documents recorded, or `None` when the recorder does
/// not keep that count.
fn term_doc_freq(&self) -> Option<u32>;
/// Whether the recorder tracks term frequencies. Defaults to `true`.
#[inline]
fn has_term_freq(&self) -> bool {
true
}
}
/// Recorder that only remembers which documents contain the term.
///
/// Each doc id is stored as the difference to the previously recorded doc id,
/// written to `stack` as a variable-length integer. Positions are discarded.
#[derive(Clone, Copy, Default)]
pub struct DocIdRecorder {
    /// Delta-encoded doc ids.
    stack: ExpUnrolledLinkedList,
    /// Doc id most recently passed to `new_doc`.
    current_doc: DocId,
}

impl Recorder for DocIdRecorder {
    #[inline]
    fn current_doc(&self) -> DocId {
        self.current_doc
    }

    /// Appends the gap between `doc` and the previous doc id, then remembers
    /// `doc` as the current document.
    #[inline]
    fn new_doc(&mut self, doc: DocId, arena: &mut MemoryArena) {
        let doc_gap = doc - self.current_doc;
        self.current_doc = doc;
        let mut writer = self.stack.writer(arena);
        writer.write_u32_vint(doc_gap);
    }

    /// Positions are not tracked by this recorder.
    #[inline]
    fn record_position(&mut self, _position: u32, _arena: &mut MemoryArena) {}

    /// Nothing needs to be flushed at the end of a document.
    #[inline]
    fn close_doc(&mut self, _arena: &mut MemoryArena) {}

    /// Decodes the stored gaps back into absolute doc ids and writes each one
    /// with a term frequency of `0` and no positions.
    fn serialize(
        &self,
        arena: &MemoryArena,
        serializer: &mut FieldSerializer<'_>,
        buffer_lender: &mut BufferLender,
    ) {
        let encoded = buffer_lender.lend_u8();
        self.stack.read_to_end(arena, encoded);
        let doc_gaps = VInt32Reader::new(encoded.as_slice());
        get_sum_reader(doc_gaps).for_each(|doc_id| {
            serializer.write_doc(doc_id, 0u32, &[][..]);
        });
    }

    /// The document count is not tracked by this recorder.
    fn term_doc_freq(&self) -> Option<u32> {
        None
    }

    fn has_term_freq(&self) -> bool {
        false
    }
}
/// Turns an iterator of deltas into an iterator of running totals.
///
/// The n-th item yielded is the sum of the first n items of `iter`, starting
/// from `0`. The output has exactly as many items as the input.
fn get_sum_reader(iter: impl Iterator<Item = u32>) -> impl Iterator<Item = u32> {
    let mut running_total = 0u32;
    iter.map(move |delta| {
        running_total += delta;
        running_total
    })
}
/// Recorder that keeps doc ids together with the term frequency in each doc.
///
/// The stack holds, for every closed document, the gap to the previous doc id
/// followed by the number of positions recorded for that document.
#[derive(Clone, Copy, Default)]
pub struct TermFrequencyRecorder {
    /// Interleaved `(doc gap, term frequency)` values.
    stack: ExpUnrolledLinkedList,
    /// Doc id most recently passed to `new_doc`.
    current_doc: DocId,
    /// Occurrences counted so far in the document that is still open.
    current_tf: u32,
    /// Number of documents passed to `new_doc`.
    term_doc_freq: u32,
}

impl Recorder for TermFrequencyRecorder {
    #[inline]
    fn current_doc(&self) -> DocId {
        self.current_doc
    }

    /// Appends the gap to the previous doc id and counts one more document.
    #[inline]
    fn new_doc(&mut self, doc: DocId, arena: &mut MemoryArena) {
        let doc_gap = doc - self.current_doc;
        self.term_doc_freq += 1;
        self.current_doc = doc;
        let mut writer = self.stack.writer(arena);
        writer.write_u32_vint(doc_gap);
    }

    /// Counts the occurrence; the position itself is not stored.
    #[inline]
    fn record_position(&mut self, _position: u32, _arena: &mut MemoryArena) {
        self.current_tf += 1;
    }

    /// Appends the term frequency of the document being closed and resets the
    /// counter for the next one.
    #[inline]
    fn close_doc(&mut self, arena: &mut MemoryArena) {
        debug_assert!(self.current_tf > 0);
        let finished_tf = self.current_tf;
        self.stack.writer(arena).write_u32_vint(finished_tf);
        self.current_tf = 0;
    }

    /// Decodes the `(doc gap, term frequency)` pairs and writes one entry per
    /// document, without positions.
    fn serialize(
        &self,
        arena: &MemoryArena,
        serializer: &mut FieldSerializer<'_>,
        buffer_lender: &mut BufferLender,
    ) {
        let encoded = buffer_lender.lend_u8();
        self.stack.read_to_end(arena, encoded);
        let mut values = VInt32Reader::new(encoded.as_slice());
        let mut doc_id: DocId = 0;
        while let Some(doc_gap) = values.next() {
            doc_id += doc_gap;
            // A doc gap with no frequency after it means the last document was
            // not closed yet; its count is still in `current_tf`.
            let term_freq = match values.next() {
                Some(stored_tf) => stored_tf,
                None => self.current_tf,
            };
            serializer.write_doc(doc_id, term_freq, &[][..]);
        }
    }

    fn term_doc_freq(&self) -> Option<u32> {
        Some(self.term_doc_freq)
    }
}
/// Recorder that keeps doc ids and every position of the term in each doc.
///
/// For each document the stack holds the gap to the previous doc id, then each
/// position stored as `position + 1`, then `POSITION_END` once the document is
/// closed.
#[derive(Clone, Copy, Default)]
pub struct TfAndPositionRecorder {
    /// Doc gaps, shifted positions and end-of-document markers.
    stack: ExpUnrolledLinkedList,
    /// Doc id most recently passed to `new_doc`.
    current_doc: DocId,
    /// Number of documents passed to `new_doc`.
    term_doc_freq: u32,
}

impl Recorder for TfAndPositionRecorder {
    #[inline]
    fn current_doc(&self) -> DocId {
        self.current_doc
    }

    /// Appends the gap to the previous doc id and counts one more document.
    #[inline]
    fn new_doc(&mut self, doc: DocId, arena: &mut MemoryArena) {
        let doc_gap = doc - self.current_doc;
        self.current_doc = doc;
        self.term_doc_freq += 1u32;
        let mut writer = self.stack.writer(arena);
        writer.write_u32_vint(doc_gap);
    }

    /// Appends `position + 1`, so that a stored `0` can only be the
    /// end-of-document marker.
    #[inline]
    fn record_position(&mut self, position: u32, arena: &mut MemoryArena) {
        let shifted_position = position.wrapping_add(1u32);
        self.stack.writer(arena).write_u32_vint(shifted_position);
    }

    /// Appends the end-of-document marker.
    #[inline]
    fn close_doc(&mut self, arena: &mut MemoryArena) {
        let mut writer = self.stack.writer(arena);
        writer.write_u32_vint(POSITION_END);
    }

    /// Decodes the stack and writes one entry per document: the doc id, the
    /// number of positions, and the positions as deltas (the first delta is the
    /// position itself).
    fn serialize(
        &self,
        arena: &MemoryArena,
        serializer: &mut FieldSerializer<'_>,
        buffer_lender: &mut BufferLender,
    ) {
        let (encoded, position_deltas) = buffer_lender.lend_all();
        self.stack.read_to_end(arena, encoded);
        let mut values = VInt32Reader::new(encoded.as_slice());
        let mut doc_id: DocId = 0;
        while let Some(doc_gap) = values.next() {
            doc_id += doc_gap;
            position_deltas.clear();
            // Starting from 1 cancels the `+ 1` shift for the first position.
            let mut previous_plus_one = 1u32;
            // `take_while` also consumes the marker that ends the run; running
            // out of data ends the run too (document not closed yet).
            let shifted_positions = values
                .by_ref()
                .take_while(|&value| value != POSITION_END);
            for position_plus_one in shifted_positions {
                position_deltas.push(position_plus_one - previous_plus_one);
                previous_plus_one = position_plus_one;
            }
            serializer.write_doc(doc_id, position_deltas.len() as u32, position_deltas);
        }
    }

    fn term_doc_freq(&self) -> Option<u32> {
        Some(self.term_doc_freq)
    }
}
/// Recorder that keeps doc ids, positions and a pair of offsets per occurrence.
///
/// For each document the stack holds the gap to the previous doc id, then one
/// `(position + 1, offset_from + 1, offset_to + 1)` triple per occurrence, then
/// `POSITION_END` once the document is closed.
#[derive(Clone, Copy, Default)]
pub struct TfPositionAndOffsetRecorder {
    /// Doc gaps, shifted triples and end-of-document markers.
    stack: ExpUnrolledLinkedList,
    /// Doc id most recently passed to `new_doc`.
    current_doc: DocId,
    /// Number of documents passed to `new_doc`.
    term_doc_freq: u32,
}

impl Recorder for TfPositionAndOffsetRecorder {
    #[inline]
    fn current_doc(&self) -> DocId {
        self.current_doc
    }

    /// Appends the gap to the previous doc id and counts one more document.
    #[inline]
    fn new_doc(&mut self, doc: DocId, arena: &mut MemoryArena) {
        let doc_gap = doc - self.current_doc;
        self.current_doc = doc;
        self.term_doc_freq += 1u32;
        let mut writer = self.stack.writer(arena);
        writer.write_u32_vint(doc_gap);
    }

    /// Records an occurrence without offset information.
    ///
    /// Both offsets are stored as `0` (written as `1` after the shift), which
    /// keeps every entry a full triple.
    #[inline]
    fn record_position(&mut self, position: u32, arena: &mut MemoryArena) {
        self.record_position_with_offsets(position, 0u32, 0u32, arena);
    }

    /// Appends the triple `(position + 1, offset_from + 1, offset_to + 1)`.
    #[inline]
    fn record_position_with_offsets(
        &mut self,
        position: u32,
        offset_from: u32,
        offset_to: u32,
        arena: &mut MemoryArena,
    ) {
        let shifted_position = position.wrapping_add(1u32);
        let shifted_from = offset_from.wrapping_add(1u32);
        let shifted_to = offset_to.wrapping_add(1u32);
        let mut writer = self.stack.writer(arena);
        writer.write_u32_vint(shifted_position);
        writer.write_u32_vint(shifted_from);
        writer.write_u32_vint(shifted_to);
    }

    /// Appends the end-of-document marker.
    #[inline]
    fn close_doc(&mut self, arena: &mut MemoryArena) {
        let mut writer = self.stack.writer(arena);
        writer.write_u32_vint(POSITION_END);
    }

    /// Decodes the stack and writes one entry per document: the doc id, the
    /// number of occurrences, and three parallel delta-encoded lists
    /// (positions, offsets-from, offsets-to).
    ///
    /// NOTE(review): the deltas use plain subtraction, so positions and both
    /// offsets are assumed to be non-decreasing within a document; a smaller
    /// value would overflow the subtraction — confirm against the callers.
    fn serialize(
        &self,
        arena: &MemoryArena,
        serializer: &mut FieldSerializer<'_>,
        buffer_lender: &mut BufferLender,
    ) {
        let (encoded, position_deltas, offset_from_deltas, offset_to_deltas) =
            buffer_lender.lend_all_with_offsets();
        self.stack.read_to_end(arena, encoded);
        let mut values = VInt32Reader::new(encoded.as_slice());
        let mut doc_id: DocId = 0;
        while let Some(doc_gap) = values.next() {
            doc_id += doc_gap;
            position_deltas.clear();
            offset_from_deltas.clear();
            offset_to_deltas.clear();
            // Starting from 1 cancels the `+ 1` shift for the first position.
            let mut previous_position_plus_one = 1u32;
            let mut previous_offset_from = 0u32;
            let mut previous_offset_to = 0u32;
            while let Some(position_plus_one) = values.next() {
                if position_plus_one == POSITION_END {
                    break;
                }
                position_deltas.push(position_plus_one - previous_position_plus_one);
                previous_position_plus_one = position_plus_one;
                // A triple cut short by the end of the data decodes its
                // missing offsets as `0` (stored form `1`).
                let offset_from = values.next().unwrap_or(1).wrapping_sub(1);
                let offset_to = values.next().unwrap_or(1).wrapping_sub(1);
                let offset_from_delta = offset_from - previous_offset_from;
                let offset_to_delta = offset_to - previous_offset_to;
                offset_from_deltas.push(offset_from_delta);
                offset_to_deltas.push(offset_to_delta);
                previous_offset_from = offset_from;
                previous_offset_to = offset_to;
            }
            serializer.write_doc_with_offsets(
                doc_id,
                position_deltas.len() as u32,
                position_deltas,
                offset_from_deltas,
                offset_to_deltas,
            );
        }
    }

    fn term_doc_freq(&self) -> Option<u32> {
        Some(self.term_doc_freq)
    }
}
#[cfg(test)]
mod tests {
    use common::write_u32_vint;

    use super::{BufferLender, VInt32Reader};

    /// A lent buffer must come back empty even though the previous borrower
    /// left data in it.
    #[test]
    fn test_buffer_lender() {
        let mut buffer_lender = BufferLender::default();
        for _ in 0..2 {
            let bytes = buffer_lender.lend_u8();
            assert!(bytes.is_empty());
            bytes.push(1u8);
        }
        for _ in 0..2 {
            let (_, values) = buffer_lender.lend_all();
            assert!(values.is_empty());
            values.push(1u32);
        }
    }

    /// Values written with `write_u32_vint` are read back unchanged by
    /// `VInt32Reader`, and take the expected number of bytes.
    #[test]
    fn test_vint_u32() {
        let vals = [0, 1, 324_234_234, u32::MAX];
        let mut buffer: Vec<u8> = Vec::new();
        for val in vals.iter().copied() {
            assert!(write_u32_vint(val, &mut buffer).is_ok());
        }
        assert_eq!(buffer.len(), 1 + 1 + 5 + 5);
        let decoded: Vec<u32> = VInt32Reader::new(buffer.as_slice()).collect();
        assert_eq!(decoded.as_slice(), &vals[..]);
    }
}