use std::collections::HashMap;
use std::io;
use mem_dbg::MemSize;
use crate::analysis::{Analyzer, TokenRef};
use crate::document::{DocValuesType, Document, Field, FieldValue, IndexOptions, StoredValue};
use crate::index::{FieldInfo, FieldInfos, PointDimensionConfig};
use crate::store;
use crate::util::BytesRef;
use crate::util::byte_block_pool::{
ByteBlockPool, ByteSlicePool, ByteSliceReader, ByteSliceWriter, DirectAllocator,
FIRST_LEVEL_SIZE,
};
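/// Per-field indexing flags copied out of [`FieldInfo`] so the per-token hot
/// path does not have to re-consult the `field_infos` map on every field.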
#[derive(Clone, Copy, Debug)]
struct FieldMeta {
number: u32,
index_options: IndexOptions,
doc_values_type: DocValuesType,
omit_norms: bool,
has_point_values: bool,
}
impl From<&FieldInfo> for FieldMeta {
fn from(fi: &FieldInfo) -> Self {
Self {
number: fi.number(),
index_options: fi.index_options(),
doc_values_type: fi.doc_values_type(),
omit_norms: fi.omit_norms(),
has_point_values: fi.has_point_values(),
}
}
}
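/// Parallel start/end character offsets for one term's occurrences, grouped
/// so a term-vector term can carry both arrays behind a single `Box`.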
#[derive(Clone, Debug, MemSize)]
pub struct OffsetBuffers {
pub start_offsets: Vec<i32>,
pub end_offsets: Vec<i32>,
}
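/// A term id packed into three little-endian bytes. Keeping the map value at
/// 3 bytes instead of an 8-byte `usize` shrinks the per-term dictionary
/// footprint, at the cost of the 24-bit [`TermMeta::MAX_TERM_ID`] ceiling.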
#[derive(Clone, Copy, Debug, MemSize)]
#[mem_size_flat]
pub struct TermMeta([u8; 3]);
impl TermMeta {
const MAX_TERM_ID: usize = 0xFF_FFFF;
pub fn new(term_id: usize) -> Self {
assert!(
term_id <= Self::MAX_TERM_ID,
"term_id {term_id} exceeds 24-bit max ({})",
Self::MAX_TERM_ID
);
Self([term_id as u8, (term_id >> 8) as u8, (term_id >> 16) as u8])
}
pub fn term_id(self) -> usize {
self.0[0] as usize | (self.0[1] as usize) << 8 | (self.0[2] as usize) << 16
}
}
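/// In-memory postings for one field, using Lucene-style byte slices: each
/// term owns a slice in `byte_pool` holding vint-encoded `(doc delta, freq)`
/// pairs and, when positions are enabled, a slice in `positions_pool` holding
/// vint position deltas. The doc currently being indexed is buffered in the
/// `current_*` arrays and only written out by `finalize_current_doc`, while
/// the `tv_*` arrays accumulate term-vector data that is drained per document.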
#[derive(Debug, MemSize)]
pub struct PostingsArray {
byte_pool: ByteBlockPool<DirectAllocator>,
byte_stream_starts: Vec<u32>,
byte_stream_addrs: Vec<u32>,
term_count: usize,
last_doc_ids: Vec<i32>,
current_doc_ids: Vec<i32>,
current_freqs: Vec<i32>,
positions_pool: Option<ByteBlockPool<DirectAllocator>>,
positions_stream_starts: Vec<u32>,
positions_stream_addrs: Vec<u32>,
last_positions: Vec<i32>,
tv_freqs: Option<Vec<i32>>,
tv_positions: Option<Vec<Vec<i32>>>,
tv_start_offsets: Option<Vec<Vec<i32>>>,
tv_end_offsets: Option<Vec<Vec<i32>>>,
has_freqs: bool,
has_positions: bool,
has_offsets: bool,
}
impl PostingsArray {
pub fn new(
has_freqs: bool,
has_positions: bool,
has_offsets: bool,
tv_positions: bool,
tv_offsets: bool,
) -> Self {
let mut byte_pool = ByteBlockPool::new(DirectAllocator);
byte_pool.next_buffer();
Self {
byte_pool,
byte_stream_starts: Vec::new(),
byte_stream_addrs: Vec::new(),
term_count: 0,
last_doc_ids: Vec::new(),
current_doc_ids: Vec::new(),
current_freqs: Vec::new(),
positions_pool: if has_positions {
let mut pool = ByteBlockPool::new(DirectAllocator);
pool.next_buffer();
Some(pool)
} else {
None
},
positions_stream_starts: Vec::new(),
positions_stream_addrs: Vec::new(),
last_positions: Vec::new(),
tv_freqs: if tv_positions || tv_offsets {
Some(Vec::new())
} else {
None
},
tv_positions: if tv_positions { Some(Vec::new()) } else { None },
tv_start_offsets: if tv_offsets { Some(Vec::new()) } else { None },
tv_end_offsets: if tv_offsets { Some(Vec::new()) } else { None },
has_freqs,
has_positions,
has_offsets,
}
}
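    /// Registers a new term: allocates its first byte slice (plus a positions
    /// slice when enabled), records each stream's start and current write
    /// address, and returns the new term id.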
pub fn add_term(&mut self) -> usize {
let tid = self.term_count;
let local_offset = ByteSlicePool::new_slice(&mut self.byte_pool, FIRST_LEVEL_SIZE);
let global_offset = local_offset as u32 + self.byte_pool.byte_offset() as u32;
self.byte_stream_starts.push(global_offset);
let writer = ByteSliceWriter::new(&self.byte_pool, local_offset);
self.byte_stream_addrs.push(writer.address() as u32);
self.term_count += 1;
self.last_doc_ids.push(0);
self.current_doc_ids.push(-1);
self.current_freqs.push(0);
if let Some(ref mut pool) = self.positions_pool {
let local_offset = ByteSlicePool::new_slice(pool, FIRST_LEVEL_SIZE);
let global_offset = local_offset as u32 + pool.byte_offset() as u32;
self.positions_stream_starts.push(global_offset);
let writer = ByteSliceWriter::new(pool, local_offset);
self.positions_stream_addrs.push(writer.address() as u32);
}
if self.has_positions {
self.last_positions.push(0);
}
if let Some(ref mut v) = self.tv_freqs {
v.push(0);
}
if let Some(ref mut v) = self.tv_positions {
v.push(Vec::new());
}
if let Some(ref mut v) = self.tv_start_offsets {
v.push(Vec::new());
}
if let Some(ref mut v) = self.tv_end_offsets {
v.push(Vec::new());
}
tid
}
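    /// Records one token occurrence. The doc id and freq are only buffered
    /// (see `finalize_current_doc`); position deltas are written to the
    /// positions stream immediately. Offsets are accepted but not encoded
    /// yet; reaching the offsets branch panics via `todo!`.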
#[inline]
pub fn record_occurrence(
&mut self,
tid: usize,
doc_id: i32,
position: i32,
start_offset: i32,
end_offset: i32,
) {
if self.current_doc_ids[tid] != doc_id {
self.start_doc(tid, doc_id);
} else if self.has_freqs {
self.current_freqs[tid] += 1;
}
if self.has_positions {
let pos_delta = position - self.last_positions[tid];
let positions_pool = self.positions_pool.as_mut().unwrap();
let mut writer =
ByteSliceWriter::from_address(self.positions_stream_addrs[tid] as usize);
writer.write_vint(positions_pool, pos_delta);
self.positions_stream_addrs[tid] = writer.address() as u32;
self.last_positions[tid] = position;
if self.has_offsets {
let _ = (start_offset, end_offset);
todo!("offset delta encoding requires per-term last_end_offset tracking");
}
}
}
fn start_doc(&mut self, tid: usize, doc_id: i32) {
if self.current_doc_ids[tid] >= 0 {
self.finalize_current_doc(tid);
}
self.current_doc_ids[tid] = doc_id;
self.current_freqs[tid] = 1;
if self.has_positions {
self.last_positions[tid] = 0;
}
}
pub fn start_doc_explicit(&mut self, tid: usize, doc_id: i32) {
self.start_doc(tid, doc_id);
}
pub fn set_freq(&mut self, tid: usize, freq: i32) {
self.current_freqs[tid] = freq;
}
#[inline]
pub fn record_tv_occurrence(
&mut self,
tid: usize,
position: i32,
start_offset: i32,
end_offset: i32,
) {
self.tv_freqs.as_mut().unwrap()[tid] += 1;
if let Some(ref mut positions) = self.tv_positions {
positions[tid].push(position);
}
if let Some(ref mut start_offsets) = self.tv_start_offsets {
start_offsets[tid].push(start_offset);
}
if let Some(ref mut end_offsets) = self.tv_end_offsets {
end_offsets[tid].push(end_offset);
}
}
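    /// Flushes the buffered doc for `tid` into its byte stream as a vint
    /// doc-id delta, followed by a vint freq when frequencies are tracked.
    /// A no-op when no doc is in progress for this term.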
pub fn finalize_current_doc(&mut self, tid: usize) {
if self.current_doc_ids[tid] < 0 {
return;
}
let delta = self.current_doc_ids[tid] - self.last_doc_ids[tid];
let mut writer = ByteSliceWriter::from_address(self.byte_stream_addrs[tid] as usize);
writer.write_vint(&mut self.byte_pool, delta);
if self.has_freqs {
writer.write_vint(&mut self.byte_pool, self.current_freqs[tid]);
}
self.byte_stream_addrs[tid] = writer.address() as u32;
self.last_doc_ids[tid] = self.current_doc_ids[tid];
self.current_doc_ids[tid] = -1;
self.current_freqs[tid] = 0;
}
pub fn finalize_all(&mut self) {
for tid in 0..self.current_doc_ids.len() {
self.finalize_current_doc(tid);
}
}
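    /// Decodes the entire postings stream for `tid` into `buf`, undoing the
    /// delta encoding. With offsets enabled, the two offset vints expected
    /// after each position delta are read and discarded. A minimal sketch of
    /// consuming the result:
    ///
    /// ```ignore
    /// let mut buf = PostingsBuffer::new();
    /// postings.decode_into(tid, &mut buf)?;
    /// for (doc_id, freq, positions) in buf.as_postings_data() {
    ///     // doc ids ascend; `positions` is empty when positions are disabled
    /// }
    /// ```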
pub fn decode_into(&self, tid: usize, buf: &mut PostingsBuffer) -> io::Result<()> {
buf.clear();
let start = self.byte_stream_starts[tid] as usize;
let end = self.byte_stream_addrs[tid] as usize;
let mut reader = ByteSliceReader::new(&self.byte_pool, start, end);
let mut last_doc_id = 0;
let mut pos_reader = self.positions_pool.as_ref().map(|pool| {
ByteSliceReader::new(
pool,
self.positions_stream_starts[tid] as usize,
self.positions_stream_addrs[tid] as usize,
)
});
while !reader.eof() {
let doc_delta = store::read_vint(&mut reader)?;
let doc_id = last_doc_id + doc_delta;
last_doc_id = doc_id;
let freq = if self.has_freqs {
store::read_vint(&mut reader)?
} else {
1
};
buf.doc_ids.push(doc_id);
buf.freqs.push(freq);
if let Some(ref mut pos_r) = pos_reader {
let start = buf.positions.len();
let mut last_pos = 0;
for _ in 0..freq {
let pos_delta = store::read_vint(pos_r)?;
let pos = last_pos + pos_delta;
buf.positions.push(pos);
last_pos = pos;
if self.has_offsets {
store::read_vint(pos_r)?;
store::read_vint(pos_r)?;
}
}
buf.position_starts.push(start);
}
}
buf.doc_freq = buf.doc_ids.len() as i32;
buf.total_term_freq = buf.freqs.iter().map(|&f| f as i64).sum();
Ok(())
}
pub fn len(&self) -> usize {
self.term_count
}
}
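/// Reusable decode target for [`PostingsArray::decode_into`]: flat doc-id,
/// freq, and position arrays, with `position_starts[i]` marking where the
/// positions of the `i`-th doc begin inside `positions`.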
#[derive(Debug)]
pub struct PostingsBuffer {
pub doc_ids: Vec<i32>,
pub freqs: Vec<i32>,
pub positions: Vec<i32>,
pub position_starts: Vec<usize>,
pub total_term_freq: i64,
pub doc_freq: i32,
}
impl PostingsBuffer {
pub fn new() -> Self {
Self {
doc_ids: Vec::new(),
freqs: Vec::new(),
positions: Vec::new(),
position_starts: Vec::new(),
total_term_freq: 0,
doc_freq: 0,
}
}
pub fn clear(&mut self) {
self.doc_ids.clear();
self.freqs.clear();
self.positions.clear();
self.position_starts.clear();
self.total_term_freq = 0;
self.doc_freq = 0;
}
pub fn as_postings_data(&self) -> Vec<(i32, i32, &[i32])> {
self.doc_ids
.iter()
.enumerate()
.map(|(i, &doc_id)| {
let freq = self.freqs[i];
let positions = if self.position_starts.is_empty() {
&[] as &[i32]
} else {
let start = self.position_starts[i];
let end = if i + 1 < self.position_starts.len() {
self.position_starts[i + 1]
} else {
self.positions.len()
};
&self.positions[start..end]
};
(doc_id, freq, positions)
})
.collect()
}
}
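/// Everything accumulated for one field over the in-memory segment: the term
/// dictionary, postings, doc values, norms, and point values.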
#[derive(Debug, MemSize)]
pub struct PerFieldData {
term_ids: HashMap<String, TermMeta>,
pub postings: PostingsArray,
pub doc_values: DocValuesAccumulator,
pub norms: Vec<i64>,
pub norms_docs: Vec<i32>,
pub points: Vec<(i32, Vec<u8>)>,
}
impl PerFieldData {
pub fn new() -> Self {
Self {
term_ids: HashMap::new(),
postings: PostingsArray::new(false, false, false, false, false),
doc_values: DocValuesAccumulator::None,
norms: Vec::new(),
norms_docs: Vec::new(),
points: Vec::new(),
}
}
fn get_or_insert_term(
&mut self,
term: &str,
has_freqs: bool,
has_positions: bool,
has_offsets: bool,
tv_positions: bool,
tv_offsets: bool,
) -> usize {
if let Some(meta) = self.term_ids.get(term) {
return meta.term_id();
}
if self.postings.len() == 0 {
self.postings = PostingsArray::new(
has_freqs,
has_positions,
has_offsets,
tv_positions,
tv_offsets,
);
}
let tid = self.postings.add_term();
self.term_ids.insert(term.to_string(), TermMeta::new(tid));
tid
}
pub fn sorted_postings(&self) -> Vec<(&str, usize)> {
let mut pairs: Vec<(&str, usize)> = self
.term_ids
.iter()
.map(|(term, meta)| (term.as_str(), meta.term_id()))
.collect();
pairs.sort_by(|(a, _), (b, _)| a.as_bytes().cmp(b.as_bytes()));
pairs
}
pub fn has_postings(&self) -> bool {
!self.term_ids.is_empty()
}
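    /// Drains this field's term-vector data for the current document into a
    /// [`TermVectorField`], resetting the per-term `tv_*` buffers for reuse
    /// by the next document. Returns `None` when no term of this field
    /// occurred in the document.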
fn take_term_vector_data(
&mut self,
field_number: u32,
tv_positions: bool,
tv_offsets: bool,
tv_payloads: bool,
) -> Option<TermVectorField> {
let mut terms: Vec<TermVectorTerm> = Vec::new();
let tv_freqs = self.postings.tv_freqs.as_mut()?;
for (term_text, meta) in &self.term_ids {
let tid = meta.term_id();
if tv_freqs[tid] > 0 {
let positions = self
.postings
.tv_positions
.as_mut()
.map(|v| std::mem::take(&mut v[tid]))
.unwrap_or_default();
                let offsets = if let (Some(starts), Some(ends)) = (
                    self.postings.tv_start_offsets.as_mut(),
                    self.postings.tv_end_offsets.as_mut(),
                ) {
Some(Box::new(OffsetBuffers {
start_offsets: std::mem::take(&mut starts[tid]),
end_offsets: std::mem::take(&mut ends[tid]),
}))
} else {
None
};
terms.push(TermVectorTerm {
term: term_text.clone(),
freq: tv_freqs[tid],
positions,
offsets,
});
tv_freqs[tid] = 0;
}
}
if terms.is_empty() {
return None;
}
terms.sort_by(|a, b| a.term.as_bytes().cmp(b.term.as_bytes()));
Some(TermVectorField {
field_number,
has_positions: tv_positions,
has_offsets: tv_offsets,
has_payloads: tv_payloads,
terms,
})
}
}
#[cfg(test)]
impl PerFieldData {
pub fn num_terms(&self) -> usize {
self.term_ids.len()
}
pub fn term_id(&self, term: &str) -> Option<usize> {
self.term_ids.get(term).map(|meta| meta.term_id())
}
}
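/// Doc values collected for a field, one variant per [`DocValuesType`]. Each
/// entry pairs a doc id with its value(s); the multi-valued variants coalesce
/// repeated values for the same doc id into a single entry.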
#[derive(Clone, Debug, MemSize)]
pub enum DocValuesAccumulator {
None,
Numeric(Vec<(i32, i64)>),
Binary(Vec<(i32, Vec<u8>)>),
Sorted(Vec<(i32, BytesRef)>),
SortedNumeric(Vec<(i32, Vec<i64>)>),
SortedSet(Vec<(i32, Vec<BytesRef>)>),
}
#[derive(Clone, Debug, MemSize)]
pub struct StoredDoc {
    pub fields: Vec<(u32, StoredValue)>,
}
#[derive(Clone, Debug, MemSize)]
pub struct TermVectorTerm {
pub term: String,
pub freq: i32,
pub positions: Vec<i32>,
pub offsets: Option<Box<OffsetBuffers>>,
}
#[derive(Clone, Debug, MemSize)]
pub struct TermVectorField {
pub field_number: u32,
pub has_positions: bool,
pub has_offsets: bool,
pub has_payloads: bool,
pub terms: Vec<TermVectorTerm>,
}
#[derive(Clone, Debug, MemSize)]
pub struct TermVectorDoc {
pub fields: Vec<TermVectorField>,
}
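/// The in-memory indexing chain: consumes [`Document`]s and accumulates
/// per-field postings, stored fields, term vectors, doc values, norms, and
/// points until the segment is flushed. A minimal sketch of the intended
/// flow, using the same helpers as the tests below:
///
/// ```ignore
/// let mut chain = IndexingChain::new();
/// let analyzer = StandardAnalyzer::new();
/// let mut doc = Document::new();
/// doc.add(document::text_field("contents", "hello world"));
/// chain.process_document(doc, &analyzer)?;
/// chain.finalize_pending_postings();
/// let field_infos = chain.build_field_infos();
/// ```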
#[derive(MemSize)]
pub struct IndexingChain {
per_field: HashMap<String, PerFieldData>,
stored_docs: Vec<StoredDoc>,
term_vector_docs: Vec<TermVectorDoc>,
field_number_counter: u32,
field_infos: HashMap<String, FieldInfo>,
num_docs: i32,
global_field_numbers: HashMap<String, u32>,
lowercase_buf: String,
}
impl Default for IndexingChain {
fn default() -> Self {
Self::new()
}
}
impl IndexingChain {
pub fn new() -> Self {
Self {
per_field: HashMap::new(),
stored_docs: Vec::new(),
term_vector_docs: Vec::new(),
field_number_counter: 0,
field_infos: HashMap::new(),
num_docs: 0,
global_field_numbers: HashMap::new(),
lowercase_buf: String::new(),
}
}
pub fn with_global_field_numbers(
global_field_numbers: HashMap<String, u32>,
next_field_number: u32,
) -> Self {
Self {
per_field: HashMap::new(),
stored_docs: Vec::new(),
term_vector_docs: Vec::new(),
field_number_counter: next_field_number,
field_infos: HashMap::new(),
num_docs: 0,
global_field_numbers,
lowercase_buf: String::new(),
}
}
pub fn field_number_mappings(&self) -> impl Iterator<Item = (&str, u32)> {
self.field_infos
.iter()
.map(|(name, fi)| (name.as_str(), fi.number()))
}
pub fn per_field(&self) -> &HashMap<String, PerFieldData> {
&self.per_field
}
pub fn stored_docs(&self) -> &[StoredDoc] {
&self.stored_docs
}
pub fn term_vector_docs(&self) -> &[TermVectorDoc] {
&self.term_vector_docs
}
pub fn take_last_tv_doc(&mut self) -> Option<TermVectorDoc> {
self.term_vector_docs.pop()
}
pub fn num_docs(&self) -> i32 {
self.num_docs
}
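    /// Flushes any still-buffered per-term doc state into the byte streams;
    /// the tests call this before decoding postings, and flushing a segment
    /// is expected to do the same.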
pub fn finalize_pending_postings(&mut self) {
for pf in self.per_field.values_mut() {
pf.postings.finalize_all();
}
}
pub fn ram_bytes_used(&self) -> usize {
self.mem_size(mem_dbg::SizeFlags::CAPACITY)
}
pub fn log_ram_breakdown(&self, label: &str) {
let flags = mem_dbg::SizeFlags::CAPACITY;
let total = self.mem_size(flags);
let mut postings_bytes = 0usize;
let mut byte_streams_bytes = 0usize;
let mut position_streams_bytes = 0usize;
let mut term_ids_bytes = 0usize;
let mut dv_bytes = 0usize;
let mut norms_bytes = 0usize;
let mut points_bytes = 0usize;
for pfd in self.per_field.values() {
term_ids_bytes += pfd.term_ids.mem_size(flags);
postings_bytes += pfd.postings.mem_size(flags);
byte_streams_bytes += pfd.postings.byte_pool.mem_size(flags)
+ pfd.postings.byte_stream_starts.mem_size(flags)
+ pfd.postings.byte_stream_addrs.mem_size(flags);
position_streams_bytes += pfd.postings.positions_pool.mem_size(flags)
+ pfd.postings.positions_stream_starts.mem_size(flags)
+ pfd.postings.positions_stream_addrs.mem_size(flags);
dv_bytes += pfd.doc_values.mem_size(flags);
norms_bytes += pfd.norms.mem_size(flags) + pfd.norms_docs.mem_size(flags);
points_bytes += pfd.points.mem_size(flags);
}
let stored_bytes = self.stored_docs.mem_size(flags);
let tv_bytes = self.term_vector_docs.mem_size(flags);
let field_infos_bytes = self.field_infos.mem_size(flags);
log::info!(
"RAM[{}] total={} | postings={} (byte_streams={} position_streams={}) term_ids={} stored={} tv={} dv={} norms={} points={} field_infos={} | docs={}",
label,
fmt_bytes(total),
fmt_bytes(postings_bytes),
fmt_bytes(byte_streams_bytes),
fmt_bytes(position_streams_bytes),
fmt_bytes(term_ids_bytes),
fmt_bytes(stored_bytes),
fmt_bytes(tv_bytes),
fmt_bytes(dv_bytes),
fmt_bytes(norms_bytes),
fmt_bytes(points_bytes),
fmt_bytes(field_infos_bytes),
self.num_docs,
);
}
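    /// Indexes one document: assigns the next doc id, then routes each field
    /// to the inverted index, stored fields, doc values, and points according
    /// to its [`FieldMeta`]. Stored-field and term-vector entries are pushed
    /// once per document so both remain aligned with doc ids.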
pub fn process_document(&mut self, doc: Document, analyzer: &dyn Analyzer) -> io::Result<()> {
let doc_id = self.num_docs;
let mut stored_fields: Vec<(u32, StoredValue)> = Vec::new();
let mut lowercase_buf = std::mem::take(&mut self.lowercase_buf);
let mut tv_fields: Vec<TermVectorField> = Vec::new();
for mut field in doc.fields {
let meta = self.get_or_create_field_meta(&field);
            // `entry` needs an owned key: the borrow checker rejects the
            // `get_mut`-then-insert pattern that would avoid this allocation.
            let per_field = self
                .per_field
                .entry(field.name().to_string())
                .or_insert_with(PerFieldData::new);
if meta.index_options != IndexOptions::None
&& let Some(tvf) = Self::process_indexed_field(
per_field,
&meta,
&mut field,
doc_id,
analyzer,
&mut lowercase_buf,
)?
{
tv_fields.push(tvf);
}
if let Some(stored) = field.stored_value() {
stored_fields.push((meta.number, stored));
}
if meta.doc_values_type != DocValuesType::None {
Self::process_doc_values(per_field, &meta, &field, doc_id);
}
if meta.has_point_values
&& let Some(point_bytes) = field.point_bytes()
{
per_field.points.push((doc_id, point_bytes));
}
}
self.lowercase_buf = lowercase_buf;
self.stored_docs.push(StoredDoc {
fields: stored_fields,
});
self.term_vector_docs
.push(TermVectorDoc { fields: tv_fields });
self.num_docs += 1;
Ok(())
}
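    /// Looks up or creates the [`FieldInfo`] for a field, assigning a field
    /// number (preferring the global mapping carried over from earlier
    /// segments) and stamping the per-field postings/doc-values codec
    /// attributes on indexed fields.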
fn get_or_create_field_meta(&mut self, field: &Field) -> FieldMeta {
if let Some(fi) = self.field_infos.get(field.name()) {
return FieldMeta::from(fi);
}
let number = if let Some(&num) = self.global_field_numbers.get(field.name()) {
num
} else {
let num = self.field_number_counter;
self.field_number_counter += 1;
num
};
let ft = field.field_type();
let point_config = PointDimensionConfig {
dimension_count: ft.point_dimension_count(),
index_dimension_count: ft.point_index_dimension_count(),
num_bytes: ft.point_num_bytes(),
};
let mut fi = FieldInfo::new(
field.name().to_string(),
number,
ft.store_term_vectors(),
ft.omit_norms(),
ft.index_options(),
ft.doc_values_type(),
point_config,
);
if ft.index_options() != IndexOptions::None {
fi.put_attribute(
"PerFieldPostingsFormat.format".to_string(),
"Lucene103".to_string(),
);
fi.put_attribute("PerFieldPostingsFormat.suffix".to_string(), "0".to_string());
}
if ft.doc_values_type() != DocValuesType::None {
fi.put_attribute(
"PerFieldDocValuesFormat.format".to_string(),
"Lucene90".to_string(),
);
fi.put_attribute(
"PerFieldDocValuesFormat.suffix".to_string(),
"0".to_string(),
);
}
let meta = FieldMeta::from(&fi);
self.field_infos.insert(field.name().to_string(), fi);
meta
}
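    /// Inverts one indexed field. Tokenized text and reader values run
    /// through the analyzer; untokenized values are indexed as a single raw
    /// token; `Feature` fields store an explicit freq instead of real
    /// occurrences. Returns the field's term-vector data when requested.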
fn process_indexed_field(
per_field: &mut PerFieldData,
meta: &FieldMeta,
field: &mut Field,
doc_id: i32,
analyzer: &dyn Analyzer,
buf: &mut String,
) -> io::Result<Option<TermVectorField>> {
let has_positions = meta.index_options >= IndexOptions::DocsAndFreqsAndPositions;
let has_offsets = meta.index_options >= IndexOptions::DocsAndFreqsAndPositionsAndOffsets;
let has_freqs = meta.index_options >= IndexOptions::DocsAndFreqs;
let store_tv = field.field_type().store_term_vectors();
let tv_positions = field.field_type().store_term_vector_positions();
let tv_offsets = field.field_type().store_term_vector_offsets();
let tv_payloads = field.field_type().store_term_vector_payloads();
let mut position: i32 = -1;
let mut field_length: i32 = 0;
let mut record_token = |per_field: &mut PerFieldData, token_ref: TokenRef<'_>| {
position += token_ref.position_increment as i32;
field_length += 1;
let tid = per_field.get_or_insert_term(
token_ref.text,
has_freqs,
has_positions,
has_offsets,
tv_positions,
tv_offsets,
);
per_field.postings.record_occurrence(
tid,
doc_id,
position,
token_ref.start_offset as i32,
token_ref.end_offset as i32,
);
if store_tv {
per_field.postings.record_tv_occurrence(
tid,
position,
token_ref.start_offset as i32,
token_ref.end_offset as i32,
);
}
};
if let FieldValue::Feature { term, freq } = field.value() {
let tid = per_field.get_or_insert_term(term, has_freqs, false, false, false, false);
per_field.postings.start_doc_explicit(tid, doc_id);
per_field.postings.set_freq(tid, *freq);
return Ok(None);
}
if field.field_type().tokenized() {
match field.value() {
FieldValue::Text(text) => {
analyzer.analyze_to(text, buf, &mut |tr| {
record_token(per_field, tr);
});
}
FieldValue::Reader(_) => {
let FieldValue::Reader(mut reader) =
std::mem::replace(field.value_mut(), FieldValue::Text(String::new()))
else {
unreachable!()
};
analyzer.analyze_reader(&mut *reader, buf, &mut |tr| {
record_token(per_field, tr);
})?;
}
_ => return Ok(None),
}
} else {
let text = match field.value() {
FieldValue::Text(text) => text,
_ => return Ok(None),
};
position += 1;
field_length = 1;
let tid = per_field.get_or_insert_term(
text,
has_freqs,
has_positions,
has_offsets,
tv_positions,
tv_offsets,
);
per_field
.postings
.record_occurrence(tid, doc_id, position, 0, text.len() as i32);
if store_tv {
per_field
.postings
.record_tv_occurrence(tid, position, 0, text.len() as i32);
}
}
if !meta.omit_norms && field_length > 0 {
let norm = compute_norm(field_length);
per_field.norms.push(norm);
per_field.norms_docs.push(doc_id);
}
if store_tv {
return Ok(per_field.take_term_vector_data(
meta.number,
tv_positions,
tv_offsets,
tv_payloads,
));
}
Ok(None)
}
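    /// Appends the field's value to the accumulator matching its
    /// [`DocValuesType`], lazily switching the accumulator from `None` to the
    /// appropriate variant on first use.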
fn process_doc_values(
per_field: &mut PerFieldData,
meta: &FieldMeta,
field: &Field,
doc_id: i32,
) {
match meta.doc_values_type {
DocValuesType::Numeric => {
if let Some(v) = field.numeric_value() {
if let DocValuesAccumulator::Numeric(ref mut vals) = per_field.doc_values {
vals.push((doc_id, v));
} else {
per_field.doc_values = DocValuesAccumulator::Numeric(vec![(doc_id, v)]);
}
}
}
DocValuesType::Binary => {
if let FieldValue::Bytes(b) = field.value() {
if let DocValuesAccumulator::Binary(ref mut vals) = per_field.doc_values {
vals.push((doc_id, b.clone()));
} else {
per_field.doc_values =
DocValuesAccumulator::Binary(vec![(doc_id, b.clone())]);
}
}
}
DocValuesType::Sorted => match field.value() {
FieldValue::Bytes(b) => {
let term = BytesRef::new(b.clone());
if let DocValuesAccumulator::Sorted(ref mut vals) = per_field.doc_values {
vals.push((doc_id, term));
} else {
per_field.doc_values = DocValuesAccumulator::Sorted(vec![(doc_id, term)]);
}
}
FieldValue::Text(s) => {
let term = BytesRef::from_utf8(s);
if let DocValuesAccumulator::Sorted(ref mut vals) = per_field.doc_values {
vals.push((doc_id, term));
} else {
per_field.doc_values = DocValuesAccumulator::Sorted(vec![(doc_id, term)]);
}
}
_ => {}
},
DocValuesType::SortedNumeric => {
if let Some(v) = field.numeric_value() {
if let DocValuesAccumulator::SortedNumeric(ref mut vals) = per_field.doc_values
{
if let Some(last) = vals.last_mut().filter(|(id, _)| *id == doc_id) {
last.1.push(v);
} else {
vals.push((doc_id, vec![v]));
}
} else {
per_field.doc_values =
DocValuesAccumulator::SortedNumeric(vec![(doc_id, vec![v])]);
}
}
}
DocValuesType::SortedSet => {
if let FieldValue::Text(s) = field.value() {
let term = BytesRef::from_utf8(s);
if let DocValuesAccumulator::SortedSet(ref mut vals) = per_field.doc_values {
if let Some(last) = vals.last_mut().filter(|(id, _)| *id == doc_id) {
last.1.push(term);
} else {
vals.push((doc_id, vec![term]));
}
} else {
per_field.doc_values =
DocValuesAccumulator::SortedSet(vec![(doc_id, vec![term])]);
}
}
}
DocValuesType::None => {}
}
}
pub fn build_field_infos(&self) -> FieldInfos {
let mut fields: Vec<FieldInfo> = self.field_infos.values().cloned().collect();
fields.sort_by_key(|fi| fi.number());
FieldInfos::new(fields)
}
}
fn fmt_bytes(bytes: usize) -> String {
if bytes < 1024 {
format!("{bytes}B")
} else if bytes < 1024 * 1024 {
format!("{:.1}KB", bytes as f64 / 1024.0)
} else {
format!("{:.2}MB", bytes as f64 / (1024.0 * 1024.0))
}
}
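// Norms are field lengths squeezed into one byte with the same 4-bit-float
// scheme as Lucene's `SmallFloat`: lengths below `NUM_FREE_VALUES` (24) are
// stored verbatim, larger ones as a 3-bit mantissa plus a biased shift.
// Worked by hand from `long_to_int4`: `int_to_byte4(100)` encodes
// 100 - 24 = 76, where num_bits = 7 and shift = 3, giving
// ((76 >> 3) & 0x07) | ((3 + 1) << 3) = 33, so the result is 24 + 33 = 57.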
fn compute_norm(field_length: i32) -> i64 {
encode_norm_value(field_length) as i8 as i64
}
fn encode_norm_value(length: i32) -> u8 {
int_to_byte4(length)
}
fn long_to_int4(i: i64) -> i32 {
assert!(i >= 0);
let num_bits = 64 - (i as u64).leading_zeros();
if num_bits < 4 {
i as i32
} else {
let shift = num_bits - 4;
let mut encoded = (i as u64 >> shift) as i32;
encoded &= 0x07;
encoded |= (shift as i32 + 1) << 3;
encoded
}
}
/// `long_to_int4(i32::MAX as i64)`, the largest encoded int4 value; mirrors
/// Lucene's `SmallFloat.MAX_INT4`.
const MAX_INT4: u32 = 231;
/// Lengths below this threshold are stored verbatim; larger values are
/// int4-encoded and offset by this amount.
const NUM_FREE_VALUES: u32 = 255 - MAX_INT4;
fn int_to_byte4(i: i32) -> u8 {
if i < 0 {
return 0;
}
if (i as u32) < NUM_FREE_VALUES {
i as u8
} else {
(NUM_FREE_VALUES + long_to_int4(i as i64 - NUM_FREE_VALUES as i64) as u32) as u8
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::analysis::standard::StandardAnalyzer;
use crate::document;
#[test]
fn test_term_meta_roundtrip() {
let meta = TermMeta::new(0);
assert_eq!(meta.term_id(), 0);
let meta = TermMeta::new(12345);
assert_eq!(meta.term_id(), 12345);
let meta = TermMeta::new(TermMeta::MAX_TERM_ID);
assert_eq!(meta.term_id(), TermMeta::MAX_TERM_ID);
}
#[test]
#[should_panic(expected = "exceeds 24-bit max")]
fn test_term_meta_overflow_panics() {
TermMeta::new(TermMeta::MAX_TERM_ID + 1);
}
fn make_analyzer() -> StandardAnalyzer {
StandardAnalyzer::new()
}
fn decode_term(pfd: &PerFieldData, term: &str) -> PostingsBuffer {
let tid = pfd
.term_id(term)
.unwrap_or_else(|| panic!("term not found: {term}"));
let mut buf = PostingsBuffer::new();
pfd.postings.decode_into(tid, &mut buf).unwrap();
buf
}
#[test]
fn test_process_single_document() {
let mut chain = IndexingChain::new();
let analyzer = make_analyzer();
let mut doc = Document::new();
doc.add(document::keyword_field("path", "/foo/bar.txt"));
doc.add(document::long_field("modified", 1000));
doc.add(document::text_field("contents", "hello world"));
chain.process_document(doc, &analyzer).unwrap();
assert_eq!(chain.num_docs(), 1);
assert_len_eq_x!(&chain.stored_docs(), 1);
chain.finalize_pending_postings();
let path_data = chain.per_field().get("path").unwrap();
assert_eq!(path_data.num_terms(), 1);
let buf = decode_term(path_data, "/foo/bar.txt");
assert_eq!(buf.doc_freq, 1);
assert_eq!(buf.doc_ids[0], 0);
let contents_data = chain.per_field().get("contents").unwrap();
assert_eq!(contents_data.num_terms(), 2);
assert_some!(contents_data.term_id("hello"));
assert_some!(contents_data.term_id("world"));
assert_len_eq_x!(&contents_data.norms, 1);
assert_eq!(contents_data.norms_docs[0], 0);
assert_is_empty!(path_data.norms);
let modified_data = chain.per_field().get("modified").unwrap();
assert_len_eq_x!(&modified_data.points, 1);
assert_eq!(modified_data.points[0].0, 0);
assert_len_eq_x!(&chain.stored_docs()[0].fields, 1);
}
#[test]
fn test_process_multiple_documents() {
let mut chain = IndexingChain::new();
let analyzer = make_analyzer();
let mut doc1 = Document::new();
doc1.add(document::keyword_field("path", "/a.txt"));
doc1.add(document::text_field("contents", "hello world"));
let mut doc2 = Document::new();
doc2.add(document::keyword_field("path", "/b.txt"));
doc2.add(document::text_field("contents", "hello rust"));
chain.process_document(doc1, &analyzer).unwrap();
chain.process_document(doc2, &analyzer).unwrap();
chain.finalize_pending_postings();
assert_eq!(chain.num_docs(), 2);
let contents_data = chain.per_field().get("contents").unwrap();
let hello = decode_term(contents_data, "hello");
assert_eq!(hello.doc_freq, 2);
assert_eq!(hello.doc_ids[0], 0);
assert_eq!(hello.doc_ids[1], 1);
let world = decode_term(contents_data, "world");
assert_eq!(world.doc_freq, 1);
assert_eq!(world.doc_ids[0], 0);
let rust_buf = decode_term(contents_data, "rust");
assert_eq!(rust_buf.doc_freq, 1);
assert_eq!(rust_buf.doc_ids[0], 1);
}
#[test]
fn test_positions_tracked() {
let mut chain = IndexingChain::new();
let analyzer = make_analyzer();
let mut doc = Document::new();
doc.add(document::text_field("contents", "hello world hello"));
chain.process_document(doc, &analyzer).unwrap();
chain.finalize_pending_postings();
let contents = chain.per_field().get("contents").unwrap();
let hello = decode_term(contents, "hello");
assert_eq!(hello.freqs[0], 2);
assert_len_eq_x!(&hello.positions, 2);
assert_eq!(hello.positions[0], 0);
assert_eq!(hello.positions[1], 2);
}
#[test]
fn test_field_infos_built() {
let mut chain = IndexingChain::new();
let analyzer = make_analyzer();
let mut doc = Document::new();
doc.add(document::keyword_field("path", "/foo.txt"));
doc.add(document::long_field("modified", 100));
doc.add(document::text_field("contents", "hello"));
chain.process_document(doc, &analyzer).unwrap();
let fis = chain.build_field_infos();
assert_len_eq_x!(&fis, 3);
assert!(fis.has_postings());
assert!(fis.has_doc_values());
assert!(fis.has_point_values());
assert!(fis.has_norms());
}
#[test]
fn test_doc_values_sorted_numeric() {
let mut chain = IndexingChain::new();
let analyzer = make_analyzer();
let mut doc = Document::new();
doc.add(document::long_field("modified", 42));
chain.process_document(doc, &analyzer).unwrap();
let modified = chain.per_field().get("modified").unwrap();
if let DocValuesAccumulator::SortedNumeric(ref vals) = modified.doc_values {
assert_len_eq_x!(&vals, 1);
assert_eq!(vals[0], (0, vec![42]));
} else {
panic!("expected SortedNumeric");
}
}
#[test]
fn test_doc_values_sorted_set() {
let mut chain = IndexingChain::new();
let analyzer = make_analyzer();
let mut doc = Document::new();
doc.add(document::keyword_field("path", "/foo.txt"));
chain.process_document(doc, &analyzer).unwrap();
let path_data = chain.per_field().get("path").unwrap();
if let DocValuesAccumulator::SortedSet(ref vals) = path_data.doc_values {
assert_len_eq_x!(&vals, 1);
assert_eq!(vals[0].0, 0);
assert_eq!(vals[0].1[0], BytesRef::from_utf8("/foo.txt"));
} else {
panic!("expected SortedSet");
}
}
#[test]
fn test_doc_values_numeric() {
let mut chain = IndexingChain::new();
let analyzer = make_analyzer();
let mut doc = Document::new();
doc.add(document::numeric_doc_values_field("count", 99));
chain.process_document(doc, &analyzer).unwrap();
let count_data = chain.per_field().get("count").unwrap();
if let DocValuesAccumulator::Numeric(ref vals) = count_data.doc_values {
assert_len_eq_x!(&vals, 1);
assert_eq!(vals[0], (0, 99));
} else {
panic!("expected Numeric");
}
}
#[test]
fn test_doc_values_binary() {
let mut chain = IndexingChain::new();
let analyzer = make_analyzer();
let mut doc = Document::new();
doc.add(document::binary_doc_values_field("payload", vec![1, 2, 3]));
chain.process_document(doc, &analyzer).unwrap();
let payload_data = chain.per_field().get("payload").unwrap();
if let DocValuesAccumulator::Binary(ref vals) = payload_data.doc_values {
assert_len_eq_x!(&vals, 1);
assert_eq!(vals[0], (0, vec![1, 2, 3]));
} else {
panic!("expected Binary");
}
}
#[test]
fn test_doc_values_sorted() {
let mut chain = IndexingChain::new();
let analyzer = make_analyzer();
let mut doc = Document::new();
doc.add(document::sorted_doc_values_field("category", b"animals"));
chain.process_document(doc, &analyzer).unwrap();
let cat_data = chain.per_field().get("category").unwrap();
if let DocValuesAccumulator::Sorted(ref vals) = cat_data.doc_values {
assert_len_eq_x!(&vals, 1);
assert_eq!(vals[0].0, 0);
assert_eq!(vals[0].1, BytesRef::new(b"animals".to_vec()));
} else {
panic!("expected Sorted");
}
}
#[test]
fn test_doc_values_numeric_multiple_docs() {
let mut chain = IndexingChain::new();
let analyzer = make_analyzer();
for i in 0..3 {
let mut doc = Document::new();
doc.add(document::numeric_doc_values_field("count", i * 10));
chain.process_document(doc, &analyzer).unwrap();
}
let count_data = chain.per_field().get("count").unwrap();
if let DocValuesAccumulator::Numeric(ref vals) = count_data.doc_values {
assert_len_eq_x!(&vals, 3);
assert_eq!(vals[0], (0, 0));
assert_eq!(vals[1], (1, 10));
assert_eq!(vals[2], (2, 20));
} else {
panic!("expected Numeric");
}
}
#[test]
#[should_panic(expected = "offset delta encoding requires")]
fn test_postings_array_with_offsets_panics() {
let mut pa = PostingsArray::new(true, true, true, false, false);
let tid = pa.add_term();
pa.record_occurrence(tid, 0, 0, 0, 5);
}
#[test]
fn test_indexing_chain_default() {
let chain = IndexingChain::default();
assert_eq!(chain.num_docs(), 0);
}
#[test]
fn test_doc_values_numeric_appends_to_existing() {
let mut chain = IndexingChain::new();
let analyzer = make_analyzer();
let mut doc = Document::new();
doc.add(document::numeric_doc_values_field("count", 10));
chain.process_document(doc, &analyzer).unwrap();
let mut doc = Document::new();
doc.add(document::numeric_doc_values_field("count", 20));
chain.process_document(doc, &analyzer).unwrap();
let data = chain.per_field().get("count").unwrap();
if let DocValuesAccumulator::Numeric(ref vals) = data.doc_values {
assert_eq!(vals, &[(0, 10), (1, 20)]);
} else {
panic!("expected Numeric");
}
}
#[test]
fn test_doc_values_binary_appends_to_existing() {
let mut chain = IndexingChain::new();
let analyzer = make_analyzer();
let mut doc = Document::new();
doc.add(document::binary_doc_values_field("payload", vec![1]));
chain.process_document(doc, &analyzer).unwrap();
let mut doc = Document::new();
doc.add(document::binary_doc_values_field("payload", vec![2]));
chain.process_document(doc, &analyzer).unwrap();
let data = chain.per_field().get("payload").unwrap();
if let DocValuesAccumulator::Binary(ref vals) = data.doc_values {
assert_len_eq_x!(&vals, 2);
assert_eq!(vals[1], (1, vec![2]));
} else {
panic!("expected Binary");
}
}
#[test]
fn test_doc_values_sorted_appends_to_existing() {
let mut chain = IndexingChain::new();
let analyzer = make_analyzer();
let mut doc = Document::new();
doc.add(document::sorted_doc_values_field("cat", b"a"));
chain.process_document(doc, &analyzer).unwrap();
let mut doc = Document::new();
doc.add(document::sorted_doc_values_field("cat", b"b"));
chain.process_document(doc, &analyzer).unwrap();
let data = chain.per_field().get("cat").unwrap();
if let DocValuesAccumulator::Sorted(ref vals) = data.doc_values {
assert_len_eq_x!(&vals, 2);
} else {
panic!("expected Sorted");
}
}
#[test]
fn test_doc_values_sorted_set_appends_to_existing() {
let mut chain = IndexingChain::new();
let analyzer = make_analyzer();
let mut doc = Document::new();
doc.add(document::sorted_set_doc_values_field("tag", "rust"));
chain.process_document(doc, &analyzer).unwrap();
let mut doc = Document::new();
doc.add(document::sorted_set_doc_values_field("tag", "java"));
chain.process_document(doc, &analyzer).unwrap();
let data = chain.per_field().get("tag").unwrap();
if let DocValuesAccumulator::SortedSet(ref vals) = data.doc_values {
assert_len_eq_x!(&vals, 2);
} else {
panic!("expected SortedSet");
}
}
#[test]
fn test_doc_values_sorted_numeric_appends_to_existing() {
let mut chain = IndexingChain::new();
let analyzer = make_analyzer();
let mut doc = Document::new();
doc.add(document::sorted_numeric_doc_values_field("ts", 100));
chain.process_document(doc, &analyzer).unwrap();
let mut doc = Document::new();
doc.add(document::sorted_numeric_doc_values_field("ts", 200));
chain.process_document(doc, &analyzer).unwrap();
let data = chain.per_field().get("ts").unwrap();
if let DocValuesAccumulator::SortedNumeric(ref vals) = data.doc_values {
assert_len_eq_x!(&vals, 2);
} else {
panic!("expected SortedNumeric");
}
}
#[test]
fn test_encode_norm_subnormal_and_negative() {
assert_eq!(encode_norm_value(0), 0);
assert_eq!(encode_norm_value(-1), 0);
for len in 1..=15 {
assert_ne!(encode_norm_value(len), 0);
}
}
#[test]
fn test_long_to_int4_subnormal() {
assert_eq!(long_to_int4(0), 0);
assert_eq!(long_to_int4(1), 1);
assert_eq!(long_to_int4(7), 7);
let normal = long_to_int4(8);
assert_gt!(normal, 7);
}
#[test]
fn test_process_doc_values_sorted_text_variant() {
let meta = FieldMeta {
number: 0,
index_options: IndexOptions::None,
doc_values_type: DocValuesType::Sorted,
omit_norms: true,
has_point_values: false,
};
let field1 = crate::document::Field::new(
"category".to_string(),
crate::document::FieldTypeBuilder::new().build(),
crate::document::FieldValue::Text("animals".to_string()),
);
let field2 = crate::document::Field::new(
"category".to_string(),
crate::document::FieldTypeBuilder::new().build(),
crate::document::FieldValue::Text("plants".to_string()),
);
let mut per_field = PerFieldData::new();
IndexingChain::process_doc_values(&mut per_field, &meta, &field1, 0);
IndexingChain::process_doc_values(&mut per_field, &meta, &field2, 1);
if let DocValuesAccumulator::Sorted(ref vals) = per_field.doc_values {
assert_len_eq_x!(&vals, 2);
assert_eq!(vals[0].1, BytesRef::from_utf8("animals"));
assert_eq!(vals[1].1, BytesRef::from_utf8("plants"));
} else {
panic!("expected Sorted");
}
}
#[test]
fn test_field_infos_have_per_field_codec_attributes() {
let mut chain = IndexingChain::new();
let analyzer = make_analyzer();
let mut doc = Document::new();
doc.add(document::keyword_field("path", "/foo.txt"));
doc.add(document::long_field("modified", 1000));
doc.add(document::text_field("contents", "hello world"));
chain.process_document(doc, &analyzer).unwrap();
let fis = chain.build_field_infos();
let path_fi = fis.field_info_by_name("path").unwrap();
assert_eq!(
path_fi.get_attribute("PerFieldPostingsFormat.format"),
Some("Lucene103"),
);
assert_eq!(
path_fi.get_attribute("PerFieldPostingsFormat.suffix"),
Some("0"),
);
let mod_fi = fis.field_info_by_name("modified").unwrap();
assert_eq!(
mod_fi.get_attribute("PerFieldDocValuesFormat.format"),
Some("Lucene90"),
);
assert!(
mod_fi
.get_attribute("PerFieldPostingsFormat.format")
.is_none(),
"non-indexed field should not have PerFieldPostingsFormat attribute"
);
let cont_fi = fis.field_info_by_name("contents").unwrap();
assert_eq!(
cont_fi.get_attribute("PerFieldPostingsFormat.format"),
Some("Lucene103"),
);
}
#[test]
fn test_encode_norm_value() {
assert_ne!(encode_norm_value(1), 0);
assert_ne!(encode_norm_value(10), 0);
assert_ne!(encode_norm_value(100), 0);
assert_eq!(encode_norm_value(0), 0);
}
#[test]
fn test_ram_bytes_used_increases_with_docs() {
let mut chain = IndexingChain::new();
let analyzer = make_analyzer();
let ram_empty = chain.ram_bytes_used();
let mut doc = Document::new();
doc.add(document::keyword_field("path", "/foo.txt"));
doc.add(document::long_field("modified", 1000));
doc.add(document::text_field("contents", "hello world"));
chain.process_document(doc, &analyzer).unwrap();
let ram_after_one = chain.ram_bytes_used();
assert!(
ram_after_one > ram_empty,
"RAM should grow after one doc: empty={ram_empty}, after_one={ram_after_one}"
);
for i in 0..10 {
let mut doc = Document::new();
doc.add(document::keyword_field("path", &format!("/{i}.txt")));
doc.add(document::long_field("modified", i as i64 * 100));
doc.add(document::text_field(
"contents",
&format!("document number {i} with some text content"),
));
chain.process_document(doc, &analyzer).unwrap();
}
let ram_after_many = chain.ram_bytes_used();
assert!(
ram_after_many > ram_after_one,
"RAM should grow with more docs: after_one={ram_after_one}, after_many={ram_after_many}"
);
}
#[test]
fn test_postings_array_byte_stream_compact() {
let mut pa = PostingsArray::new(true, true, false, false, false);
let tid = pa.add_term();
for doc_id in 0..10 {
pa.record_occurrence(tid, doc_id, 0, 0, 0);
pa.record_occurrence(tid, doc_id, 5, 0, 0);
}
pa.finalize_current_doc(tid);
let mut buf = PostingsBuffer::new();
pa.decode_into(tid, &mut buf).unwrap();
assert_len_eq_x!(&buf.doc_ids, 10);
for (i, &doc_id) in buf.doc_ids.iter().enumerate() {
assert_eq!(doc_id, i as i32);
assert_eq!(buf.freqs[i], 2);
}
let byte_len = pa.byte_stream_addrs[tid] as usize - pa.byte_stream_starts[tid] as usize;
let pos_len =
pa.positions_stream_addrs[tid] as usize - pa.positions_stream_starts[tid] as usize;
let total_bytes = byte_len + pos_len;
assert!(
total_bytes < 100,
"combined streams should be compact, got {total_bytes} bytes",
);
}
#[test]
fn test_process_int_field() {
let mut chain = IndexingChain::new();
let analyzer = make_analyzer();
let mut doc = Document::new();
doc.add(document::int_field("size", 42, true));
chain.process_document(doc, &analyzer).unwrap();
let size_data = chain.per_field().get("size").unwrap();
assert_len_eq_x!(&size_data.points, 1);
assert_len_eq_x!(&size_data.points[0].1, 4);
if let DocValuesAccumulator::SortedNumeric(ref vals) = size_data.doc_values {
assert_len_eq_x!(&vals, 1);
assert_eq!(vals[0].1, vec![42]);
} else {
panic!("expected SortedNumeric for IntField");
}
assert_len_eq_x!(&chain.stored_docs()[0].fields, 1);
}
#[test]
fn test_process_float_field() {
let mut chain = IndexingChain::new();
let analyzer = make_analyzer();
let mut doc = Document::new();
doc.add(document::float_field("score", 1.5, true));
chain.process_document(doc, &analyzer).unwrap();
let score_data = chain.per_field().get("score").unwrap();
assert_len_eq_x!(&score_data.points, 1);
assert_len_eq_x!(&score_data.points[0].1, 4);
if let DocValuesAccumulator::SortedNumeric(ref vals) = score_data.doc_values {
assert_len_eq_x!(&vals, 1);
let expected = crate::encoding::sortable_bytes::float_to_int(1.5) as i64;
assert_eq!(vals[0].1, vec![expected]);
} else {
panic!("expected SortedNumeric for FloatField");
}
}
#[test]
fn test_process_double_field() {
let mut chain = IndexingChain::new();
let analyzer = make_analyzer();
let mut doc = Document::new();
doc.add(document::double_field("rating", 9.87, false));
chain.process_document(doc, &analyzer).unwrap();
let rating_data = chain.per_field().get("rating").unwrap();
assert_len_eq_x!(&rating_data.points, 1);
assert_len_eq_x!(&rating_data.points[0].1, 8);
if let DocValuesAccumulator::SortedNumeric(ref vals) = rating_data.doc_values {
assert_len_eq_x!(&vals, 1);
let expected = crate::encoding::sortable_bytes::double_to_long(9.87);
assert_eq!(vals[0].1, vec![expected]);
} else {
panic!("expected SortedNumeric for DoubleField");
}
assert_is_empty!(chain.stored_docs()[0].fields);
}
#[test]
fn test_process_string_field() {
let mut chain = IndexingChain::new();
let analyzer = make_analyzer();
let mut doc = Document::new();
doc.add(document::string_field("title", "hello world", true));
chain.process_document(doc, &analyzer).unwrap();
chain.finalize_pending_postings();
let title_data = chain.per_field().get("title").unwrap();
assert_eq!(title_data.num_terms(), 1);
assert_some!(title_data.term_id("hello world"));
assert_matches!(title_data.doc_values, DocValuesAccumulator::None);
assert_len_eq_x!(&chain.stored_docs()[0].fields, 1);
}
#[test]
fn test_process_stored_only_fields() {
let mut chain = IndexingChain::new();
let analyzer = make_analyzer();
let mut doc = Document::new();
doc.add(document::stored_string_field("notes", "test"));
doc.add(document::stored_int_field("extra_int", 99));
doc.add(document::stored_float_field("extra_float", 1.5));
doc.add(document::stored_double_field("extra_double", 2.5));
chain.process_document(doc, &analyzer).unwrap();
assert_len_eq_x!(&chain.stored_docs()[0].fields, 4);
for field_name in &["notes", "extra_int", "extra_float", "extra_double"] {
let data = chain.per_field().get(*field_name).unwrap();
assert!(!data.has_postings());
assert_is_empty!(data.points);
assert_matches!(data.doc_values, DocValuesAccumulator::None);
}
}
#[test]
fn test_reader_field_produces_same_postings_as_text_field() {
let text = "the quick brown fox jumps over the lazy dog";
let mut chain_text = IndexingChain::new();
let analyzer = make_analyzer();
let mut doc = Document::new();
doc.add(document::text_field("contents", text));
chain_text.process_document(doc, &analyzer).unwrap();
chain_text.finalize_pending_postings();
let mut chain_reader = IndexingChain::new();
let mut doc = Document::new();
doc.add(document::text_field_reader(
"contents",
std::io::Cursor::new(text.as_bytes().to_vec()),
));
chain_reader.process_document(doc, &analyzer).unwrap();
chain_reader.finalize_pending_postings();
let pf_text = chain_text.per_field().get("contents").unwrap();
let pf_reader = chain_reader.per_field().get("contents").unwrap();
assert_eq!(pf_text.num_terms(), pf_reader.num_terms());
let mut buf_text = PostingsBuffer::new();
let mut buf_reader = PostingsBuffer::new();
for (term, tid_text) in pf_text.sorted_postings() {
let tid_reader = pf_reader.term_id(term).unwrap_or_else(|| {
panic!("reader chain missing term: {term}");
});
pf_text
.postings
.decode_into(tid_text, &mut buf_text)
.unwrap();
pf_reader
.postings
.decode_into(tid_reader, &mut buf_reader)
.unwrap();
assert_len_eq!(
&buf_text.doc_ids,
&buf_reader.doc_ids,
"doc count mismatch for term: {term}"
);
for i in 0..buf_text.doc_ids.len() {
assert_eq!(
buf_text.doc_ids[i], buf_reader.doc_ids[i],
"doc_id mismatch for term: {term}"
);
assert_eq!(
buf_text.freqs[i], buf_reader.freqs[i],
"freq mismatch for term: {term}"
);
}
assert_eq!(
buf_text.positions, buf_reader.positions,
"positions mismatch for term: {term}"
);
}
assert_eq!(pf_text.norms, pf_reader.norms);
}
#[test]
fn test_large_reader_field_multi_chunk() {
let text = "the quick brown fox jumps over the lazy dog ".repeat(1000);
assert_gt!(text.len(), 32_000);
let mut chain_text = IndexingChain::new();
let analyzer = make_analyzer();
let mut doc = Document::new();
doc.add(document::text_field("contents", &text));
chain_text.process_document(doc, &analyzer).unwrap();
chain_text.finalize_pending_postings();
let mut chain_reader = IndexingChain::new();
let mut doc = Document::new();
doc.add(document::text_field_reader(
"contents",
std::io::Cursor::new(text.as_bytes().to_vec()),
));
chain_reader.process_document(doc, &analyzer).unwrap();
chain_reader.finalize_pending_postings();
let pf_text = chain_text.per_field().get("contents").unwrap();
let pf_reader = chain_reader.per_field().get("contents").unwrap();
assert_eq!(pf_text.num_terms(), pf_reader.num_terms());
let mut buf_text = PostingsBuffer::new();
let mut buf_reader = PostingsBuffer::new();
for (term, tid_text) in pf_text.sorted_postings() {
let tid_reader = pf_reader.term_id(term).unwrap_or_else(|| {
panic!("reader chain missing term: {term}");
});
pf_text
.postings
.decode_into(tid_text, &mut buf_text)
.unwrap();
pf_reader
.postings
.decode_into(tid_reader, &mut buf_reader)
.unwrap();
assert_len_eq!(
&buf_text.doc_ids,
&buf_reader.doc_ids,
"doc count mismatch for term: {term}"
);
for i in 0..buf_text.doc_ids.len() {
assert_eq!(
buf_text.doc_ids[i], buf_reader.doc_ids[i],
"doc_id mismatch for term: {term}"
);
assert_eq!(
buf_text.freqs[i], buf_reader.freqs[i],
"freq mismatch for term: {term}"
);
}
assert_eq!(
buf_text.positions, buf_reader.positions,
"positions mismatch for term: {term}"
);
}
assert_eq!(pf_text.norms, pf_reader.norms);
}
#[test]
fn test_term_vectors_basic() {
let mut chain = IndexingChain::new();
let analyzer = make_analyzer();
let mut doc = Document::new();
doc.add(document::text_field_with_term_vectors(
"contents",
"hello world",
));
chain.process_document(doc, &analyzer).unwrap();
let tv_docs = chain.term_vector_docs();
assert_len_eq_x!(&tv_docs, 1);
let tv_doc = &tv_docs[0];
assert_len_eq_x!(&tv_doc.fields, 1);
let tv_field = &tv_doc.fields[0];
assert!(tv_field.has_positions);
assert!(tv_field.has_offsets);
assert!(!tv_field.has_payloads);
assert_len_eq_x!(&tv_field.terms, 2);
let hello = &tv_field.terms[0];
assert_eq!(hello.term, "hello");
assert_eq!(hello.freq, 1);
assert_eq!(hello.positions, vec![0]);
let hello_offsets = hello.offsets.as_ref().unwrap();
assert_eq!(hello_offsets.start_offsets, vec![0]);
assert_eq!(hello_offsets.end_offsets, vec![5]);
let world = &tv_field.terms[1];
assert_eq!(world.term, "world");
assert_eq!(world.freq, 1);
assert_eq!(world.positions, vec![1]);
let world_offsets = world.offsets.as_ref().unwrap();
assert_eq!(world_offsets.start_offsets, vec![6]);
assert_eq!(world_offsets.end_offsets, vec![11]);
}
#[test]
fn test_term_vectors_sorted_by_utf8_bytes() {
let mut chain = IndexingChain::new();
let analyzer = make_analyzer();
let mut doc = Document::new();
doc.add(document::text_field_with_term_vectors(
"contents",
"banana apple cherry",
));
chain.process_document(doc, &analyzer).unwrap();
let terms = &chain.term_vector_docs()[0].fields[0].terms;
let term_texts: Vec<&str> = terms.iter().map(|t| t.term.as_str()).collect();
assert_eq!(term_texts, vec!["apple", "banana", "cherry"]);
}
#[test]
fn test_term_vectors_repeated_term() {
let mut chain = IndexingChain::new();
let analyzer = make_analyzer();
let mut doc = Document::new();
doc.add(document::text_field_with_term_vectors(
"contents",
"hello world hello",
));
chain.process_document(doc, &analyzer).unwrap();
let terms = &chain.term_vector_docs()[0].fields[0].terms;
let hello = terms.iter().find(|t| t.term == "hello").unwrap();
assert_eq!(hello.freq, 2);
assert_eq!(hello.positions, vec![0, 2]);
let hello_offsets = hello.offsets.as_ref().unwrap();
assert_eq!(hello_offsets.start_offsets, vec![0, 12]);
assert_eq!(hello_offsets.end_offsets, vec![5, 17]);
}
#[test]
fn test_term_vectors_no_tv_fields_empty_doc() {
let mut chain = IndexingChain::new();
let analyzer = make_analyzer();
let mut doc = Document::new();
doc.add(document::text_field("contents", "hello world"));
chain.process_document(doc, &analyzer).unwrap();
let tv_docs = chain.term_vector_docs();
assert_len_eq_x!(&tv_docs, 1);
assert_is_empty!(tv_docs[0].fields);
}
#[test]
fn test_term_vectors_multi_doc_alignment() {
let mut chain = IndexingChain::new();
let analyzer = make_analyzer();
let mut doc = Document::new();
doc.add(document::text_field_with_term_vectors("contents", "hello"));
chain.process_document(doc, &analyzer).unwrap();
let mut doc = Document::new();
doc.add(document::text_field("contents", "world"));
chain.process_document(doc, &analyzer).unwrap();
let mut doc = Document::new();
doc.add(document::text_field_with_term_vectors("contents", "rust"));
chain.process_document(doc, &analyzer).unwrap();
let tv_docs = chain.term_vector_docs();
assert_len_eq_x!(&tv_docs, 3);
assert_len_eq_x!(&tv_docs[0].fields, 1);
assert_is_empty!(tv_docs[1].fields);
assert_len_eq_x!(&tv_docs[2].fields, 1);
}
#[test]
fn test_term_vectors_feature_field_excluded() {
let mut chain = IndexingChain::new();
let analyzer = make_analyzer();
let mut doc = Document::new();
doc.add(document::feature_field("features", "pagerank", 1.0));
chain.process_document(doc, &analyzer).unwrap();
let tv_docs = chain.term_vector_docs();
assert_len_eq_x!(&tv_docs, 1);
assert_is_empty!(tv_docs[0].fields);
}
#[test]
fn test_term_vectors_positions_only() {
let mut chain = IndexingChain::new();
let analyzer = make_analyzer();
let ft = crate::document::FieldTypeBuilder::new()
.index_options(IndexOptions::DocsAndFreqsAndPositions)
.tokenized(true)
.store_term_vectors(true)
.store_term_vector_positions(true)
.build();
let field = crate::document::Field::new(
"contents".to_string(),
ft,
crate::document::FieldValue::Text("hello world".to_string()),
);
let mut doc = Document::new();
doc.add(field);
chain.process_document(doc, &analyzer).unwrap();
let tv_field = &chain.term_vector_docs()[0].fields[0];
assert!(tv_field.has_positions);
assert!(!tv_field.has_offsets);
let hello = &tv_field.terms[0];
assert_eq!(hello.term, "hello");
assert_eq!(hello.positions, vec![0]);
assert_none!(hello.offsets);
}
#[test]
fn test_term_vectors_keyword_with_tv() {
let mut chain = IndexingChain::new();
let analyzer = make_analyzer();
let ft = crate::document::FieldTypeBuilder::new()
.index_options(IndexOptions::DocsAndFreqs)
.tokenized(false)
.omit_norms(true)
.store_term_vectors(true)
.store_term_vector_positions(true)
.store_term_vector_offsets(true)
.build();
let field = crate::document::Field::new(
"tag".to_string(),
ft,
crate::document::FieldValue::Text("/foo/bar.txt".to_string()),
);
let mut doc = Document::new();
doc.add(field);
chain.process_document(doc, &analyzer).unwrap();
let tv_field = &chain.term_vector_docs()[0].fields[0];
assert_len_eq_x!(&tv_field.terms, 1);
let term = &tv_field.terms[0];
assert_eq!(term.term, "/foo/bar.txt");
assert_eq!(term.freq, 1);
assert_eq!(term.positions, vec![0]);
let term_offsets = term.offsets.as_ref().unwrap();
assert_eq!(term_offsets.start_offsets, vec![0]);
assert_eq!(term_offsets.end_offsets, vec![12]);
}
#[test]
fn test_term_vectors_mixed_fields() {
let mut chain = IndexingChain::new();
let analyzer = make_analyzer();
let mut doc = Document::new();
doc.add(document::text_field_with_term_vectors("body", "hello"));
doc.add(document::text_field("title", "world"));
chain.process_document(doc, &analyzer).unwrap();
let tv_doc = &chain.term_vector_docs()[0];
assert_len_eq_x!(&tv_doc.fields, 1);
let body_fi = chain.build_field_infos();
let body_number = body_fi.field_info_by_name("body").unwrap().number();
assert_eq!(tv_doc.fields[0].field_number, body_number);
}
#[test]
fn measure_tv_ram_overhead() {
let analyzer = make_analyzer();
let text = "hello world foo bar baz alpha beta gamma delta epsilon zeta eta theta iota kappa lambda mu nu xi omicron pi rho sigma tau upsilon phi chi psi omega ".repeat(60);
for n in [1, 10, 50] {
let mut chain_no_tv = IndexingChain::new();
let mut chain_tv = IndexingChain::new();
for _ in 0..n {
let mut doc = Document::new();
doc.add(document::text_field("contents", &text));
chain_no_tv.process_document(doc, &analyzer).unwrap();
let mut doc2 = Document::new();
doc2.add(document::text_field_with_term_vectors("contents", &text));
chain_tv.process_document(doc2, &analyzer).unwrap();
}
let no_tv = chain_no_tv.ram_bytes_used();
let tv = chain_tv.ram_bytes_used();
eprintln!(
"{n:>3} docs: no_tv={no_tv:>8}, tv={tv:>8}, overhead={:>8} ({:.1}x)",
tv - no_tv,
tv as f64 / no_tv as f64
);
}
}
}