use std::collections::{BTreeSet, HashMap};
use std::io;
use std::mem;
use log::debug;
use crate::codecs::lucene90::doc_values_producer::DocValuesProducer;
use crate::codecs::lucene90::indexed_disi;
use crate::codecs::packed_writers::{DirectMonotonicWriter, DirectWriter};
use crate::codecs::{codec_footers, codec_headers};
use crate::document::DocValuesType;
use crate::encoding::lz4::{self, FastHashTable};
use crate::encoding::packed::unsigned_bits_required;
use crate::encoding::write_encoding::WriteEncoding;
use crate::index::FieldInfo;
use crate::index::index_file_names;
use crate::search::doc_id_set_iterator::NO_MORE_DOCS;
use crate::store::memory::MemoryIndexOutput;
use crate::store::{DataOutput, Directory, IndexOutput, VecOutput};
use crate::util::string_helper;
/// Buffered NUMERIC value for one document, held in memory until flush.
#[derive(mem_dbg::MemSize)]
#[mem_size(flat)]
pub(crate) struct NumericDocValue {
    pub doc_id: i32,
    pub value: i64,
}
/// Buffered BINARY value (raw bytes) for one document.
#[derive(mem_dbg::MemSize)]
pub(crate) struct BinaryDocValue {
    pub doc_id: i32,
    pub value: Vec<u8>,
}
/// Buffered SORTED value (raw term bytes) for one document.
#[derive(mem_dbg::MemSize)]
pub(crate) struct SortedDocValue {
    pub doc_id: i32,
    pub value: Vec<u8>,
}
/// Buffered SORTED_NUMERIC values for one document (order as added;
/// they are sorted at write time, see `add_sorted_numeric_field`).
#[derive(mem_dbg::MemSize)]
pub(crate) struct SortedNumericDocValue {
    pub doc_id: i32,
    pub values: Vec<i64>,
}
/// Buffered SORTED_SET values (raw term bytes) for one document.
#[derive(mem_dbg::MemSize)]
pub(crate) struct SortedSetDocValue {
    pub doc_id: i32,
    pub values: Vec<Vec<u8>>,
}
/// In-memory accumulation of one field's doc values — one variant per
/// `DocValuesType`.
#[derive(mem_dbg::MemSize)]
pub(crate) enum DocValuesAccumulator {
    Numeric(Vec<NumericDocValue>),
    Binary(Vec<BinaryDocValue>),
    Sorted(Vec<SortedDocValue>),
    SortedNumeric(Vec<SortedNumericDocValue>),
    SortedSet(Vec<SortedSetDocValue>),
}
/// A field's identity plus its buffered values, as consumed by `write`.
pub(crate) struct DocValuesFieldData {
    pub name: String,
    pub number: u32,
    pub doc_values_type: DocValuesType,
    pub doc_values: DocValuesAccumulator,
}
/// Extension of the doc-values data file.
pub(crate) const DATA_EXTENSION: &str = "dvd";
/// Extension of the doc-values metadata file.
pub(crate) const META_EXTENSION: &str = "dvm";
/// Codec name written into the data file's index header.
pub(crate) const DATA_CODEC: &str = "Lucene90DocValuesData";
/// Codec name written into the metadata file's index header.
pub(crate) const META_CODEC: &str = "Lucene90DocValuesMetadata";
/// On-disk format version.
pub(crate) const VERSION: i32 = 0;
// Per-field type tag bytes written right after the field number in meta.
pub(crate) const NUMERIC: u8 = 0;
pub(crate) const BINARY: u8 = 1;
pub(crate) const SORTED: u8 = 2;
pub(crate) const SORTED_SET: u8 = 3;
pub(crate) const SORTED_NUMERIC: u8 = 4;
// Terms dictionary blocking: 2^6 = 64 terms per LZ4-compressed block.
const TERMS_DICT_BLOCK_LZ4_SHIFT: usize = 6;
const TERMS_DICT_BLOCK_LZ4_SIZE: usize = 1 << TERMS_DICT_BLOCK_LZ4_SHIFT;
const TERMS_DICT_BLOCK_LZ4_MASK: usize = TERMS_DICT_BLOCK_LZ4_SIZE - 1;
// Reverse term index: one entry every 2^10 = 1024 terms.
const TERMS_DICT_REVERSE_INDEX_SHIFT: i32 = 10;
const TERMS_DICT_REVERSE_INDEX_SIZE: usize = 1 << TERMS_DICT_REVERSE_INDEX_SHIFT;
const TERMS_DICT_REVERSE_INDEX_MASK: usize = TERMS_DICT_REVERSE_INDEX_SIZE - 1;
/// Block shift shared by all DirectMonotonic address tables in this format.
pub(crate) const DIRECT_MONOTONIC_BLOCK_SHIFT: u32 = 16;
pub(crate) fn write(
directory: &dyn Directory,
segment_name: &str,
segment_suffix: &str,
segment_id: &[u8; 16],
field_infos: &[&FieldInfo],
producer: &dyn DocValuesProducer,
num_docs: i32,
) -> io::Result<Vec<String>> {
let dvm_name =
index_file_names::segment_file_name(segment_name, segment_suffix, META_EXTENSION);
let dvd_name =
index_file_names::segment_file_name(segment_name, segment_suffix, DATA_EXTENSION);
let mut meta = directory.create_output(&dvm_name)?;
let mut data = directory.create_output(&dvd_name)?;
codec_headers::write_index_header(&mut *meta, META_CODEC, VERSION, segment_id, segment_suffix)?;
codec_headers::write_index_header(&mut *data, DATA_CODEC, VERSION, segment_id, segment_suffix)?;
for &field_info in field_infos {
meta.write_le_int(field_info.number() as i32)?;
match field_info.doc_values_type() {
DocValuesType::Numeric => {
meta.write_byte(NUMERIC)?;
let vals = collect_numeric(producer, field_info)?;
debug!(
"doc_values: field={:?} (#{}) -> NUMERIC, {} docs",
field_info.name(),
field_info.number(),
vals.len()
);
add_numeric_field(&mut *meta, &mut *data, &vals, num_docs)?;
}
DocValuesType::Binary => {
meta.write_byte(BINARY)?;
let vals = collect_binary(producer, field_info)?;
debug!(
"doc_values: field={:?} (#{}) -> BINARY, {} docs",
field_info.name(),
field_info.number(),
vals.len()
);
add_binary_field(&mut *meta, &mut *data, &vals, num_docs)?;
}
DocValuesType::Sorted => {
meta.write_byte(SORTED)?;
debug!(
"doc_values: field={:?} (#{}) -> SORTED",
field_info.name(),
field_info.number(),
);
add_sorted_field(&mut *meta, &mut *data, producer, field_info, num_docs)?;
}
DocValuesType::SortedNumeric => {
meta.write_byte(SORTED_NUMERIC)?;
let vals = collect_sorted_numeric(producer, field_info)?;
debug!(
"doc_values: field={:?} (#{}) -> SORTED_NUMERIC, {} docs",
field_info.name(),
field_info.number(),
vals.len()
);
add_sorted_numeric_field(&mut *meta, &mut *data, &vals, num_docs)?;
}
DocValuesType::SortedSet => {
meta.write_byte(SORTED_SET)?;
debug!(
"doc_values: field={:?} (#{}) -> SORTED_SET",
field_info.name(),
field_info.number(),
);
add_sorted_set_field(&mut *meta, &mut *data, producer, field_info, num_docs)?;
}
DocValuesType::None => continue,
}
}
meta.write_le_int(-1)?;
codec_footers::write_footer(&mut *meta)?;
codec_footers::write_footer(&mut *data)?;
Ok(vec![dvm_name, dvd_name])
}
/// Drains the numeric doc-values iterator for `field_info` into an owned
/// `(doc_id, value)` list. Returns an empty vec when the producer has no
/// numeric values for this field.
fn collect_numeric(
    producer: &dyn DocValuesProducer,
    field_info: &FieldInfo,
) -> io::Result<Vec<NumericDocValue>> {
    let mut iter = match producer.get_numeric(field_info)? {
        Some(it) => it,
        None => return Ok(Vec::new()),
    };
    let mut collected = Vec::new();
    let mut doc = iter.next_doc()?;
    while doc != NO_MORE_DOCS {
        let value = iter.long_value()?;
        collected.push(NumericDocValue { doc_id: doc, value });
        doc = iter.next_doc()?;
    }
    Ok(collected)
}
/// Drains the binary doc-values iterator for `field_info`, copying each
/// document's bytes into an owned buffer. Empty vec when the field has no
/// binary values.
fn collect_binary(
    producer: &dyn DocValuesProducer,
    field_info: &FieldInfo,
) -> io::Result<Vec<BinaryDocValue>> {
    let mut iter = match producer.get_binary(field_info)? {
        Some(it) => it,
        None => return Ok(Vec::new()),
    };
    let mut collected = Vec::new();
    let mut doc = iter.next_doc()?;
    while doc != NO_MORE_DOCS {
        let value = iter.binary_value()?.to_vec();
        collected.push(BinaryDocValue { doc_id: doc, value });
        doc = iter.next_doc()?;
    }
    Ok(collected)
}
/// Drains the sorted-numeric doc-values iterator for `field_info`,
/// collecting every document's full value list. Empty vec when the field
/// has no sorted-numeric values.
fn collect_sorted_numeric(
    producer: &dyn DocValuesProducer,
    field_info: &FieldInfo,
) -> io::Result<Vec<SortedNumericDocValue>> {
    let mut iter = match producer.get_sorted_numeric(field_info)? {
        Some(it) => it,
        None => return Ok(Vec::new()),
    };
    let mut collected = Vec::new();
    let mut doc = iter.next_doc()?;
    while doc != NO_MORE_DOCS {
        let count = iter.doc_value_count();
        let mut values = Vec::with_capacity(count as usize);
        for _ in 0..count {
            values.push(iter.next_value()?);
        }
        collected.push(SortedNumericDocValue { doc_id: doc, values });
        doc = iter.next_doc()?;
    }
    Ok(collected)
}
/// Writes a NUMERIC doc-values entry by splitting the buffered pairs into a
/// doc-id list and a value list and delegating to `write_values`
/// (non-ordinal mode).
fn add_numeric_field(
    meta: &mut dyn DataOutput,
    data: &mut dyn IndexOutput,
    vals: &[NumericDocValue],
    num_docs: i32,
) -> io::Result<()> {
    let mut doc_ids = Vec::with_capacity(vals.len());
    let mut values = Vec::with_capacity(vals.len());
    for entry in vals {
        doc_ids.push(entry.doc_id);
        values.push(entry.value);
    }
    write_values(
        meta,
        data,
        &values,
        &doc_ids,
        vals.len() as i32,
        num_docs,
        false,
    )
    .map(|_| ())
}
/// Writes a BINARY doc-values field: the concatenated raw bytes in `data`,
/// then the docs-with-field set (DISI bit set, or the -2 "no docs" /
/// -1 "all docs" sentinels), the length statistics, and — only when values
/// vary in length — a DirectMonotonic table of per-document start offsets.
///
/// Fix: dropped the needless `mut` on the `meta` binding (it is never
/// reassigned nor mutably re-borrowed as a binding; triggers `unused_mut`).
fn add_binary_field(
    meta: &mut dyn IndexOutput,
    data: &mut dyn IndexOutput,
    vals: &[BinaryDocValue],
    num_docs: i32,
) -> io::Result<()> {
    let start = data.file_pointer() as i64;
    meta.write_le_long(start)?;
    let mut min_length = i32::MAX;
    let mut max_length = 0i32;
    for entry in vals {
        let len = entry.value.len() as i32;
        min_length = min_length.min(len);
        max_length = max_length.max(len);
        data.write_all(&entry.value)?;
    }
    let num_docs_with_field = vals.len() as i32;
    if num_docs_with_field == 0 {
        // No values: report 0, not the i32::MAX the fold started from.
        min_length = 0;
    }
    meta.write_le_long(data.file_pointer() as i64 - start)?;
    if num_docs_with_field == 0 {
        // Sentinel -2: no document has a value; no DISI block is written.
        meta.write_le_long(-2)?;
        meta.write_le_long(0)?;
        meta.write_le_short(-1)?;
        meta.write_byte(0xFF)?;
    } else if num_docs_with_field == num_docs {
        // Sentinel -1: every document has a value (dense case).
        meta.write_le_long(-1)?;
        meta.write_le_long(0)?;
        meta.write_le_short(-1)?;
        meta.write_byte(0xFF)?;
    } else {
        // Sparse case: serialize the doc-id set as an IndexedDISI bit set.
        let doc_ids: Vec<i32> = vals.iter().map(|entry| entry.doc_id).collect();
        let disi_offset = data.file_pointer() as i64;
        meta.write_le_long(disi_offset)?;
        let jump_table_entry_count = indexed_disi::write_bit_set(&doc_ids, num_docs, &mut *data)?;
        meta.write_le_long(data.file_pointer() as i64 - disi_offset)?;
        meta.write_le_short(jump_table_entry_count)?;
        meta.write_byte(indexed_disi::DEFAULT_DENSE_RANK_POWER as u8)?;
    }
    meta.write_le_int(num_docs_with_field)?;
    meta.write_le_int(min_length)?;
    meta.write_le_int(max_length)?;
    if max_length > min_length {
        // Variable-length values need an address table: num-values + 1
        // monotonically increasing offsets into the raw byte region.
        let addresses_start = data.file_pointer() as i64;
        meta.write_le_long(addresses_start)?;
        meta.write_vint(DIRECT_MONOTONIC_BLOCK_SHIFT as i32)?;
        let mut address_buffer = MemoryIndexOutput::new("temp_binary_addr".to_string());
        let mut dm_writer = DirectMonotonicWriter::new(DIRECT_MONOTONIC_BLOCK_SHIFT);
        let mut cumulative: i64 = 0;
        for entry in vals {
            dm_writer.add(cumulative);
            cumulative += entry.value.len() as i64;
        }
        dm_writer.add(cumulative);
        dm_writer.finish(meta, &mut address_buffer)?;
        data.write_all(address_buffer.bytes())?;
        meta.write_le_long(data.file_pointer() as i64 - addresses_start)?;
    }
    Ok(())
}
/// Writes a SORTED doc-values field: the per-document ordinal stream (via
/// `write_values` in ordinal mode) followed by the term dictionary in
/// ordinal order. A missing producer iterator writes an empty stream and an
/// empty dictionary.
fn add_sorted_field(
    meta: &mut dyn IndexOutput,
    data: &mut dyn IndexOutput,
    producer: &dyn DocValuesProducer,
    field_info: &FieldInfo,
    num_docs: i32,
) -> io::Result<()> {
    let mut dv = match producer.get_sorted(field_info)? {
        Some(dv) => dv,
        None => {
            write_values(meta, data, &[], &[], 0, num_docs, true)?;
            return add_terms_dict(meta, data, &[]);
        }
    };
    // Materialize the term dictionary in ordinal order.
    let count = dv.value_count();
    let mut terms: Vec<Vec<u8>> = Vec::with_capacity(count as usize);
    for ord in 0..count {
        terms.push(dv.lookup_ord(ord)?.to_vec());
    }
    // Collect each present document's id and its term ordinal.
    let mut doc_ids = Vec::new();
    let mut ordinals = Vec::new();
    let mut doc = dv.next_doc()?;
    while doc != NO_MORE_DOCS {
        doc_ids.push(doc);
        ordinals.push(dv.ord_value()? as i64);
        doc = dv.next_doc()?;
    }
    write_values(
        meta,
        data,
        &ordinals,
        &doc_ids,
        doc_ids.len() as i32,
        num_docs,
        true,
    )?;
    let refs: Vec<&[u8]> = terms.iter().map(Vec::as_slice).collect();
    add_terms_dict(meta, data, &refs)
}
/// Writes a SORTED_NUMERIC field: each document's values sorted ascending,
/// flattened into a single stream for `write_values`, plus — when any
/// document holds more than one value — a DirectMonotonic table mapping a
/// document's dense index to the start of its value run.
///
/// Fixes: dropped the needless `mut` on the `meta` binding (`unused_mut`),
/// and switched the per-document sort to `sort_unstable` (stability is
/// irrelevant for plain `i64`s and it avoids the merge-sort allocation).
fn add_sorted_numeric_field(
    meta: &mut dyn IndexOutput,
    data: &mut dyn IndexOutput,
    vals: &[SortedNumericDocValue],
    num_docs: i32,
) -> io::Result<()> {
    // Sort copies so the caller's buffered data stays untouched.
    let mut sorted_vals: Vec<SortedNumericDocValue> = vals
        .iter()
        .map(|entry| SortedNumericDocValue {
            doc_id: entry.doc_id,
            values: entry.values.clone(),
        })
        .collect();
    for entry in sorted_vals.iter_mut() {
        entry.values.sort_unstable();
    }
    let doc_ids: Vec<i32> = sorted_vals.iter().map(|entry| entry.doc_id).collect();
    let all_values: Vec<i64> = sorted_vals
        .iter()
        .flat_map(|entry| entry.values.iter().copied())
        .collect();
    let num_docs_with_value = sorted_vals.len() as i32;
    let (num_docs_with_field, num_values) = write_values(
        meta,
        data,
        &all_values,
        &doc_ids,
        num_docs_with_value,
        num_docs,
        false,
    )?;
    meta.write_le_int(num_docs_with_field)?;
    if num_values > num_docs_with_field as i64 {
        // At least one document is multi-valued: write the address table
        // (num-docs-with-field + 1 cumulative counts).
        let addresses_start = data.file_pointer() as i64;
        meta.write_le_long(addresses_start)?;
        meta.write_vint(DIRECT_MONOTONIC_BLOCK_SHIFT as i32)?;
        let mut address_buffer = MemoryIndexOutput::new("temp_sn_addr".to_string());
        let mut dm_writer = DirectMonotonicWriter::new(DIRECT_MONOTONIC_BLOCK_SHIFT);
        let mut cumulative: i64 = 0;
        for entry in &sorted_vals {
            dm_writer.add(cumulative);
            cumulative += entry.values.len() as i64;
        }
        dm_writer.add(cumulative);
        dm_writer.finish(meta, &mut address_buffer)?;
        data.write_all(address_buffer.bytes())?;
        meta.write_le_long(data.file_pointer() as i64 - addresses_start)?;
    }
    Ok(())
}
/// Writes a SORTED_SET field. A leading mode byte selects the layout:
/// 0 = every document has at most one ordinal (stored like SORTED),
/// 1 = multi-valued (flattened ordinal stream plus a per-document
/// DirectMonotonic address table). The terms dictionary follows in both
/// cases.
///
/// Fix: dropped the needless `mut` on the `meta` binding (`unused_mut`).
fn add_sorted_set_field(
    meta: &mut dyn IndexOutput,
    data: &mut dyn IndexOutput,
    producer: &dyn DocValuesProducer,
    field_info: &FieldInfo,
    num_docs: i32,
) -> io::Result<()> {
    let mut sorted_set_dv = match producer.get_sorted_set(field_info)? {
        Some(dv) => dv,
        None => {
            // No values: single-valued layout with an empty ordinal stream
            // and an empty terms dictionary.
            meta.write_byte(0)?;
            write_values(meta, data, &[], &[], 0, num_docs, true)?;
            add_terms_dict(meta, data, &[])?;
            return Ok(());
        }
    };
    // Materialize the term dictionary in ordinal order.
    let value_count = sorted_set_dv.value_count();
    let mut sorted_terms: Vec<Vec<u8>> = Vec::with_capacity(value_count as usize);
    for ord in 0..value_count {
        sorted_terms.push(sorted_set_dv.lookup_ord(ord)?.to_vec());
    }
    let term_refs: Vec<&[u8]> = sorted_terms.iter().map(|t| t.as_slice()).collect();
    // Collect per-document ordinal runs, tracking whether any document is
    // multi-valued (that decides the layout below).
    let mut is_single_valued = true;
    let mut ord_vals: Vec<(i32, Vec<i64>)> = Vec::new();
    loop {
        let doc = sorted_set_dv.next_doc()?;
        if doc == NO_MORE_DOCS {
            break;
        }
        let count = sorted_set_dv.doc_value_count();
        if count > 1 {
            is_single_valued = false;
        }
        let mut ords = Vec::with_capacity(count as usize);
        for _ in 0..count {
            ords.push(sorted_set_dv.next_ord()?);
        }
        ord_vals.push((doc, ords));
    }
    if is_single_valued {
        meta.write_byte(0)?;
        let doc_ids: Vec<i32> = ord_vals.iter().map(|(doc_id, _)| *doc_id).collect();
        let ordinals: Vec<i64> = ord_vals.iter().map(|(_, ords)| ords[0]).collect();
        write_values(
            meta,
            data,
            &ordinals,
            &doc_ids,
            doc_ids.len() as i32,
            num_docs,
            true,
        )?;
        add_terms_dict(meta, data, &term_refs)?;
    } else {
        meta.write_byte(1)?;
        let doc_ids: Vec<i32> = ord_vals.iter().map(|(doc_id, _)| *doc_id).collect();
        let all_ordinals: Vec<i64> = ord_vals
            .iter()
            .flat_map(|(_doc_id, ords)| ords.iter().copied())
            .collect();
        let num_docs_with_value = ord_vals.len() as i32;
        let (num_docs_with_field, num_values) = write_values(
            meta,
            data,
            &all_ordinals,
            &doc_ids,
            num_docs_with_value,
            num_docs,
            true,
        )?;
        meta.write_le_int(num_docs_with_field)?;
        if num_values > num_docs_with_field as i64 {
            // Address table mapping dense doc index -> start of its run.
            let addresses_start = data.file_pointer() as i64;
            meta.write_le_long(addresses_start)?;
            meta.write_vint(DIRECT_MONOTONIC_BLOCK_SHIFT as i32)?;
            let mut address_buffer = MemoryIndexOutput::new("temp_ss_addr".to_string());
            let mut dm_writer = DirectMonotonicWriter::new(DIRECT_MONOTONIC_BLOCK_SHIFT);
            let mut cumulative: i64 = 0;
            for (_doc_id, ords) in &ord_vals {
                dm_writer.add(cumulative);
                cumulative += ords.len() as i64;
            }
            dm_writer.add(cumulative);
            dm_writer.finish(meta, &mut address_buffer)?;
            data.write_all(address_buffer.bytes())?;
            meta.write_le_long(data.file_pointer() as i64 - addresses_start)?;
        }
        add_terms_dict(meta, data, &term_refs)?;
    }
    Ok(())
}
/// Core encoder shared by NUMERIC / SORTED_NUMERIC value streams and
/// SORTED / SORTED_SET ordinal streams (`ords == true`).
///
/// Metadata layout (little-endian): docs-with-field DISI region — offset,
/// length, jump-table entry count, dense-rank power, or the -2 ("no docs") /
/// -1 ("all docs") sentinel forms — then numValues, an optional decode
/// table, bits-per-value, min, gcd, data offset, data length, and a
/// trailing -1 marking the absent value jump table.
///
/// Returns `(num_docs_with_value, num_values)` so multi-valued callers can
/// decide whether a per-document address table is needed.
fn write_values(
    meta: &mut dyn DataOutput,
    data: &mut dyn IndexOutput,
    all_values: &[i64],
    doc_ids: &[i32],
    num_docs_with_value: i32,
    max_doc: i32,
    ords: bool,
) -> io::Result<(i32, i64)> {
    let num_values = all_values.len() as i64;
    if num_docs_with_value == 0 {
        // Sentinel -2: no document has a value; no DISI block is written.
        meta.write_le_long(-2)?;
        meta.write_le_long(0)?;
        meta.write_le_short(-1)?;
        meta.write_byte(0xFF)?;
    } else if num_docs_with_value == max_doc {
        // Sentinel -1: every document has a value (dense case).
        meta.write_le_long(-1)?;
        meta.write_le_long(0)?;
        meta.write_le_short(-1)?;
        meta.write_byte(0xFF)?;
    } else {
        // Sparse case: serialize the doc-id set as an IndexedDISI bit set.
        let offset = data.file_pointer() as i64;
        meta.write_le_long(offset)?;
        let jump_table_entry_count = indexed_disi::write_bit_set(doc_ids, max_doc, &mut *data)?;
        meta.write_le_long(data.file_pointer() as i64 - offset)?;
        meta.write_le_short(jump_table_entry_count)?;
        meta.write_byte(indexed_disi::DEFAULT_DENSE_RANK_POWER as u8)?;
    }
    meta.write_le_long(num_values)?;
    if num_values == 0 {
        // Empty stream: table size -1, 0 bits per value, min 0, gcd 0,
        // a zero-length data region, and -1 for the value jump table.
        meta.write_le_int(-1)?;
        meta.write_byte(0)?;
        meta.write_le_long(0)?;
        meta.write_le_long(0)?;
        meta.write_le_long(data.file_pointer() as i64)?;
        meta.write_le_long(0)?;
        meta.write_le_long(-1)?;
        return Ok((0, 0));
    }
    let first_value = all_values[0];
    let mut min = all_values[0];
    let mut max = all_values[0];
    let mut gcd: i64 = 0;
    // Distinct values are tracked (for table encoding) only for plain
    // numerics; ordinal streams are dense, so a table could never win.
    let mut unique_values: Option<BTreeSet<i64>> = if ords { None } else { Some(BTreeSet::new()) };
    for &v in all_values {
        min = min.min(v);
        max = max.max(v);
        if gcd != 1 {
            // Give up on GCD compression for values outside [MIN/2, MAX/2]
            // so the `v - first_value` delta cannot overflow i64.
            if !(i64::MIN / 2..=i64::MAX / 2).contains(&v) {
                gcd = 1;
            } else {
                gcd = gcd_compute(gcd, v - first_value);
            }
        }
        if let Some(ref mut set) = unique_values {
            set.insert(v);
            if set.len() > 256 {
                // Too many distinct values for a byte-indexable table.
                unique_values = None;
            }
        }
    }
    if ords && num_values > 0 {
        // Ordinal streams are 0-based and dense by construction.
        assert!(min == 0, "min value for ordinals should be 0, got {}", min);
        if max != 0 {
            assert!(gcd == 1, "GCD on ordinals should be 1, got {}", gcd);
        }
    }
    let num_bits_per_value: u32;
    let mut encode_table: Option<HashMap<i64, i64>> = None;
    if min >= max {
        // Single distinct value: nothing is stored per value (min carries it).
        num_bits_per_value = 0;
        meta.write_le_int(-1)?;
    } else if let Some(ref uv) = unique_values {
        if uv.len() > 1 {
            // Pick the cheaper of table encoding (indices into a sorted
            // value table) and delta/GCD encoding.
            let table_bpv = unsigned_bits_required(uv.len() as i64 - 1);
            let delta_bpv = unsigned_bits_required((max - min) / gcd);
            if table_bpv < delta_bpv {
                num_bits_per_value = table_bpv;
                let sorted_unique: Vec<i64> = uv.iter().copied().collect();
                meta.write_le_int(sorted_unique.len() as i32)?;
                for &v in &sorted_unique {
                    meta.write_le_long(v)?;
                }
                let mut enc = HashMap::new();
                for (i, &v) in sorted_unique.iter().enumerate() {
                    enc.insert(v, i as i64);
                }
                encode_table = Some(enc);
                // Table indices are written as-is: neutralize min/gcd.
                min = 0;
                gcd = 1;
            } else {
                num_bits_per_value = delta_bpv;
                meta.write_le_int(-1)?;
                // Dropping the min offset is free when it does not change
                // the bit width; a zero min simplifies decoding.
                if gcd == 1
                    && min > 0
                    && unsigned_bits_required(max) == unsigned_bits_required(max - min)
                {
                    min = 0;
                }
            }
        } else {
            num_bits_per_value = unsigned_bits_required((max - min) / gcd);
            meta.write_le_int(-1)?;
        }
    } else {
        num_bits_per_value = unsigned_bits_required((max - min) / gcd);
        meta.write_le_int(-1)?;
        if gcd == 1 && min > 0 && unsigned_bits_required(max) == unsigned_bits_required(max - min) {
            min = 0;
        }
    }
    meta.write_byte(num_bits_per_value as u8)?;
    meta.write_le_long(min)?;
    meta.write_le_long(gcd)?;
    let start_offset = data.file_pointer() as i64;
    meta.write_le_long(start_offset)?;
    if num_bits_per_value > 0 {
        // Bit-pack either table indices or (v - min) / gcd deltas.
        let mut writer = DirectWriter::new(num_bits_per_value);
        if let Some(ref enc) = encode_table {
            for &v in all_values {
                writer.add(enc[&v]);
            }
        } else {
            for &v in all_values {
                writer.add((v - min) / gcd);
            }
        }
        writer.finish(data)?;
    }
    let values_length = data.file_pointer() as i64 - start_offset;
    meta.write_le_long(values_length)?;
    // No per-block value jump table is written (-1 marker).
    meta.write_le_long(-1)?;
    Ok((num_docs_with_value, num_values))
}
/// Binary GCD (Stein's algorithm) of `|a|` and `|b|`; `gcd_compute(0, 0)`
/// is 0, and `gcd_compute(x, 0) == |x|`.
///
/// The computation runs on `u64` magnitudes (`unsigned_abs`), which makes
/// every right shift logical and the subtraction non-overflowing. The
/// previous signed implementation used arithmetic shifts on `i64` and could
/// loop forever for inputs involving `i64::MIN` — e.g.
/// `gcd_compute(i64::MIN, 3)` cycled between -1 and -4 and never
/// terminated. Results are unchanged for every input the old code handled.
fn gcd_compute(a: i64, b: i64) -> i64 {
    let mut a = a.unsigned_abs();
    let mut b = b.unsigned_abs();
    if a == 0 {
        return b as i64;
    }
    if b == 0 {
        return a as i64;
    }
    // Power of two dividing both operands; reapplied at the end.
    let shift = (a | b).trailing_zeros();
    a >>= a.trailing_zeros();
    loop {
        // Invariant: `a` is odd.
        b >>= b.trailing_zeros();
        if a == b {
            break;
        }
        if a > b {
            mem::swap(&mut a, &mut b);
        }
        b -= a;
    }
    // Only both-inputs-i64::MIN can produce 1 << 63, which wraps back to
    // i64::MIN — matching the old behavior for that degenerate case.
    (a << shift) as i64
}
/// Writes the prefix-compressed, LZ4-compressed terms dictionary for a
/// SORTED / SORTED_SET field, then its block-address table and the reverse
/// term index.
///
/// Terms must arrive in ascending byte order. Every 64th term
/// (`TERMS_DICT_BLOCK_LZ4_SIZE`) starts a block: the block leader is stored
/// verbatim and doubles as the LZ4 dictionary for that block; the remaining
/// terms store (shared-prefix length, suffix) pairs which are LZ4-compressed
/// when the block is flushed.
///
/// Fix: dropped the needless `mut` on the `meta` and `data` bindings
/// (`unused_mut`; the identical call pattern in `write_terms_index` already
/// compiles without them).
fn add_terms_dict(
    meta: &mut dyn IndexOutput,
    data: &mut dyn IndexOutput,
    sorted_terms: &[&[u8]],
) -> io::Result<()> {
    let size = sorted_terms.len() as i64;
    meta.write_vlong(size)?;
    let block_mask = TERMS_DICT_BLOCK_LZ4_MASK;
    meta.write_le_int(DIRECT_MONOTONIC_BLOCK_SHIFT as i32)?;
    let mut address_buffer = MemoryIndexOutput::new("temp_addr".to_string());
    let mut dm_writer = DirectMonotonicWriter::new(DIRECT_MONOTONIC_BLOCK_SHIFT);
    let mut previous: &[u8] = &[];
    let mut max_length: i32 = 0;
    let mut max_block_length: i32 = 0;
    let start = data.file_pointer() as i64;
    let mut suffix_buffer: Vec<u8> = Vec::new();
    let mut dict_bytes: Vec<u8> = Vec::new();
    let mut lz4_ht = FastHashTable::new();
    for (ord, term) in sorted_terms.iter().enumerate() {
        if (ord & block_mask) == 0 {
            // Block boundary: flush the previous block's suffixes, then
            // record the new block's address and write its leader verbatim.
            if ord != 0 {
                let uncompressed_length =
                    compress_and_write_terms_block(data, &dict_bytes, &suffix_buffer, &mut lz4_ht)?;
                max_block_length = max_block_length.max(uncompressed_length as i32);
                suffix_buffer.clear();
            }
            dm_writer.add((data.file_pointer() as i64) - start);
            data.write_vint(term.len() as i32)?;
            data.write_all(term)?;
            dict_bytes = term.to_vec();
        } else {
            // Shared-prefix encoding against the previous term: low nibble =
            // prefix length, high nibble = suffix length - 1, each escaping
            // to a trailing vint when it exceeds the nibble range.
            let prefix_length = string_helper::bytes_difference(previous, term);
            let suffix_length = term.len() - prefix_length;
            assert!(suffix_length > 0, "duplicate terms in sorted set");
            let byte = (prefix_length.min(15) as u8)
                | ((suffix_length.saturating_sub(1).min(15) as u8) << 4);
            suffix_buffer.push(byte);
            if prefix_length >= 15 {
                VecOutput(&mut suffix_buffer).write_vint((prefix_length - 15) as i32)?;
            }
            if suffix_length >= 16 {
                VecOutput(&mut suffix_buffer).write_vint((suffix_length - 16) as i32)?;
            }
            suffix_buffer.extend_from_slice(&term[prefix_length..]);
        }
        max_length = max_length.max(term.len() as i32);
        previous = term;
    }
    // Flush the trailing, possibly partial block.
    if !suffix_buffer.is_empty() {
        let uncompressed_length =
            compress_and_write_terms_block(data, &dict_bytes, &suffix_buffer, &mut lz4_ht)?;
        max_block_length = max_block_length.max(uncompressed_length as i32);
    }
    dm_writer.finish(meta, &mut address_buffer)?;
    meta.write_le_int(max_length)?;
    meta.write_le_int(max_block_length)?;
    meta.write_le_long(start)?;
    meta.write_le_long(data.file_pointer() as i64 - start)?;
    let addr_start = data.file_pointer() as i64;
    data.write_all(address_buffer.bytes())?;
    meta.write_le_long(addr_start)?;
    meta.write_le_long(data.file_pointer() as i64 - addr_start)?;
    write_terms_index(meta, data, sorted_terms)?;
    Ok(())
}
/// LZ4-compresses one terms-dictionary block and writes it to `data`,
/// preceded by the block's uncompressed length as a vint. Returns that
/// uncompressed length so the caller can track the maximum.
///
/// The block leader (`dict_bytes`) acts as an LZ4 dictionary: it is
/// prepended to the suffix bytes and skipped again by
/// `compress_with_dictionary_reuse`, which only emits output for the suffix
/// region.
///
/// Fix: dropped the needless `mut` on the `data` binding (`unused_mut`).
fn compress_and_write_terms_block(
    data: &mut dyn DataOutput,
    dict_bytes: &[u8],
    suffix_buffer: &[u8],
    ht: &mut FastHashTable,
) -> io::Result<usize> {
    let uncompressed_length = suffix_buffer.len();
    data.write_vint(uncompressed_length as i32)?;
    let mut combined = Vec::with_capacity(dict_bytes.len() + suffix_buffer.len());
    combined.extend_from_slice(dict_bytes);
    combined.extend_from_slice(suffix_buffer);
    let compressed = lz4::compress_with_dictionary_reuse(&combined, dict_bytes.len(), ht);
    data.write_all(&compressed)?;
    Ok(uncompressed_length)
}
/// Writes the reverse term index: for every 1024th term
/// (`TERMS_DICT_REVERSE_INDEX_SIZE`), the minimal sort key distinguishing
/// it from the last term of the previous index block, plus a
/// DirectMonotonic table of cumulative key offsets.
fn write_terms_index(
    meta: &mut dyn IndexOutput,
    data: &mut dyn IndexOutput,
    sorted_terms: &[&[u8]],
) -> io::Result<()> {
    meta.write_le_int(TERMS_DICT_REVERSE_INDEX_SHIFT)?;
    let start = data.file_pointer() as i64;
    let mut address_buffer = MemoryIndexOutput::new("temp_reverse_addr".to_string());
    let mut dm_writer = DirectMonotonicWriter::new(DIRECT_MONOTONIC_BLOCK_SHIFT);
    // Last term of the previous 1024-term run; None before the first run ends.
    let mut previous: Option<&[u8]> = None;
    // Cumulative length of all sort keys written so far.
    let mut offset: i64 = 0;
    for (ord, term) in sorted_terms.iter().enumerate() {
        if (ord & TERMS_DICT_REVERSE_INDEX_MASK) == 0 {
            dm_writer.add(offset);
            let sort_key_len = if ord == 0 {
                // The very first index entry stores an empty key.
                0
            } else {
                // Shortest prefix of `term` that sorts it after `previous`.
                string_helper::sort_key_length(previous.unwrap(), term)
            };
            offset += sort_key_len as i64;
            data.write_all(&term[..sort_key_len])?;
        }
        if (ord & TERMS_DICT_REVERSE_INDEX_MASK) == TERMS_DICT_REVERSE_INDEX_MASK {
            // Remember the run's last term for the next index entry's key.
            previous = Some(term);
        }
    }
    // Final sentinel offset so readers can compute the last key's length.
    dm_writer.add(offset);
    dm_writer.finish(meta, &mut address_buffer)?;
    meta.write_le_long(start)?;
    meta.write_le_long(data.file_pointer() as i64 - start)?;
    let addr_start = data.file_pointer() as i64;
    data.write_all(address_buffer.bytes())?;
    meta.write_le_long(addr_start)?;
    meta.write_le_long(data.file_pointer() as i64 - addr_start)?;
    Ok(())
}
#[cfg(test)]
mod tests {
use super::*;
use crate::codecs::codec_footers::FOOTER_LENGTH;
use crate::codecs::codec_headers::index_header_length;
use crate::codecs::lucene90::doc_values_producer::BufferedDocValuesProducer;
use crate::document::{DocValuesType, IndexOptions};
use crate::index::FieldInfo;
use crate::index::field_infos::PointDimensionConfig;
use crate::store::{MemoryDirectory, SharedDirectory};
use crate::test_util::TestDataReader;
use assertables::{assert_ge, assert_gt};
/// Shorthand: the UTF-8 bytes of `s` as an owned vec.
fn bytes(s: &str) -> Vec<u8> {
    s.as_bytes().to_vec()
}
/// Fresh in-memory directory for each test.
fn make_test_directory() -> SharedDirectory {
    MemoryDirectory::create()
}
/// Drives `write` with a buffered producer built from `fields`, creating a
/// matching `FieldInfo` for each entry.
fn test_write(
    directory: &dyn Directory,
    segment_name: &str,
    segment_suffix: &str,
    segment_id: &[u8; 16],
    fields: &[DocValuesFieldData],
    num_docs: i32,
) -> io::Result<Vec<String>> {
    let producer = BufferedDocValuesProducer::new(fields);
    let field_infos: Vec<FieldInfo> = fields
        .iter()
        .map(|f| {
            FieldInfo::new(
                f.name.clone(),
                f.number,
                false,
                true,
                IndexOptions::None,
                f.doc_values_type,
                PointDimensionConfig::default(),
            )
        })
        .collect();
    let fi_refs: Vec<&FieldInfo> = field_infos.iter().collect();
    write(
        directory,
        segment_name,
        segment_suffix,
        segment_id,
        &fi_refs,
        &producer,
        num_docs,
    )
}
// Terse constructors for the per-document value structs.
fn nv(doc_id: i32, value: i64) -> NumericDocValue {
    NumericDocValue { doc_id, value }
}
fn bv(doc_id: i32, value: Vec<u8>) -> BinaryDocValue {
    BinaryDocValue { doc_id, value }
}
fn sv(doc_id: i32, value: Vec<u8>) -> SortedDocValue {
    SortedDocValue { doc_id, value }
}
fn snv(doc_id: i32, values: Vec<i64>) -> SortedNumericDocValue {
    SortedNumericDocValue { doc_id, values }
}
fn ssv(doc_id: i32, values: Vec<Vec<u8>>) -> SortedSetDocValue {
    SortedSetDocValue { doc_id, values }
}
// Builders pairing a field's metadata with its buffered values — one per
// doc-values type.
fn make_field_data_numeric(
    name: &str,
    number: u32,
    values: Vec<NumericDocValue>,
) -> DocValuesFieldData {
    DocValuesFieldData {
        name: name.to_string(),
        number,
        doc_values_type: DocValuesType::Numeric,
        doc_values: DocValuesAccumulator::Numeric(values),
    }
}
fn make_field_data_binary(
    name: &str,
    number: u32,
    values: Vec<BinaryDocValue>,
) -> DocValuesFieldData {
    DocValuesFieldData {
        name: name.to_string(),
        number,
        doc_values_type: DocValuesType::Binary,
        doc_values: DocValuesAccumulator::Binary(values),
    }
}
fn make_field_data_sorted(
    name: &str,
    number: u32,
    values: Vec<SortedDocValue>,
) -> DocValuesFieldData {
    DocValuesFieldData {
        name: name.to_string(),
        number,
        doc_values_type: DocValuesType::Sorted,
        doc_values: DocValuesAccumulator::Sorted(values),
    }
}
fn make_field_data_sorted_numeric(
    name: &str,
    number: u32,
    values: Vec<SortedNumericDocValue>,
) -> DocValuesFieldData {
    DocValuesFieldData {
        name: name.to_string(),
        number,
        doc_values_type: DocValuesType::SortedNumeric,
        doc_values: DocValuesAccumulator::SortedNumeric(values),
    }
}
fn make_field_data_sorted_set(
    name: &str,
    number: u32,
    values: Vec<SortedSetDocValue>,
) -> DocValuesFieldData {
    DocValuesFieldData {
        name: name.to_string(),
        number,
        doc_values_type: DocValuesType::SortedSet,
        doc_values: DocValuesAccumulator::SortedSet(values),
    }
}
#[test]
fn test_gcd_compute() {
    assert_eq!(gcd_compute(0, 5), 5);
    assert_eq!(gcd_compute(5, 0), 5);
    assert_eq!(gcd_compute(12, 8), 4);
    assert_eq!(gcd_compute(0, 0), 0);
    assert_eq!(gcd_compute(7, 7), 7);
    assert_eq!(gcd_compute(100, 75), 25);
    // The sign is dropped before computing the GCD.
    assert_eq!(gcd_compute(-12, 8), 4);
}
#[test]
fn test_vec_output_vint() {
    // 0 and 127 fit in one vint byte; 128 needs a continuation byte.
    let mut buf = Vec::new();
    VecOutput(&mut buf).write_vint(0).unwrap();
    assert_eq!(buf, vec![0]);
    let mut buf = Vec::new();
    VecOutput(&mut buf).write_vint(127).unwrap();
    assert_eq!(buf, vec![0x7F]);
    let mut buf = Vec::new();
    VecOutput(&mut buf).write_vint(128).unwrap();
    assert_eq!(buf, vec![0x80, 0x01]);
}
#[test]
fn test_sorted_numeric_constant() {
    // Three docs, all with the single value 42 -> dense sentinel and
    // zero bits per value (the constant lives in the `min` slot).
    let fields = vec![make_field_data_sorted_numeric(
        "modified",
        1,
        vec![snv(0, vec![42]), snv(1, vec![42]), snv(2, vec![42])],
    )];
    let segment_id = [0u8; 16];
    let directory = make_test_directory();
    let result = test_write(&directory, "_0", "Lucene90_0", &segment_id, &fields, 3).unwrap();
    assert_len_eq_x!(&result, 2);
    assert_eq!(result[0], "_0_Lucene90_0.dvm");
    assert_eq!(result[1], "_0_Lucene90_0.dvd");
    let dvm = directory.read_file(&result[0]).unwrap();
    // Index-header magic bytes.
    assert_eq!(&dvm[0..4], &[0x3f, 0xd7, 0x6c, 0x17]);
    let meta_header_len = index_header_length(META_CODEC, "Lucene90_0");
    let entry = &dvm[meta_header_len..];
    // Field number 1, SORTED_NUMERIC type tag.
    assert_eq!(&entry[0..4], &1i32.to_le_bytes());
    assert_eq!(entry[4], SORTED_NUMERIC);
    // Dense docs-with-field sentinel: offset -1, length 0, count -1, 0xFF.
    assert_eq!(&entry[5..13], &(-1i64).to_le_bytes());
    assert_eq!(&entry[13..21], &0i64.to_le_bytes());
    assert_eq!(&entry[21..23], &(-1i16).to_le_bytes());
    assert_eq!(entry[23], 0xFF);
    // numValues = 3, no decode table (-1), 0 bits per value, min = 42.
    assert_eq!(&entry[24..32], &3i64.to_le_bytes());
    assert_eq!(&entry[32..36], &(-1i32).to_le_bytes());
    assert_eq!(entry[36], 0);
    assert_eq!(&entry[37..45], &42i64.to_le_bytes());
    let footer_start = dvm.len() - FOOTER_LENGTH;
    // Codec footer magic bytes.
    assert_eq!(
        &dvm[footer_start..footer_start + 4],
        &[0xc0, 0x28, 0x93, 0xe8]
    );
}
#[test]
fn test_sorted_numeric_different() {
    // 1000/2000/3000 compress with a GCD of 1000.
    let fields = vec![make_field_data_sorted_numeric(
        "modified",
        1,
        vec![snv(0, vec![1000]), snv(1, vec![2000]), snv(2, vec![3000])],
    )];
    let segment_id = [0u8; 16];
    let directory = make_test_directory();
    let result = test_write(&directory, "_0", "Lucene90_0", &segment_id, &fields, 3).unwrap();
    let dvm = directory.read_file(&result[0]).unwrap();
    let dvd = directory.read_file(&result[1]).unwrap();
    assert_not_empty!(dvm);
    assert_not_empty!(dvd);
    let meta_header_len = index_header_length(META_CODEC, "Lucene90_0");
    let entry = &dvm[meta_header_len..];
    assert_eq!(&entry[0..4], &1i32.to_le_bytes());
    assert_eq!(entry[4], SORTED_NUMERIC);
    assert_eq!(&entry[24..32], &3i64.to_le_bytes());
    // The gcd field sits 8 bytes (the `min` long) past offset 37.
    let gcd_offset = 37 + 8;
    let gcd_val = i64::from_le_bytes(entry[gcd_offset..gcd_offset + 8].try_into().unwrap());
    assert_eq!(gcd_val, 1000);
}
#[test]
fn test_sorted_set_single_valued() {
    // Each doc has exactly one value -> mode byte 0 (single-valued layout).
    let fields = vec![make_field_data_sorted_set(
        "path",
        0,
        vec![
            ssv(0, vec![bytes("/a.txt")]),
            ssv(1, vec![bytes("/b.txt")]),
            ssv(2, vec![bytes("/c.txt")]),
        ],
    )];
    let segment_id = [0u8; 16];
    let directory = make_test_directory();
    let result = test_write(&directory, "_0", "Lucene90_0", &segment_id, &fields, 3).unwrap();
    let dvm = directory.read_file(&result[0]).unwrap();
    let dvd = directory.read_file(&result[1]).unwrap();
    assert_not_empty!(dvm);
    assert_not_empty!(dvd);
    // Both files carry the index-header magic.
    assert_eq!(&dvm[0..4], &[0x3f, 0xd7, 0x6c, 0x17]);
    assert_eq!(&dvd[0..4], &[0x3f, 0xd7, 0x6c, 0x17]);
    let meta_header_len = index_header_length(META_CODEC, "Lucene90_0");
    let entry = &dvm[meta_header_len..];
    assert_eq!(&entry[0..4], &0i32.to_le_bytes());
    assert_eq!(entry[4], SORTED_SET);
    // Mode byte 0: single-valued.
    assert_eq!(entry[5], 0);
    let footer_start = dvm.len() - FOOTER_LENGTH;
    assert_eq!(
        &dvm[footer_start..footer_start + 4],
        &[0xc0, 0x28, 0x93, 0xe8]
    );
}
#[test]
fn test_header_footer_eof() {
    // The per-field list ends with a -1 field number just before the footer.
    let fields = vec![make_field_data_sorted_numeric(
        "modified",
        0,
        vec![snv(0, vec![1])],
    )];
    let segment_id = [0u8; 16];
    let directory = make_test_directory();
    let result = test_write(&directory, "_0", "", &segment_id, &fields, 1).unwrap();
    let dvm = directory.read_file(&result[0]).unwrap();
    let eof_pos = dvm.len() - FOOTER_LENGTH - 4;
    assert_eq!(&dvm[eof_pos..eof_pos + 4], &(-1i32).to_le_bytes());
}
#[test]
fn test_terms_dict_prefix_compression_byte() {
    // Nibble packing used by add_terms_dict: low = prefix length,
    // high = suffix length - 1.
    let prefix_len = 3usize;
    let suffix_len = 5usize;
    let byte = (prefix_len.min(15) as u8) | ((suffix_len.saturating_sub(1).min(15) as u8) << 4);
    assert_eq!(byte, 0x43);
    let byte2 = 15u8;
    assert_eq!(byte2, 0x0F);
}
#[test]
fn test_two_fields_combined() {
    // Fields are written in slice order; the SORTED_SET field comes first.
    let fields = vec![
        make_field_data_sorted_set(
            "path",
            0,
            vec![
                ssv(0, vec![bytes("/a.txt")]),
                ssv(1, vec![bytes("/b.txt")]),
                ssv(2, vec![bytes("/c.txt")]),
            ],
        ),
        make_field_data_sorted_numeric(
            "modified",
            1,
            vec![snv(0, vec![1000]), snv(1, vec![2000]), snv(2, vec![3000])],
        ),
    ];
    let segment_id = [0u8; 16];
    let directory = make_test_directory();
    let result = test_write(&directory, "_0", "Lucene90_0", &segment_id, &fields, 3).unwrap();
    assert_len_eq_x!(&result, 2);
    assert_eq!(result[0], "_0_Lucene90_0.dvm");
    assert_eq!(result[1], "_0_Lucene90_0.dvd");
    let dvm = directory.read_file(&result[0]).unwrap();
    let meta_header_len = index_header_length(META_CODEC, "Lucene90_0");
    let entry = &dvm[meta_header_len..];
    assert_eq!(&entry[0..4], &0i32.to_le_bytes());
    assert_eq!(entry[4], SORTED_SET);
    let eof_pos = dvm.len() - FOOTER_LENGTH - 4;
    assert_eq!(&dvm[eof_pos..eof_pos + 4], &(-1i32).to_le_bytes());
}
/// Test-side mirror of the Java/Lucene metadata reader: walks a `.dvm`
/// byte slice field entry by field entry, mirroring the write order of the
/// producer functions above so structural drift shows up as a parse failure.
struct DvmReader<'a>(TestDataReader<'a>);
impl<'a> DvmReader<'a> {
    fn new(data: &'a [u8], start: usize) -> Self {
        Self(TestDataReader::new(data, start))
    }
    /// Consumes one numeric entry (the `write_values` layout) and returns
    /// its numValues so callers can size follow-on structures.
    fn read_numeric(&mut self) -> i64 {
        let _docs_with_field_offset = self.0.read_le_long();
        let _docs_with_field_length = self.0.read_le_long();
        let _jump_table_entry_count = self.0.read_le_short();
        let _dense_rank_power = self.0.read_byte();
        let num_values = self.0.read_le_long();
        // table_size >= 0 means a decode table of that many longs follows.
        let table_size = self.0.read_le_int();
        if table_size >= 0 {
            for _ in 0..table_size {
                let _table_val = self.0.read_le_long();
            }
        }
        let _bits_per_value = self.0.read_byte();
        let _min_value = self.0.read_le_long();
        let _gcd = self.0.read_le_long();
        let _values_offset = self.0.read_le_long();
        let _values_length = self.0.read_le_long();
        let _value_jump_table_offset = self.0.read_le_long();
        num_values
    }
    /// Skips DirectMonotonic per-block meta: long/int/long/byte per block,
    /// with a fixed 2^16 block size (matches DIRECT_MONOTONIC_BLOCK_SHIFT;
    /// `_block_shift` is ignored — presumably always 16 here).
    fn read_dm_meta(&mut self, num_values: i64, _block_shift: i32) {
        let num_blocks = (num_values + (1 << 16) - 1) / (1 << 16);
        for _ in 0..num_blocks {
            let _min = self.0.read_le_long();
            let _avg_inc = self.0.read_le_int();
            let _offset = self.0.read_le_long();
            let _bits_required = self.0.read_byte();
        }
    }
    /// Consumes the terms dictionary meta written by `add_terms_dict` plus
    /// the reverse index meta from `write_terms_index`.
    fn read_term_dict(&mut self) {
        let terms_dict_size = self.0.read_vlong();
        let block_shift = self.0.read_le_int();
        // One block address per 64-term LZ4 block.
        let addresses_size = (terms_dict_size + (1i64 << TERMS_DICT_BLOCK_LZ4_SHIFT) - 1)
            >> TERMS_DICT_BLOCK_LZ4_SHIFT;
        self.read_dm_meta(addresses_size, block_shift);
        let _max_term_length = self.0.read_le_int();
        let _max_block_length = self.0.read_le_int();
        let _terms_data_offset = self.0.read_le_long();
        let _terms_data_length = self.0.read_le_long();
        let _terms_addresses_offset = self.0.read_le_long();
        let _terms_addresses_length = self.0.read_le_long();
        let terms_dict_index_shift = self.0.read_le_int();
        // Reverse index: one entry per 1024 terms, plus the sentinel offset.
        let index_size =
            (terms_dict_size + (1i64 << terms_dict_index_shift) - 1) >> terms_dict_index_shift;
        self.read_dm_meta(1 + index_size, block_shift);
        let _terms_index_offset = self.0.read_le_long();
        let _terms_index_length = self.0.read_le_long();
        let _terms_index_addresses_offset = self.0.read_le_long();
        let _terms_index_addresses_length = self.0.read_le_long();
    }
    fn read_sorted(&mut self) {
        let _ = self.read_numeric();
        self.read_term_dict();
    }
    /// Consumes one binary entry (the `add_binary_field` layout).
    fn read_binary(&mut self) {
        let _data_offset = self.0.read_le_long();
        let _data_length = self.0.read_le_long();
        let _docs_with_field_offset = self.0.read_le_long();
        let _docs_with_field_length = self.0.read_le_long();
        let _jump_table_entry_count = self.0.read_le_short();
        let _dense_rank_power = self.0.read_byte();
        let num_docs_with_field = self.0.read_le_int();
        let min_length = self.0.read_le_int();
        let max_length = self.0.read_le_int();
        // The address table only exists for variable-length values.
        if max_length > min_length {
            let _addresses_offset = self.0.read_le_long();
            let block_shift = self.0.read_vint();
            let num_addresses = num_docs_with_field as i64 + 1;
            self.read_dm_meta(num_addresses, block_shift);
            let _addresses_length = self.0.read_le_long();
        }
    }
    /// Mode byte 0 = single-valued (SORTED layout), else multi-valued.
    fn read_sorted_set(&mut self) {
        let multi_valued = self.0.read_byte();
        if multi_valued == 0 {
            let _ = self.read_numeric();
            self.read_term_dict();
        } else {
            self.read_sorted_numeric();
            self.read_term_dict();
        }
    }
    /// Numeric entry plus the optional per-document address table.
    fn read_sorted_numeric(&mut self) {
        let num_values = self.read_numeric();
        let num_docs_with_field = self.0.read_le_int();
        if num_values > num_docs_with_field as i64 {
            let _addresses_offset = self.0.read_le_long();
            let block_shift = self.0.read_vint();
            let num_addresses = num_docs_with_field as i64 + 1;
            self.read_dm_meta(num_addresses, block_shift);
            let _addresses_length = self.0.read_le_long();
        }
    }
}
#[test]
fn test_two_fields_dvm_parseable_like_java() {
    // One SORTED_SET field followed by one SORTED_NUMERIC field; each
    // metadata entry must be walkable exactly like Java's producer does,
    // landing on the next field number and finally the EOF marker.
    let paths = vec![
        ssv(0, vec![bytes("/a.txt")]),
        ssv(1, vec![bytes("/b.txt")]),
        ssv(2, vec![bytes("/c.txt")]),
    ];
    let modified = vec![snv(0, vec![1000]), snv(1, vec![2000]), snv(2, vec![3000])];
    let fields = vec![
        make_field_data_sorted_set("path", 0, paths),
        make_field_data_sorted_numeric("modified", 1, modified),
    ];
    let directory = make_test_directory();
    let segment_id = [0u8; 16];
    let result = test_write(&directory, "_0", "Lucene90_0", &segment_id, &fields, 3).unwrap();
    let dvm = directory.read_file(&result[0]).unwrap();
    let mut reader = DvmReader::new(&dvm, index_header_length(META_CODEC, "Lucene90_0"));
    let field0 = reader.0.read_le_int();
    assert_eq!(field0, 0, "first field number should be 0");
    let type0 = reader.0.read_byte();
    assert_eq!(type0, SORTED_SET, "first field type should be SORTED_SET");
    reader.read_sorted_set();
    let field1 = reader.0.read_le_int();
    assert_eq!(
        field1, 1,
        "second field number should be 1 (got {field1} — extra bytes written for SORTED_SET?)"
    );
    let type1 = reader.0.read_byte();
    assert_eq!(
        type1, SORTED_NUMERIC,
        "second field type should be SORTED_NUMERIC"
    );
    reader.read_sorted_numeric();
    let eof = reader.0.read_le_int();
    assert_eq!(eof, -1, "expected EOF marker (-1)");
}
#[test]
fn test_sorted_numeric_with_gcd() {
    // 100/200/300 share a common divisor, exercising GCD compression;
    // the write must still produce exactly the two output files.
    let values = vec![snv(0, vec![100]), snv(1, vec![200]), snv(2, vec![300])];
    let fields = vec![make_field_data_sorted_numeric("field", 0, values)];
    let directory = make_test_directory();
    let result = test_write(&directory, "_0", "", &[0u8; 16], &fields, 3).unwrap();
    assert_len_eq_x!(&result, 2);
}
#[test]
fn test_sorted_set_identical_paths() {
    // Every doc carries the same single term, so the dictionary collapses
    // to one entry; writing must still succeed and emit both files.
    let docs = vec![
        ssv(0, vec![bytes("/same.txt")]),
        ssv(1, vec![bytes("/same.txt")]),
        ssv(2, vec![bytes("/same.txt")]),
    ];
    let fields = vec![make_field_data_sorted_set("path", 0, docs)];
    let directory = make_test_directory();
    let result = test_write(&directory, "_0", "", &[0u8; 16], &fields, 3).unwrap();
    assert_len_eq_x!(&result, 2);
}
#[test]
// All three docs hold the same value (42), so the writer should use the
// constant encoding: bitsPerValue == 0 with the value stored as minValue.
// The entry is checked byte-by-byte at fixed offsets matching the layout
// that DvmReader::read_numeric consumes.
fn test_numeric_constant() {
let fields = vec![make_field_data_numeric(
"count",
0,
vec![nv(0, 42), nv(1, 42), nv(2, 42)],
)];
let segment_id = [0u8; 16];
let directory = make_test_directory();
let result = test_write(&directory, "_0", "Lucene90_0", &segment_id, &fields, 3).unwrap();
// result[0] is the metadata (.dvm) file.
let dvm = directory.read_file(&result[0]).unwrap();
let meta_header_len = index_header_length(META_CODEC, "Lucene90_0");
// `entry` starts right after the index header, at the first field entry.
let entry = &dvm[meta_header_len..];
// [0..4]  field number == 0
assert_eq!(&entry[0..4], &0i32.to_le_bytes());
// [4]     type tag == NUMERIC
assert_eq!(entry[4], NUMERIC);
// [5..13] docsWithFieldOffset == -1 (every doc has a value — dense)
assert_eq!(&entry[5..13], &(-1i64).to_le_bytes());
// [13..24] docsWithFieldLength (8) + jumpTableEntryCount (2) +
//          denseRankPower (1) — not asserted here.
// [24..32] numValues == 3
assert_eq!(&entry[24..32], &3i64.to_le_bytes());
// [32..36] tableSize == -1 (no lookup table)
assert_eq!(&entry[32..36], &(-1i32).to_le_bytes());
// [36]    bitsPerValue == 0 → constant encoding
assert_eq!(entry[36], 0);
// [37..45] minValue == 42, i.e. the constant itself
assert_eq!(&entry[37..45], &42i64.to_le_bytes());
}
#[test]
fn test_numeric_different_values() {
    // Distinct values rule out the constant encoding; both the metadata
    // and data files must still be written and non-empty.
    let values = vec![nv(0, 10), nv(1, 20), nv(2, 30)];
    let fields = vec![make_field_data_numeric("score", 0, values)];
    let directory = make_test_directory();
    let segment_id = [0u8; 16];
    let files = test_write(&directory, "_0", "", &segment_id, &fields, 3).unwrap();
    let dvm = directory.read_file(&files[0]).unwrap();
    let dvd = directory.read_file(&files[1]).unwrap();
    assert_not_empty!(dvm);
    assert_not_empty!(dvd);
}
#[test]
fn test_numeric_no_num_docs_with_field() {
    // A NUMERIC entry carries exactly the fields read_numeric() consumes —
    // no trailing numDocsWithField int — so the EOF marker must follow it
    // immediately.
    let values = vec![nv(0, 100), nv(1, 200), nv(2, 300)];
    let fields = vec![make_field_data_numeric("count", 0, values)];
    let directory = make_test_directory();
    let segment_id = [0u8; 16];
    let result = test_write(&directory, "_0", "Lucene90_0", &segment_id, &fields, 3).unwrap();
    let dvm = directory.read_file(&result[0]).unwrap();
    let mut reader = DvmReader::new(&dvm, index_header_length(META_CODEC, "Lucene90_0"));
    assert_eq!(reader.0.read_le_int(), 0);
    assert_eq!(reader.0.read_byte(), NUMERIC);
    reader.read_numeric();
    let eof = reader.0.read_le_int();
    assert_eq!(
        eof, -1,
        "expected EOF marker — NUMERIC should not write numDocsWithField"
    );
}
#[test]
fn test_sorted_field() {
    // A SORTED field's first entry must begin with its field number (0)
    // followed by the SORTED type tag.
    let values = vec![
        sv(0, bytes("alpha")),
        sv(1, bytes("beta")),
        sv(2, bytes("alpha")),
    ];
    let fields = vec![make_field_data_sorted("category", 0, values)];
    let directory = make_test_directory();
    let result = test_write(&directory, "_0", "Lucene90_0", &[0u8; 16], &fields, 3).unwrap();
    let dvm = directory.read_file(&result[0]).unwrap();
    let entry = &dvm[index_header_length(META_CODEC, "Lucene90_0")..];
    assert_eq!(&entry[0..4], &0i32.to_le_bytes());
    assert_eq!(entry[4], SORTED);
}
#[test]
fn test_sorted_parseable() {
    // A SORTED entry is an ordinals (numeric) part plus a terms dictionary;
    // consuming both must land exactly on the EOF marker.
    let values = vec![sv(0, bytes("x")), sv(1, bytes("y")), sv(2, bytes("z"))];
    let fields = vec![make_field_data_sorted("category", 0, values)];
    let directory = make_test_directory();
    let result = test_write(&directory, "_0", "Lucene90_0", &[0u8; 16], &fields, 3).unwrap();
    let dvm = directory.read_file(&result[0]).unwrap();
    let mut reader = DvmReader::new(&dvm, index_header_length(META_CODEC, "Lucene90_0"));
    assert_eq!(reader.0.read_le_int(), 0);
    assert_eq!(reader.0.read_byte(), SORTED);
    reader.read_sorted();
    let eof = reader.0.read_le_int();
    assert_eq!(eof, -1, "expected EOF marker");
}
#[test]
fn test_binary_fixed_length() {
    // Every value is 4 bytes, so min == max length (fixed-length encoding);
    // the entry must start with field number 0 and the BINARY tag.
    let values = vec![
        bv(0, vec![0xAA, 0xBB, 0xCC, 0xDD]),
        bv(1, vec![0x11, 0x22, 0x33, 0x44]),
        bv(2, vec![0xFF, 0xEE, 0xDD, 0xCC]),
    ];
    let fields = vec![make_field_data_binary("hash", 0, values)];
    let directory = make_test_directory();
    let result = test_write(&directory, "_0", "Lucene90_0", &[0u8; 16], &fields, 3).unwrap();
    let dvm = directory.read_file(&result[0]).unwrap();
    let entry = &dvm[index_header_length(META_CODEC, "Lucene90_0")..];
    assert_eq!(&entry[0..4], &0i32.to_le_bytes());
    assert_eq!(entry[4], BINARY);
}
#[test]
fn test_binary_parseable() {
    // Fixed-length binary values; walking the single entry must land on
    // the EOF marker.
    let values = vec![
        bv(0, vec![1, 2, 3]),
        bv(1, vec![4, 5, 6]),
        bv(2, vec![7, 8, 9]),
    ];
    let fields = vec![make_field_data_binary("data", 0, values)];
    let directory = make_test_directory();
    let result = test_write(&directory, "_0", "Lucene90_0", &[0u8; 16], &fields, 3).unwrap();
    let dvm = directory.read_file(&result[0]).unwrap();
    let mut reader = DvmReader::new(&dvm, index_header_length(META_CODEC, "Lucene90_0"));
    assert_eq!(reader.0.read_le_int(), 0);
    assert_eq!(reader.0.read_byte(), BINARY);
    reader.read_binary();
    let eof = reader.0.read_le_int();
    assert_eq!(eof, -1, "expected EOF marker");
}
#[test]
fn test_binary_variable_length() {
    // Lengths 1, 3, 2 force the variable-length path, which adds per-doc
    // address metadata; the entry must still parse cleanly to EOF.
    let values = vec![bv(0, vec![1]), bv(1, vec![2, 3, 4]), bv(2, vec![5, 6])];
    let fields = vec![make_field_data_binary("payload", 0, values)];
    let directory = make_test_directory();
    let result = test_write(&directory, "_0", "Lucene90_0", &[0u8; 16], &fields, 3).unwrap();
    let dvm = directory.read_file(&result[0]).unwrap();
    let mut reader = DvmReader::new(&dvm, index_header_length(META_CODEC, "Lucene90_0"));
    assert_eq!(reader.0.read_le_int(), 0);
    assert_eq!(reader.0.read_byte(), BINARY);
    reader.read_binary();
    let eof = reader.0.read_le_int();
    assert_eq!(
        eof, -1,
        "expected EOF marker — variable-length binary should be parseable"
    );
}
#[test]
fn test_all_dv_types_combined() {
    // One field of every doc-values type in a single segment; the metadata
    // entries must appear in field-number order and chain cleanly to EOF.
    let fields = vec![
        make_field_data_numeric("num", 0, vec![nv(0, 10), nv(1, 20), nv(2, 30)]),
        make_field_data_binary(
            "bin",
            1,
            vec![bv(0, vec![0xAA]), bv(1, vec![0xBB]), bv(2, vec![0xCC])],
        ),
        make_field_data_sorted(
            "sort",
            2,
            vec![sv(0, bytes("a")), sv(1, bytes("b")), sv(2, bytes("c"))],
        ),
        make_field_data_sorted_set(
            "sortset",
            3,
            vec![
                ssv(0, vec![bytes("x")]),
                ssv(1, vec![bytes("y")]),
                ssv(2, vec![bytes("z")]),
            ],
        ),
        make_field_data_sorted_numeric(
            "sortnum",
            4,
            vec![snv(0, vec![100]), snv(1, vec![200]), snv(2, vec![300])],
        ),
    ];
    let directory = make_test_directory();
    let segment_id = [0u8; 16];
    let result = test_write(&directory, "_0", "Lucene90_0", &segment_id, &fields, 3).unwrap();
    let dvm = directory.read_file(&result[0]).unwrap();
    let mut reader = DvmReader::new(&dvm, index_header_length(META_CODEC, "Lucene90_0"));
    // Walk each entry: field number, type tag, then the type-specific body.
    for (expected_field, expected_type) in [
        (0, NUMERIC),
        (1, BINARY),
        (2, SORTED),
        (3, SORTED_SET),
        (4, SORTED_NUMERIC),
    ] {
        assert_eq!(reader.0.read_le_int(), expected_field);
        assert_eq!(reader.0.read_byte(), expected_type);
        match expected_type {
            NUMERIC => {
                reader.read_numeric();
            }
            BINARY => reader.read_binary(),
            SORTED => reader.read_sorted(),
            SORTED_SET => reader.read_sorted_set(),
            _ => reader.read_sorted_numeric(),
        }
    }
    assert_eq!(reader.0.read_le_int(), -1);
}
#[test]
fn test_sorted_numeric_multi_valued() {
    // Docs carry 1, 2, and 3 values respectively; per-doc values are given
    // out of order and the entry must still parse through to EOF.
    let docs = vec![
        snv(0, vec![100]),
        snv(1, vec![300, 200]),
        snv(2, vec![600, 400, 500]),
    ];
    let fields = vec![make_field_data_sorted_numeric("tags", 0, docs)];
    let directory = make_test_directory();
    let result = test_write(&directory, "_0", "Lucene90_0", &[0u8; 16], &fields, 3).unwrap();
    assert_len_eq_x!(&result, 2);
    let dvm = directory.read_file(&result[0]).unwrap();
    let mut reader = DvmReader::new(&dvm, index_header_length(META_CODEC, "Lucene90_0"));
    assert_eq!(reader.0.read_le_int(), 0);
    assert_eq!(reader.0.read_byte(), SORTED_NUMERIC);
    reader.read_sorted_numeric();
    assert_eq!(reader.0.read_le_int(), -1);
}
#[test]
// Multi-valued SORTED_NUMERIC: docs hold 1 + 2 + 3 = 6 values in total.
// Verifies the numValues field of the entry at its fixed byte offset.
fn test_sorted_numeric_multi_valued_values_sorted() {
let fields = vec![make_field_data_sorted_numeric(
"nums",
0,
vec![
snv(0, vec![100]),
snv(1, vec![300, 200]),
snv(2, vec![600, 400, 500]),
],
)];
let segment_id = [0u8; 16];
let directory = make_test_directory();
let result = test_write(&directory, "_0", "Lucene90_0", &segment_id, &fields, 3).unwrap();
let dvm = directory.read_file(&result[0]).unwrap();
let meta_header_len = index_header_length(META_CODEC, "Lucene90_0");
let entry = &dvm[meta_header_len..];
// Offset 24: field number (4) + type tag (1) + docsWithFieldOffset (8) +
// docsWithFieldLength (8) + jumpTableEntryCount (2) + denseRankPower (1)
// — the next 8 bytes are numValues.
let num_values = i64::from_le_bytes(entry[24..32].try_into().unwrap());
assert_eq!(num_values, 6, "total values across all docs should be 6");
}
#[test]
fn test_two_fields_multi_valued_sorted_numeric_dvm_parseable() {
    // A single-valued SORTED_SET field followed by a multi-valued
    // SORTED_NUMERIC field; both entries must chain to the EOF marker.
    let paths = vec![
        ssv(0, vec![bytes("/a.txt")]),
        ssv(1, vec![bytes("/b.txt")]),
        ssv(2, vec![bytes("/c.txt")]),
    ];
    let counts = vec![
        snv(0, vec![10, 20]),
        snv(1, vec![30]),
        snv(2, vec![40, 50, 60]),
    ];
    let fields = vec![
        make_field_data_sorted_set("path", 0, paths),
        make_field_data_sorted_numeric("counts", 1, counts),
    ];
    let directory = make_test_directory();
    let result = test_write(&directory, "_0", "Lucene90_0", &[0u8; 16], &fields, 3).unwrap();
    let dvm = directory.read_file(&result[0]).unwrap();
    let mut reader = DvmReader::new(&dvm, index_header_length(META_CODEC, "Lucene90_0"));
    assert_eq!(reader.0.read_le_int(), 0);
    assert_eq!(reader.0.read_byte(), SORTED_SET);
    reader.read_sorted_set();
    assert_eq!(reader.0.read_le_int(), 1);
    assert_eq!(reader.0.read_byte(), SORTED_NUMERIC);
    reader.read_sorted_numeric();
    assert_eq!(reader.0.read_le_int(), -1);
}
#[test]
fn test_sorted_set_multi_valued() {
    // Docs carry 1, 2, and 3 terms; term lists are given in non-sorted
    // order and the multi-valued entry must parse through to EOF.
    let docs = vec![
        ssv(0, vec![bytes("alpha")]),
        ssv(1, vec![bytes("gamma"), bytes("beta")]),
        ssv(2, vec![bytes("delta"), bytes("alpha"), bytes("gamma")]),
    ];
    let fields = vec![make_field_data_sorted_set("tags", 0, docs)];
    let directory = make_test_directory();
    let result = test_write(&directory, "_0", "Lucene90_0", &[0u8; 16], &fields, 3).unwrap();
    assert_len_eq_x!(&result, 2);
    let dvm = directory.read_file(&result[0]).unwrap();
    let mut reader = DvmReader::new(&dvm, index_header_length(META_CODEC, "Lucene90_0"));
    assert_eq!(reader.0.read_le_int(), 0);
    assert_eq!(reader.0.read_byte(), SORTED_SET);
    reader.read_sorted_set();
    assert_eq!(reader.0.read_le_int(), -1);
}
#[test]
fn test_sorted_set_multi_valued_dedup() {
    // Duplicate terms within a doc must be handled by the writer; the
    // resulting entry must still parse cleanly to EOF.
    let docs = vec![
        ssv(0, vec![bytes("alpha"), bytes("alpha"), bytes("beta")]),
        ssv(1, vec![bytes("beta"), bytes("beta")]),
    ];
    let fields = vec![make_field_data_sorted_set("tags", 0, docs)];
    let directory = make_test_directory();
    let result = test_write(&directory, "_0", "Lucene90_0", &[0u8; 16], &fields, 2).unwrap();
    assert_len_eq_x!(&result, 2);
    let dvm = directory.read_file(&result[0]).unwrap();
    let mut reader = DvmReader::new(&dvm, index_header_length(META_CODEC, "Lucene90_0"));
    assert_eq!(reader.0.read_le_int(), 0);
    assert_eq!(reader.0.read_byte(), SORTED_SET);
    reader.read_sorted_set();
    assert_eq!(reader.0.read_le_int(), -1);
}
#[test]
fn test_two_fields_multi_valued_sorted_set_and_sorted_numeric() {
    // Both fields are multi-valued; their entries must chain in field
    // order and finish on the EOF marker.
    let tags = vec![
        ssv(0, vec![bytes("a"), bytes("b")]),
        ssv(1, vec![bytes("c")]),
        ssv(2, vec![bytes("a"), bytes("c")]),
    ];
    let nums = vec![
        snv(0, vec![10, 20]),
        snv(1, vec![30]),
        snv(2, vec![40, 50, 60]),
    ];
    let fields = vec![
        make_field_data_sorted_set("tags", 0, tags),
        make_field_data_sorted_numeric("nums", 1, nums),
    ];
    let directory = make_test_directory();
    let result = test_write(&directory, "_0", "Lucene90_0", &[0u8; 16], &fields, 3).unwrap();
    let dvm = directory.read_file(&result[0]).unwrap();
    let mut reader = DvmReader::new(&dvm, index_header_length(META_CODEC, "Lucene90_0"));
    assert_eq!(reader.0.read_le_int(), 0);
    assert_eq!(reader.0.read_byte(), SORTED_SET);
    reader.read_sorted_set();
    assert_eq!(reader.0.read_le_int(), 1);
    assert_eq!(reader.0.read_byte(), SORTED_NUMERIC);
    reader.read_sorted_numeric();
    assert_eq!(reader.0.read_le_int(), -1);
}
#[test]
fn test_sparse_numeric_field() {
    // Only docs 1, 5, 8 (of 10) carry a value, so the writer must emit an
    // IndexedDISI docs-with-field set rather than the dense (-1) marker.
    let fields = vec![make_field_data_numeric(
        "score",
        0,
        vec![nv(1, 100), nv(5, 200), nv(8, 300)],
    )];
    let segment_id = [0u8; 16];
    let dir = make_test_directory();
    let names = test_write(&dir, "_0", "", &segment_id, &fields, 10).unwrap();
    let dvm = dir.read_file(&names[0]).unwrap();
    let dvd = dir.read_file(&names[1]).unwrap();
    let meta_header_len = index_header_length(META_CODEC, "");
    let mut reader = TestDataReader::new(&dvm[meta_header_len..], 0);
    assert_eq!(reader.read_le_int(), 0);
    assert_eq!(reader.read_byte(), NUMERIC);
    // A non-negative offset means a real DISI structure was written.
    let docs_with_field_offset = reader.read_le_long();
    assert_ge!(docs_with_field_offset, 0);
    let docs_with_field_length = reader.read_le_long();
    assert_gt!(docs_with_field_length, 0);
    let jump_table_entry_count = reader.read_le_short();
    assert_ge!(jump_table_entry_count, 0);
    let dense_rank_power = reader.read_byte();
    assert_eq!(dense_rank_power, 9);
    // The DISI structure must sit after the data file's index header...
    let data_header_len = index_header_length(DATA_CODEC, "");
    let disi_start = docs_with_field_offset as usize;
    assert_ge!(disi_start, data_header_len);
    // BUGFIX: the original assertion here was a tautology
    // (`len == (start + len) - start`) and verified nothing. Check that
    // the recorded DISI span actually lies within the data file instead.
    assert!(
        disi_start + docs_with_field_length as usize <= dvd.len(),
        "DISI span must fit inside the .dvd file"
    );
    // SPARSE-encoded block: block id 0, then cardinality-1 == 2 (3 docs).
    let block_id = i16::from_le_bytes(dvd[disi_start..disi_start + 2].try_into().unwrap());
    assert_eq!(block_id, 0);
    let card_minus_1 =
        i16::from_le_bytes(dvd[disi_start + 2..disi_start + 4].try_into().unwrap());
    assert_eq!(card_minus_1, 2);
}
#[test]
fn test_sparse_binary_field() {
    // Only docs 1 and 3 of 5 have a value — sparse docs-with-field set.
    let fields = vec![make_field_data_binary(
        "tag",
        0,
        vec![bv(1, b"hello".to_vec()), bv(3, b"world".to_vec())],
    )];
    let directory = make_test_directory();
    let names = test_write(&directory, "_0", "", &[0u8; 16], &fields, 5).unwrap();
    let dvm = directory.read_file(&names[0]).unwrap();
    let mut reader = TestDataReader::new(&dvm[index_header_length(META_CODEC, "")..], 0);
    assert_eq!(reader.read_le_int(), 0);
    assert_eq!(reader.read_byte(), BINARY);
    // Skip dataOffset / dataLength, then check the DISI bookkeeping.
    let _data_offset = reader.read_le_long();
    let _data_length = reader.read_le_long();
    let docs_with_field_offset = reader.read_le_long();
    assert_ge!(docs_with_field_offset, 0);
    let docs_with_field_length = reader.read_le_long();
    assert_gt!(docs_with_field_length, 0);
    let jump_table_entry_count = reader.read_le_short();
    assert_ge!(jump_table_entry_count, 0);
    let dense_rank_power = reader.read_byte();
    assert_eq!(dense_rank_power, 9);
    // numDocsWithField must equal the two populated docs.
    assert_eq!(reader.read_le_int(), 2);
}
#[test]
fn test_sparse_sorted_field() {
    // Docs 0 and 3 of 5 have a value; a SORTED entry also begins with
    // DISI docs-with-field metadata, which must indicate a real structure.
    let fields = vec![make_field_data_sorted(
        "category",
        0,
        vec![sv(0, b"alpha".to_vec()), sv(3, b"beta".to_vec())],
    )];
    let directory = make_test_directory();
    let names = test_write(&directory, "_0", "", &[0u8; 16], &fields, 5).unwrap();
    let dvm = directory.read_file(&names[0]).unwrap();
    let mut reader = TestDataReader::new(&dvm[index_header_length(META_CODEC, "")..], 0);
    assert_eq!(reader.read_le_int(), 0);
    assert_eq!(reader.read_byte(), SORTED);
    let docs_with_field_offset = reader.read_le_long();
    assert_ge!(docs_with_field_offset, 0);
    let docs_with_field_length = reader.read_le_long();
    assert_gt!(docs_with_field_length, 0);
    let _jump_table_entry_count = reader.read_le_short();
    let dense_rank_power = reader.read_byte();
    assert_eq!(dense_rank_power, 9);
}
#[test]
fn test_sparse_sorted_numeric_field() {
    // Docs 1 and 4 of 5 have values; a SORTED_NUMERIC entry also begins
    // with DISI docs-with-field metadata indicating a real structure.
    let fields = vec![make_field_data_sorted_numeric(
        "counts",
        0,
        vec![snv(1, vec![10, 20]), snv(4, vec![30])],
    )];
    let directory = make_test_directory();
    let names = test_write(&directory, "_0", "", &[0u8; 16], &fields, 5).unwrap();
    let dvm = directory.read_file(&names[0]).unwrap();
    let mut reader = TestDataReader::new(&dvm[index_header_length(META_CODEC, "")..], 0);
    assert_eq!(reader.read_le_int(), 0);
    assert_eq!(reader.read_byte(), SORTED_NUMERIC);
    let docs_with_field_offset = reader.read_le_long();
    assert_ge!(docs_with_field_offset, 0);
    let docs_with_field_length = reader.read_le_long();
    assert_gt!(docs_with_field_length, 0);
    let _jump_table_entry_count = reader.read_le_short();
    let dense_rank_power = reader.read_byte();
    assert_eq!(dense_rank_power, 9);
}
}