use std::collections::{BTreeSet, HashMap};
use std::io;
use log::debug;
use crate::codecs::codec_util;
use crate::codecs::lucene90::indexed_disi;
use crate::codecs::packed_writers::{DirectMonotonicWriter, DirectWriter};
use crate::encoding::lz4::{self, FastHashTable};
use crate::encoding::packed::unsigned_bits_required;
use crate::index::FieldInfos;
use crate::index::index_file_names;
use crate::index::indexing_chain::{DocValuesAccumulator, PerFieldData};
use crate::store::memory::MemoryIndexOutput;
use crate::store::{DataOutput, IndexOutput, SharedDirectory, VecOutput};
use crate::util::BytesRef;
use crate::util::string_helper;
/// File extension of the doc-values data file.
pub(crate) const DATA_EXTENSION: &str = "dvd";
/// File extension of the doc-values metadata file.
pub(crate) const META_EXTENSION: &str = "dvm";
// Codec names written into the index headers of the data/metadata files.
pub(crate) const DATA_CODEC: &str = "Lucene90DocValuesData";
pub(crate) const META_CODEC: &str = "Lucene90DocValuesMetadata";
pub(crate) const VERSION: i32 = 0;
// Per-field doc-values type tags written as a single byte to the metadata stream.
pub(crate) const NUMERIC: u8 = 0;
pub(crate) const BINARY: u8 = 1;
pub(crate) const SORTED: u8 = 2;
pub(crate) const SORTED_SET: u8 = 3;
pub(crate) const SORTED_NUMERIC: u8 = 4;
// Terms-dictionary blocks hold 2^6 = 64 terms each and are LZ4-compressed.
const TERMS_DICT_BLOCK_LZ4_SHIFT: usize = 6;
const TERMS_DICT_BLOCK_LZ4_SIZE: usize = 1 << TERMS_DICT_BLOCK_LZ4_SHIFT;
const TERMS_DICT_BLOCK_LZ4_MASK: usize = TERMS_DICT_BLOCK_LZ4_SIZE - 1;
// The reverse term index samples every 2^10 = 1024th term.
const TERMS_DICT_REVERSE_INDEX_SHIFT: i32 = 10;
const TERMS_DICT_REVERSE_INDEX_SIZE: usize = 1 << TERMS_DICT_REVERSE_INDEX_SHIFT;
const TERMS_DICT_REVERSE_INDEX_MASK: usize = TERMS_DICT_REVERSE_INDEX_SIZE - 1;
/// Block shift used by every `DirectMonotonicWriter` in this file.
pub(crate) const DIRECT_MONOTONIC_BLOCK_SHIFT: u32 = 16;
/// Writes the doc values of a segment in the Lucene90 doc-values format.
///
/// Creates two files in `directory`: the metadata file (`.dvm`) and the data
/// file (`.dvd`), both starting with a codec index header and ending with a
/// codec footer. For every field in `field_infos` that has doc values and an
/// accumulator in `per_field`, one metadata entry is appended: the field
/// number (LE i32), a one-byte type tag, then the type-specific payload
/// written by the corresponding `add_*_field` helper. The entry list is
/// terminated by a field number of -1.
///
/// Returns the names of the two files that were created.
pub fn write(
    directory: &SharedDirectory,
    segment_name: &str,
    segment_suffix: &str,
    segment_id: &[u8; 16],
    field_infos: &FieldInfos,
    per_field: &HashMap<String, PerFieldData>,
    num_docs: i32,
) -> io::Result<Vec<String>> {
    let dvm_name =
        index_file_names::segment_file_name(segment_name, segment_suffix, META_EXTENSION);
    let dvd_name =
        index_file_names::segment_file_name(segment_name, segment_suffix, DATA_EXTENSION);
    // Hold the directory lock only while creating the outputs.
    let (mut meta, mut data) = {
        let mut dir = directory.lock().unwrap();
        (dir.create_output(&dvm_name)?, dir.create_output(&dvd_name)?)
    };
    codec_util::write_index_header(&mut *meta, META_CODEC, VERSION, segment_id, segment_suffix)?;
    codec_util::write_index_header(&mut *data, DATA_CODEC, VERSION, segment_id, segment_suffix)?;
    for fi in field_infos.iter() {
        if !fi.has_doc_values() {
            continue;
        }
        // Fields without accumulated values are skipped entirely (no entry).
        let Some(pfd) = per_field.get(fi.name()) else {
            continue;
        };
        match &pfd.doc_values {
            DocValuesAccumulator::Numeric(vals) => {
                debug!(
                    "doc_values: field={:?} (#{}) -> NUMERIC, {} docs",
                    fi.name(),
                    fi.number(),
                    vals.len()
                );
                meta.write_le_int(fi.number() as i32)?;
                meta.write_byte(NUMERIC)?;
                add_numeric_field(&mut *meta, &mut *data, vals, num_docs)?;
            }
            DocValuesAccumulator::Binary(vals) => {
                debug!(
                    "doc_values: field={:?} (#{}) -> BINARY, {} docs",
                    fi.name(),
                    fi.number(),
                    vals.len()
                );
                meta.write_le_int(fi.number() as i32)?;
                meta.write_byte(BINARY)?;
                add_binary_field(&mut *meta, &mut *data, vals, num_docs)?;
            }
            DocValuesAccumulator::Sorted(vals) => {
                debug!(
                    "doc_values: field={:?} (#{}) -> SORTED, {} docs",
                    fi.name(),
                    fi.number(),
                    vals.len()
                );
                meta.write_le_int(fi.number() as i32)?;
                meta.write_byte(SORTED)?;
                add_sorted_field(&mut *meta, &mut *data, vals, num_docs)?;
            }
            DocValuesAccumulator::SortedNumeric(vals) => {
                debug!(
                    "doc_values: field={:?} (#{}) -> SORTED_NUMERIC, {} docs",
                    fi.name(),
                    fi.number(),
                    vals.len()
                );
                meta.write_le_int(fi.number() as i32)?;
                meta.write_byte(SORTED_NUMERIC)?;
                add_sorted_numeric_field(&mut *meta, &mut *data, vals, num_docs)?;
            }
            DocValuesAccumulator::SortedSet(vals) => {
                debug!(
                    "doc_values: field={:?} (#{}) -> SORTED_SET, {} docs",
                    fi.name(),
                    fi.number(),
                    vals.len()
                );
                meta.write_le_int(fi.number() as i32)?;
                meta.write_byte(SORTED_SET)?;
                add_sorted_set_field(&mut *meta, &mut *data, vals, num_docs)?;
            }
            DocValuesAccumulator::None => continue,
        }
    }
    // EOF marker for the field-entry list, then codec footers on both files.
    meta.write_le_int(-1)?;
    codec_util::write_footer(&mut *meta)?;
    codec_util::write_footer(&mut *data)?;
    Ok(vec![dvm_name, dvd_name])
}
/// Writes a NUMERIC doc-values field: at most one i64 per document.
///
/// Splits the accumulated `(doc_id, value)` pairs into parallel vectors and
/// delegates the actual encoding to `write_values` (ords = false, so table
/// encoding of up to 256 distinct values may be used).
fn add_numeric_field(
    meta: &mut dyn DataOutput,
    data: &mut dyn IndexOutput,
    vals: &[(i32, i64)],
    num_docs: i32,
) -> io::Result<()> {
    let (doc_ids, values): (Vec<i32>, Vec<i64>) = vals.iter().copied().unzip();
    write_values(
        meta,
        data,
        &values,
        &doc_ids,
        vals.len() as i32,
        num_docs,
        false,
    )?;
    Ok(())
}
/// Writes a BINARY doc-values field: at most one byte payload per document.
///
/// Data layout: all payloads concatenated, optionally followed by an
/// IndexedDISI bit set (sparse case) and a DirectMonotonic address table
/// (variable-length case). Metadata layout: data offset/length, the
/// docs-with-field header, num docs with field, min/max payload length, and
/// the address-table offset/length when lengths vary.
fn add_binary_field(
    meta: &mut dyn IndexOutput,
    data: &mut dyn IndexOutput,
    vals: &[(i32, Vec<u8>)],
    num_docs: i32,
) -> io::Result<()> {
    let start = data.file_pointer() as i64;
    meta.write_le_long(start)?;
    let mut min_length = i32::MAX;
    let mut max_length = 0i32;
    // Concatenate all payloads into the data file while tracking length bounds.
    for (_doc_id, bytes) in vals {
        let len = bytes.len() as i32;
        min_length = min_length.min(len);
        max_length = max_length.max(len);
        data.write_bytes(bytes)?;
    }
    let num_docs_with_field = vals.len() as i32;
    if num_docs_with_field == 0 {
        // No payloads seen: min_length was never lowered from i32::MAX.
        min_length = 0;
    }
    meta.write_le_long(data.file_pointer() as i64 - start)?;
    if num_docs_with_field == 0 {
        // Sentinel header: no document has a value (-2 offset, no DISI).
        meta.write_le_long(-2)?; meta.write_le_long(0)?;
        meta.write_le_short(-1)?;
        meta.write_byte(0xFF)?;
    } else if num_docs_with_field == num_docs {
        // Sentinel header: every document has a value (-1 offset, no DISI).
        meta.write_le_long(-1)?; meta.write_le_long(0)?;
        meta.write_le_short(-1)?;
        meta.write_byte(0xFF)?;
    } else {
        // Sparse: write an IndexedDISI bit set of the docs that have a value.
        let doc_ids: Vec<i32> = vals.iter().map(|(doc_id, _)| *doc_id).collect();
        let disi_offset = data.file_pointer() as i64;
        meta.write_le_long(disi_offset)?;
        let jump_table_entry_count = indexed_disi::write_bit_set(&doc_ids, num_docs, &mut *data)?;
        meta.write_le_long(data.file_pointer() as i64 - disi_offset)?;
        meta.write_le_short(jump_table_entry_count)?;
        meta.write_byte(indexed_disi::DEFAULT_DENSE_RANK_POWER as u8)?;
    }
    meta.write_le_int(num_docs_with_field)?;
    meta.write_le_int(min_length)?;
    meta.write_le_int(max_length)?;
    if max_length > min_length {
        // Variable-length payloads need a per-document start-offset table
        // (num_docs_with_field + 1 monotonically increasing entries).
        let addresses_start = data.file_pointer() as i64;
        meta.write_le_long(addresses_start)?;
        meta.write_vint(DIRECT_MONOTONIC_BLOCK_SHIFT as i32)?;
        let mut address_buffer = MemoryIndexOutput::new("temp_binary_addr".to_string());
        let mut dm_writer = DirectMonotonicWriter::new(DIRECT_MONOTONIC_BLOCK_SHIFT);
        let mut cumulative: i64 = 0;
        for (_doc_id, bytes) in vals {
            dm_writer.add(cumulative);
            cumulative += bytes.len() as i64;
        }
        // Trailing entry so the reader can compute the last payload's length.
        dm_writer.add(cumulative);
        dm_writer.finish(meta, &mut address_buffer)?;
        data.write_bytes(address_buffer.bytes())?;
        meta.write_le_long(data.file_pointer() as i64 - addresses_start)?;
    }
    Ok(())
}
/// Writes a SORTED doc-values field: at most one term per document, stored as
/// an ordinal into a sorted dictionary of the distinct terms.
///
/// The per-document ordinals are written via `write_values` (ords = true),
/// followed by the terms dictionary itself.
fn add_sorted_field(
    meta: &mut dyn IndexOutput,
    data: &mut dyn IndexOutput,
    vals: &[(i32, BytesRef)],
    num_docs: i32,
) -> io::Result<()> {
    // Collect the distinct terms; BTreeSet yields them in sorted order.
    let unique: BTreeSet<BytesRef> = vals.iter().map(|(_, term)| term.clone()).collect();
    let sorted_terms: Vec<BytesRef> = unique.into_iter().collect();
    // Map each term to its ordinal (position in sorted order).
    let ord_map: HashMap<BytesRef, i64> = sorted_terms
        .iter()
        .enumerate()
        .map(|(ord, term)| (term.clone(), ord as i64))
        .collect();
    let doc_ids: Vec<i32> = vals.iter().map(|&(doc_id, _)| doc_id).collect();
    let ordinals: Vec<i64> = vals.iter().map(|(_, term)| ord_map[term]).collect();
    write_values(
        meta,
        data,
        &ordinals,
        &doc_ids,
        vals.len() as i32,
        num_docs,
        true,
    )?;
    add_terms_dict(meta, data, &sorted_terms)
}
/// Writes a SORTED_NUMERIC doc-values field: zero or more i64 values per
/// document, returned to readers in ascending order within each document.
///
/// All values are flattened into one stream for `write_values`; when any
/// document holds more than one value, a DirectMonotonic address table maps
/// each document to the start of its value run.
fn add_sorted_numeric_field(
    meta: &mut dyn IndexOutput,
    data: &mut dyn IndexOutput,
    vals: &[(i32, Vec<i64>)],
    num_docs: i32,
) -> io::Result<()> {
    // Sort each document's values ascending; the format stores them sorted.
    let mut sorted_vals: Vec<(i32, Vec<i64>)> = vals.to_vec();
    for (_doc_id, values) in sorted_vals.iter_mut() {
        values.sort();
    }
    let doc_ids: Vec<i32> = sorted_vals.iter().map(|(doc_id, _)| *doc_id).collect();
    let all_values: Vec<i64> = sorted_vals
        .iter()
        .flat_map(|(_doc_id, values)| values.iter().copied())
        .collect();
    let num_docs_with_value = sorted_vals.len() as i32;
    let (num_docs_with_field, num_values) = write_values(
        meta,
        data,
        &all_values,
        &doc_ids,
        num_docs_with_value,
        num_docs,
        false,
    )?;
    meta.write_le_int(num_docs_with_field)?;
    if num_values > num_docs_with_field as i64 {
        // Multi-valued: write per-document start offsets into the value stream.
        let addresses_start = data.file_pointer() as i64;
        meta.write_le_long(addresses_start)?;
        meta.write_vint(DIRECT_MONOTONIC_BLOCK_SHIFT as i32)?;
        let mut address_buffer = MemoryIndexOutput::new("temp_sn_addr".to_string());
        let mut dm_writer = DirectMonotonicWriter::new(DIRECT_MONOTONIC_BLOCK_SHIFT);
        let mut cumulative: i64 = 0;
        for (_doc_id, values) in &sorted_vals {
            dm_writer.add(cumulative);
            cumulative += values.len() as i64;
        }
        // Trailing entry lets the reader compute the last document's run length.
        dm_writer.add(cumulative);
        dm_writer.finish(meta, &mut address_buffer)?;
        data.write_bytes(address_buffer.bytes())?;
        meta.write_le_long(data.file_pointer() as i64 - addresses_start)?;
    }
    Ok(())
}
/// Writes a SORTED_SET doc-values field: zero or more terms per document,
/// stored as ordinals into a shared sorted dictionary.
///
/// A leading byte selects the encoding: 0 when every document has exactly one
/// term (stored like SORTED), 1 for the general multi-valued case (flattened
/// ordinal stream plus a per-document address table). The terms dictionary is
/// written last in both cases.
fn add_sorted_set_field(
    meta: &mut dyn IndexOutput,
    data: &mut dyn IndexOutput,
    vals: &[(i32, Vec<BytesRef>)],
    num_docs: i32,
) -> io::Result<()> {
    let is_single_valued = vals.iter().all(|(_, v)| v.len() == 1);
    // Collect the distinct terms across all documents, in sorted order.
    let mut unique_terms: BTreeSet<BytesRef> = BTreeSet::new();
    for (_doc_id, values) in vals {
        for v in values {
            unique_terms.insert(v.clone());
        }
    }
    // Assign each term its ordinal (position in sorted order).
    let mut ord_map: HashMap<BytesRef, i64> = HashMap::with_capacity(unique_terms.len());
    let sorted_terms: Vec<BytesRef> = unique_terms
        .into_iter()
        .enumerate()
        .map(|(i, term)| {
            ord_map.insert(term.clone(), i as i64);
            term
        })
        .collect();
    if is_single_valued {
        // Degenerate to the SORTED layout (marker byte 0).
        meta.write_byte(0)?;
        let doc_ids: Vec<i32> = vals.iter().map(|(doc_id, _)| *doc_id).collect();
        let ordinals: Vec<i64> = vals
            .iter()
            .map(|(_doc_id, values)| ord_map[&values[0]])
            .collect();
        write_values(
            meta,
            data,
            &ordinals,
            &doc_ids,
            vals.len() as i32,
            num_docs,
            true,
        )?;
        add_terms_dict(meta, data, &sorted_terms)?;
    } else {
        // General multi-valued layout (marker byte 1).
        meta.write_byte(1)?;
        // Per document: ordinals sorted ascending with duplicates removed.
        let ord_vals: Vec<(i32, Vec<i64>)> = vals
            .iter()
            .map(|(doc_id, values)| {
                let mut ords: Vec<i64> = values.iter().map(|v| ord_map[v]).collect();
                ords.sort();
                ords.dedup();
                (*doc_id, ords)
            })
            .collect();
        let doc_ids: Vec<i32> = ord_vals.iter().map(|(doc_id, _)| *doc_id).collect();
        let all_ordinals: Vec<i64> = ord_vals
            .iter()
            .flat_map(|(_doc_id, ords)| ords.iter().copied())
            .collect();
        let num_docs_with_value = ord_vals.len() as i32;
        let (num_docs_with_field, num_values) = write_values(
            meta,
            data,
            &all_ordinals,
            &doc_ids,
            num_docs_with_value,
            num_docs,
            true,
        )?;
        meta.write_le_int(num_docs_with_field)?;
        if num_values > num_docs_with_field as i64 {
            // Map each document to the start of its ordinal run.
            let addresses_start = data.file_pointer() as i64;
            meta.write_le_long(addresses_start)?;
            meta.write_vint(DIRECT_MONOTONIC_BLOCK_SHIFT as i32)?;
            let mut address_buffer = MemoryIndexOutput::new("temp_ss_addr".to_string());
            let mut dm_writer = DirectMonotonicWriter::new(DIRECT_MONOTONIC_BLOCK_SHIFT);
            let mut cumulative: i64 = 0;
            for (_doc_id, ords) in &ord_vals {
                dm_writer.add(cumulative);
                cumulative += ords.len() as i64;
            }
            // Trailing entry so the last run's length is computable.
            dm_writer.add(cumulative);
            dm_writer.finish(meta, &mut address_buffer)?;
            data.write_bytes(address_buffer.bytes())?;
            meta.write_le_long(data.file_pointer() as i64 - addresses_start)?;
        }
        add_terms_dict(meta, data, &sorted_terms)?;
    }
    Ok(())
}
/// Writes one stream of i64 values (numerics or term ordinals) together with
/// its docs-with-value index, and returns `(num_docs_with_value, num_values)`
/// so multi-valued callers can decide whether an address table is needed.
///
/// Metadata written, in order:
/// 1. docs-with-field header — offset/length/jump-table-count/rank-power, with
///    sentinel (-2, 0, -1, 0xFF) for "no docs" and (-1, 0, -1, 0xFF) for
///    "all docs" instead of an IndexedDISI bit set;
/// 2. num_values (i64);
/// 3. table size (i32) — -1 unless table encoding is chosen, in which case
///    the distinct values follow;
/// 4. bits per value (u8), min value (i64), gcd (i64);
/// 5. values offset/length in `data`, then -1 (no value-level jump table).
fn write_values(
    meta: &mut dyn DataOutput,
    data: &mut dyn IndexOutput,
    all_values: &[i64],
    doc_ids: &[i32],
    num_docs_with_value: i32,
    max_doc: i32,
    ords: bool,
) -> io::Result<(i32, i64)> {
    let num_values = all_values.len() as i64;
    if num_docs_with_value == 0 {
        // Sentinel header: no document has a value.
        meta.write_le_long(-2)?;
        meta.write_le_long(0)?;
        meta.write_le_short(-1)?;
        meta.write_byte(0xFF)?;
    } else if num_docs_with_value == max_doc {
        // Sentinel header: every document has a value (dense).
        meta.write_le_long(-1)?;
        meta.write_le_long(0)?;
        meta.write_le_short(-1)?;
        meta.write_byte(0xFF)?;
    } else {
        // Sparse: IndexedDISI bit set of the documents that have a value.
        let offset = data.file_pointer() as i64;
        meta.write_le_long(offset)?;
        let jump_table_entry_count = indexed_disi::write_bit_set(doc_ids, max_doc, &mut *data)?;
        meta.write_le_long(data.file_pointer() as i64 - offset)?;
        meta.write_le_short(jump_table_entry_count)?;
        meta.write_byte(indexed_disi::DEFAULT_DENSE_RANK_POWER as u8)?;
    }
    meta.write_le_long(num_values)?;
    if num_values == 0 {
        // Empty stream: no table, 0 bits per value, min 0, gcd 0, zero-length
        // value slice, no jump table.
        meta.write_le_int(-1)?;
        meta.write_byte(0)?;
        meta.write_le_long(0)?;
        meta.write_le_long(0)?;
        meta.write_le_long(data.file_pointer() as i64)?;
        meta.write_le_long(0)?;
        meta.write_le_long(-1)?;
        return Ok((0, 0));
    }
    let first_value = all_values[0];
    let mut min = all_values[0];
    let mut max = all_values[0];
    let mut gcd: i64 = 0;
    // Track distinct values for table encoding only for plain numerics;
    // ordinals are already dense, so a lookup table can never win.
    let mut unique_values: Option<BTreeSet<i64>> = if ords { None } else { Some(BTreeSet::new()) };
    for &v in all_values {
        min = min.min(v);
        max = max.max(v);
        if gcd != 1 {
            if !(i64::MIN / 2..=i64::MAX / 2).contains(&v) {
                // `v - first_value` could overflow; give up on GCD encoding.
                gcd = 1;
            } else {
                // GCD of the deltas from the first value.
                gcd = gcd_compute(gcd, v - first_value);
            }
        }
        if let Some(ref mut set) = unique_values {
            set.insert(v);
            if set.len() > 256 {
                // Too many distinct values for table encoding; stop tracking.
                unique_values = None;
            }
        }
    }
    if ords && num_values > 0 {
        // Ordinals are assigned densely from 0, so these must hold.
        assert!(min == 0, "min value for ordinals should be 0, got {}", min);
        if max != 0 {
            assert!(gcd == 1, "GCD on ordinals should be 1, got {}", gcd);
        }
    }
    let num_bits_per_value: u32;
    let mut encode_table: Option<HashMap<i64, i64>> = None;
    if min >= max {
        // Single constant value: nothing to bit-pack; readers use `min`.
        num_bits_per_value = 0;
        meta.write_le_int(-1)?;
    } else if let Some(ref uv) = unique_values {
        if uv.len() > 1 {
            // Choose the cheaper of table encoding vs delta/GCD encoding.
            let table_bpv = unsigned_bits_required(uv.len() as i64 - 1);
            let delta_bpv = unsigned_bits_required((max - min) / gcd);
            if table_bpv < delta_bpv {
                num_bits_per_value = table_bpv;
                let sorted_unique: Vec<i64> = uv.iter().copied().collect();
                // Table encoding: write the table, then pack table indices.
                meta.write_le_int(sorted_unique.len() as i32)?;
                for &v in &sorted_unique {
                    meta.write_le_long(v)?;
                }
                let mut enc = HashMap::new();
                for (i, &v) in sorted_unique.iter().enumerate() {
                    enc.insert(v, i as i64);
                }
                encode_table = Some(enc);
                // Indices are written directly; neutralize min/gcd for readers.
                min = 0;
                gcd = 1;
            } else {
                num_bits_per_value = delta_bpv;
                meta.write_le_int(-1)?;
                // If offsetting by min saves no bits, store raw values
                // (min = 0) so readers can skip the subtraction.
                if gcd == 1
                    && min > 0
                    && unsigned_bits_required(max) == unsigned_bits_required(max - min)
                {
                    min = 0;
                }
            }
        } else {
            num_bits_per_value = unsigned_bits_required((max - min) / gcd);
            meta.write_le_int(-1)?;
        }
    } else {
        num_bits_per_value = unsigned_bits_required((max - min) / gcd);
        meta.write_le_int(-1)?;
        // Same raw-value shortcut as above when the offset saves no bits.
        if gcd == 1 && min > 0 && unsigned_bits_required(max) == unsigned_bits_required(max - min) {
            min = 0;
        }
    }
    meta.write_byte(num_bits_per_value as u8)?;
    meta.write_le_long(min)?;
    meta.write_le_long(gcd)?;
    let start_offset = data.file_pointer() as i64;
    meta.write_le_long(start_offset)?;
    if num_bits_per_value > 0 {
        // Bit-pack either table indices or normalized (v - min) / gcd deltas.
        let mut writer = DirectWriter::new(num_bits_per_value);
        if let Some(ref enc) = encode_table {
            for &v in all_values {
                writer.add(enc[&v]);
            }
        } else {
            for &v in all_values {
                writer.add((v - min) / gcd);
            }
        }
        writer.finish(data)?;
    }
    let values_length = data.file_pointer() as i64 - start_offset;
    meta.write_le_long(values_length)?;
    // No value-level jump table is written (-1 sentinel).
    meta.write_le_long(-1)?;
    Ok((num_docs_with_value, num_values))
}
/// Computes the greatest common divisor of `a` and `b` via the binary
/// (Stein) algorithm on unsigned magnitudes.
///
/// `unsigned_abs` maps `i64::MIN` to its true magnitude (2^63), so negative
/// inputs — including `i64::MIN`, which has no i64 negation — are handled
/// without overflow. The previous signed implementation left `i64::MIN`
/// negative after `wrapping_neg`, so `a >>= a.trailing_zeros()` (an
/// arithmetic shift) collapsed it to -1 and the subtraction loop could cycle
/// forever (e.g. for inputs `(i64::MIN, 6)`).
///
/// Returns 0 only when both inputs are 0. Callers in `write_values` keep
/// operands within `[i64::MIN/2, i64::MAX/2]`, so the result always fits in
/// a non-negative i64.
fn gcd_compute(a: i64, b: i64) -> i64 {
    let mut a = a.unsigned_abs();
    let mut b = b.unsigned_abs();
    if a == 0 {
        return b as i64;
    }
    if b == 0 {
        return a as i64;
    }
    // Largest power of two dividing both operands.
    let shift = (a | b).trailing_zeros();
    a >>= a.trailing_zeros();
    loop {
        b >>= b.trailing_zeros();
        if a == b {
            break;
        }
        // Keep `a` as the smaller odd operand.
        if a > b {
            std::mem::swap(&mut a, &mut b);
        }
        if a == 1 {
            break;
        }
        b -= a;
    }
    (a << shift) as i64
}
/// Writes the terms dictionary for SORTED / SORTED_SET fields.
///
/// Terms are grouped into blocks of `TERMS_DICT_BLOCK_LZ4_SIZE` (64). The
/// first term of each block is stored in full; the rest are prefix-compressed
/// against their predecessor, and the block's suffix bytes are LZ4-compressed
/// using the block's first term as dictionary. A DirectMonotonic table of
/// block start offsets is appended, followed by the reverse term index
/// (`write_terms_index`).
fn add_terms_dict(
    meta: &mut dyn IndexOutput,
    data: &mut dyn IndexOutput,
    sorted_terms: &[BytesRef],
) -> io::Result<()> {
    let size = sorted_terms.len() as i64;
    meta.write_vlong(size)?;
    let block_mask = TERMS_DICT_BLOCK_LZ4_MASK;
    meta.write_le_int(DIRECT_MONOTONIC_BLOCK_SHIFT as i32)?;
    let mut address_buffer = MemoryIndexOutput::new("temp_addr".to_string());
    let mut dm_writer = DirectMonotonicWriter::new(DIRECT_MONOTONIC_BLOCK_SHIFT);
    let mut previous: &[u8] = &[];
    let mut max_length: i32 = 0;
    let mut max_block_length: i32 = 0;
    let start = data.file_pointer() as i64;
    // Pending prefix-compressed suffixes for the current block.
    let mut suffix_buffer: Vec<u8> = Vec::new();
    // First term of the current block, used as the LZ4 dictionary.
    let mut dict_bytes: Vec<u8> = Vec::new();
    let mut lz4_ht = FastHashTable::new();
    for (ord, term) in sorted_terms.iter().enumerate() {
        if (ord & block_mask) == 0 {
            // Block boundary: flush the previous block's suffixes first.
            if ord != 0 {
                let uncompressed_length =
                    compress_and_write_terms_block(data, &dict_bytes, &suffix_buffer, &mut lz4_ht)?;
                max_block_length = max_block_length.max(uncompressed_length as i32);
                suffix_buffer.clear();
            }
            dm_writer.add((data.file_pointer() as i64) - start);
            // Block-leading term is stored whole: vint length + bytes.
            data.write_vint(term.bytes.len() as i32)?;
            data.write_bytes(&term.bytes)?;
            dict_bytes = term.bytes.clone();
        } else {
            // Prefix-compress against the previous term: a header byte packs
            // prefix length (low nibble) and suffix length - 1 (high nibble),
            // each capped at 15 with vint overflow values following.
            let prefix_length = string_helper::bytes_difference(previous, &term.bytes);
            let suffix_length = term.bytes.len() - prefix_length;
            assert!(suffix_length > 0, "duplicate terms in sorted set");
            let byte = (prefix_length.min(15) as u8)
                | ((suffix_length.saturating_sub(1).min(15) as u8) << 4);
            suffix_buffer.push(byte);
            if prefix_length >= 15 {
                VecOutput(&mut suffix_buffer).write_vint((prefix_length - 15) as i32)?;
            }
            if suffix_length >= 16 {
                VecOutput(&mut suffix_buffer).write_vint((suffix_length - 16) as i32)?;
            }
            suffix_buffer.extend_from_slice(&term.bytes[prefix_length..]);
        }
        max_length = max_length.max(term.bytes.len() as i32);
        previous = &term.bytes;
    }
    // Flush the final (possibly partial) block.
    if !suffix_buffer.is_empty() {
        let uncompressed_length =
            compress_and_write_terms_block(data, &dict_bytes, &suffix_buffer, &mut lz4_ht)?;
        max_block_length = max_block_length.max(uncompressed_length as i32);
    }
    dm_writer.finish(meta, &mut address_buffer)?;
    meta.write_le_int(max_length)?;
    meta.write_le_int(max_block_length)?;
    meta.write_le_long(start)?;
    meta.write_le_long(data.file_pointer() as i64 - start)?;
    // Block address table goes into the data file after the term blocks.
    let addr_start = data.file_pointer() as i64;
    data.write_bytes(address_buffer.bytes())?;
    meta.write_le_long(addr_start)?;
    meta.write_le_long(data.file_pointer() as i64 - addr_start)?;
    write_terms_index(meta, data, sorted_terms)?;
    Ok(())
}
/// Compresses one terms-dictionary block and writes it to `data`.
///
/// The block is prefixed with its uncompressed suffix length (vint) so the
/// reader can size its decode buffer. The LZ4 input is the block's leading
/// term (`dict_bytes`, acting as dictionary) followed by the suffix bytes;
/// only the portion after the dictionary is emitted compressed. Returns the
/// uncompressed suffix length.
fn compress_and_write_terms_block(
    data: &mut dyn DataOutput,
    dict_bytes: &[u8],
    suffix_buffer: &[u8],
    ht: &mut FastHashTable,
) -> io::Result<usize> {
    let raw_len = suffix_buffer.len();
    data.write_vint(raw_len as i32)?;
    let mut lz4_input = Vec::with_capacity(dict_bytes.len() + raw_len);
    lz4_input.extend_from_slice(dict_bytes);
    lz4_input.extend_from_slice(suffix_buffer);
    let compressed = lz4::compress_with_dictionary_reuse(&lz4_input, dict_bytes.len(), ht);
    data.write_bytes(&compressed)?;
    Ok(raw_len)
}
/// Writes the reverse term index: every 1024th term's sort key, with a
/// DirectMonotonic table of the cumulative sort-key offsets.
///
/// The sort key of an index term is the shortest prefix that still sorts
/// after the term immediately preceding it, so the reader can binary-search
/// term blocks without loading full terms.
fn write_terms_index(
    meta: &mut dyn IndexOutput,
    data: &mut dyn IndexOutput,
    sorted_terms: &[BytesRef],
) -> io::Result<()> {
    meta.write_le_int(TERMS_DICT_REVERSE_INDEX_SHIFT)?;
    let start = data.file_pointer() as i64;
    let mut address_buffer = MemoryIndexOutput::new("temp_reverse_addr".to_string());
    let mut dm_writer = DirectMonotonicWriter::new(DIRECT_MONOTONIC_BLOCK_SHIFT);
    // Term just before the next sampled term (ord = k * 1024 - 1).
    let mut previous: Option<&[u8]> = None;
    let mut offset: i64 = 0;
    for (ord, term) in sorted_terms.iter().enumerate() {
        if (ord & TERMS_DICT_REVERSE_INDEX_MASK) == 0 {
            // Sampled term: record cumulative offset, then write its sort key.
            dm_writer.add(offset);
            let sort_key_len = if ord == 0 {
                // The first term needs no key; an empty key sorts before all.
                0 } else {
                string_helper::sort_key_length(previous.unwrap(), &term.bytes)
            };
            offset += sort_key_len as i64;
            data.write_bytes(&term.bytes[..sort_key_len])?;
        }
        if (ord & TERMS_DICT_REVERSE_INDEX_MASK) == TERMS_DICT_REVERSE_INDEX_MASK {
            // Remember the term immediately preceding the next sample point.
            previous = Some(&term.bytes);
        }
    }
    // Trailing entry so the last key's length is computable.
    dm_writer.add(offset);
    dm_writer.finish(meta, &mut address_buffer)?;
    meta.write_le_long(start)?;
    meta.write_le_long(data.file_pointer() as i64 - start)?;
    let addr_start = data.file_pointer() as i64;
    data.write_bytes(address_buffer.bytes())?;
    meta.write_le_long(addr_start)?;
    meta.write_le_long(data.file_pointer() as i64 - addr_start)?;
    Ok(())
}
#[cfg(test)]
mod tests {
use super::*;
use crate::codecs::codec_util::{FOOTER_LENGTH, index_header_length};
use crate::document::{DocValuesType, IndexOptions};
use crate::index::indexing_chain::{DocValuesAccumulator, PerFieldData};
use crate::index::{FieldInfo, FieldInfos};
use crate::store::{MemoryDirectory, SharedDirectory};
use crate::test_util::{self, TestDataReader};
use assertables::{assert_ge, assert_gt};
use std::collections::HashMap;
/// Creates a fresh in-memory directory for the writer tests.
fn make_test_directory() -> SharedDirectory {
    let backing = MemoryDirectory::new();
    SharedDirectory::new(Box::new(backing))
}
/// Shorthand for a stored, non-indexed `FieldInfo` with the given doc-values type.
fn make_field_info(name: &str, number: u32, dv_type: DocValuesType) -> FieldInfo {
    let stored = true;
    test_util::make_field_info(name, number, stored, IndexOptions::None, dv_type)
}
/// Builds test `PerFieldData` carrying SORTED_NUMERIC doc values.
fn make_per_field_data_sorted_numeric(values: Vec<(i32, Vec<i64>)>) -> PerFieldData {
    let mut field_data = PerFieldData::new();
    field_data.doc_values = DocValuesAccumulator::SortedNumeric(values);
    field_data
}
/// Builds test `PerFieldData` carrying SORTED_SET doc values.
fn make_per_field_data_sorted_set(values: Vec<(i32, Vec<BytesRef>)>) -> PerFieldData {
    let mut field_data = PerFieldData::new();
    field_data.doc_values = DocValuesAccumulator::SortedSet(values);
    field_data
}
/// Builds test `PerFieldData` carrying NUMERIC doc values.
fn make_per_field_data_numeric(values: Vec<(i32, i64)>) -> PerFieldData {
    let mut field_data = PerFieldData::new();
    field_data.doc_values = DocValuesAccumulator::Numeric(values);
    field_data
}
/// Builds test `PerFieldData` carrying BINARY doc values.
fn make_per_field_data_binary(values: Vec<(i32, Vec<u8>)>) -> PerFieldData {
    let mut field_data = PerFieldData::new();
    field_data.doc_values = DocValuesAccumulator::Binary(values);
    field_data
}
/// Builds test `PerFieldData` carrying SORTED doc values.
fn make_per_field_data_sorted(values: Vec<(i32, BytesRef)>) -> PerFieldData {
    let mut field_data = PerFieldData::new();
    field_data.doc_values = DocValuesAccumulator::Sorted(values);
    field_data
}
// Table-driven check of gcd_compute, including zero and negative operands.
#[test]
fn test_gcd_compute() {
    let cases: [(i64, i64, i64); 7] = [
        (0, 5, 5),
        (5, 0, 5),
        (12, 8, 4),
        (0, 0, 0),
        (7, 7, 7),
        (100, 75, 25),
        (-12, 8, 4),
    ];
    for (a, b, expected) in cases {
        assert_eq!(gcd_compute(a, b), expected, "gcd({a}, {b})");
    }
}
// Table-driven check of VecOutput's vint encoding at the one/two-byte boundary.
#[test]
fn test_vec_output_vint() {
    let cases: [(i32, &[u8]); 3] = [(0, &[0]), (127, &[0x7F]), (128, &[0x80, 0x01])];
    for (value, expected) in cases {
        let mut buf = Vec::new();
        VecOutput(&mut buf).write_vint(value).unwrap();
        assert_eq!(buf, expected, "vint encoding of {value}");
    }
}
// A SORTED_NUMERIC field where every document holds the constant 42: the
// metadata entry should use the dense docs-with-field sentinel, report three
// values, skip table encoding, and pack 0 bits per value with min == 42.
// Also checks the codec header and footer magic bytes of the .dvm file.
#[test]
fn test_sorted_numeric_constant() {
    let fi = make_field_info("modified", 1, DocValuesType::SortedNumeric);
    let field_infos = FieldInfos::new(vec![fi]);
    let mut per_field = HashMap::new();
    per_field.insert(
        "modified".to_string(),
        make_per_field_data_sorted_numeric(vec![(0, vec![42]), (1, vec![42]), (2, vec![42])]),
    );
    let segment_id = [0u8; 16];
    let directory = make_test_directory();
    let result = write(
        &directory,
        "_0",
        "Lucene90_0",
        &segment_id,
        &field_infos,
        &per_field,
        3,
    )
    .unwrap();
    assert_len_eq_x!(&result, 2);
    assert_eq!(result[0], "_0_Lucene90_0.dvm");
    assert_eq!(result[1], "_0_Lucene90_0.dvd");
    let dvm = directory.lock().unwrap().read_file(&result[0]).unwrap();
    // Codec header magic.
    assert_eq!(&dvm[0..4], &[0x3f, 0xd7, 0x6c, 0x17]);
    let meta_header_len = index_header_length(META_CODEC, "Lucene90_0");
    let entry = &dvm[meta_header_len..];
    // Field number and type tag.
    assert_eq!(&entry[0..4], &1i32.to_le_bytes());
    assert_eq!(entry[4], SORTED_NUMERIC);
    // Dense docs-with-field sentinel: offset -1, length 0, jump count -1, 0xFF.
    assert_eq!(&entry[5..13], &(-1i64).to_le_bytes());
    assert_eq!(&entry[13..21], &0i64.to_le_bytes());
    assert_eq!(&entry[21..23], &(-1i16).to_le_bytes());
    assert_eq!(entry[23], 0xFF);
    // num_values == 3, no table (-1), 0 bits per value, min value == 42.
    assert_eq!(&entry[24..32], &3i64.to_le_bytes());
    assert_eq!(&entry[32..36], &(-1i32).to_le_bytes());
    assert_eq!(entry[36], 0);
    assert_eq!(&entry[37..45], &42i64.to_le_bytes());
    // Codec footer magic.
    let footer_start = dvm.len() - FOOTER_LENGTH;
    assert_eq!(
        &dvm[footer_start..footer_start + 4],
        &[0xc0, 0x28, 0x93, 0xe8]
    );
}
// Three distinct values 1000/2000/3000: delta encoding should detect a GCD of
// 1000 (the differences from the first value are multiples of 1000).
#[test]
fn test_sorted_numeric_different() {
    let fi = make_field_info("modified", 1, DocValuesType::SortedNumeric);
    let field_infos = FieldInfos::new(vec![fi]);
    let mut per_field = HashMap::new();
    per_field.insert(
        "modified".to_string(),
        make_per_field_data_sorted_numeric(vec![
            (0, vec![1000]),
            (1, vec![2000]),
            (2, vec![3000]),
        ]),
    );
    let segment_id = [0u8; 16];
    let directory = make_test_directory();
    let result = write(
        &directory,
        "_0",
        "Lucene90_0",
        &segment_id,
        &field_infos,
        &per_field,
        3,
    )
    .unwrap();
    let dvm = directory.lock().unwrap().read_file(&result[0]).unwrap();
    let dvd = directory.lock().unwrap().read_file(&result[1]).unwrap();
    assert_not_empty!(dvm);
    assert_not_empty!(dvd);
    let meta_header_len = index_header_length(META_CODEC, "Lucene90_0");
    let entry = &dvm[meta_header_len..];
    assert_eq!(&entry[0..4], &1i32.to_le_bytes());
    assert_eq!(entry[4], SORTED_NUMERIC);
    // num_values == 3.
    assert_eq!(&entry[24..32], &3i64.to_le_bytes());
    // Bits-per-value is at entry[36], min at entry[37..45]; the GCD follows.
    let gcd_offset = 37 + 8; let gcd_val = i64::from_le_bytes(entry[gcd_offset..gcd_offset + 8].try_into().unwrap());
    assert_eq!(gcd_val, 1000);
}
// A SORTED_SET field where every document has exactly one term must be
// written with the single-valued marker byte (0) after the type tag, and
// both files must carry the codec header/footer magic.
#[test]
fn test_sorted_set_single_valued() {
    let fi = make_field_info("path", 0, DocValuesType::SortedSet);
    let field_infos = FieldInfos::new(vec![fi]);
    let mut per_field = HashMap::new();
    per_field.insert(
        "path".to_string(),
        make_per_field_data_sorted_set(vec![
            (0, vec![BytesRef::from_utf8("/a.txt")]),
            (1, vec![BytesRef::from_utf8("/b.txt")]),
            (2, vec![BytesRef::from_utf8("/c.txt")]),
        ]),
    );
    let segment_id = [0u8; 16];
    let directory = make_test_directory();
    let result = write(
        &directory,
        "_0",
        "Lucene90_0",
        &segment_id,
        &field_infos,
        &per_field,
        3,
    )
    .unwrap();
    let dvm = directory.lock().unwrap().read_file(&result[0]).unwrap();
    let dvd = directory.lock().unwrap().read_file(&result[1]).unwrap();
    assert_not_empty!(dvm);
    assert_not_empty!(dvd);
    // Codec header magic on both files.
    assert_eq!(&dvm[0..4], &[0x3f, 0xd7, 0x6c, 0x17]);
    assert_eq!(&dvd[0..4], &[0x3f, 0xd7, 0x6c, 0x17]);
    let meta_header_len = index_header_length(META_CODEC, "Lucene90_0");
    let entry = &dvm[meta_header_len..];
    assert_eq!(&entry[0..4], &0i32.to_le_bytes());
    assert_eq!(entry[4], SORTED_SET);
    // Single-valued encoding marker.
    assert_eq!(entry[5], 0);
    let footer_start = dvm.len() - FOOTER_LENGTH;
    assert_eq!(
        &dvm[footer_start..footer_start + 4],
        &[0xc0, 0x28, 0x93, 0xe8]
    );
}
// The metadata file must end with the -1 field-number EOF marker immediately
// before the codec footer.
#[test]
fn test_header_footer_eof() {
    let fi = make_field_info("modified", 0, DocValuesType::SortedNumeric);
    let field_infos = FieldInfos::new(vec![fi]);
    let mut per_field = HashMap::new();
    per_field.insert(
        "modified".to_string(),
        make_per_field_data_sorted_numeric(vec![(0, vec![1])]),
    );
    let segment_id = [0u8; 16];
    let directory = make_test_directory();
    let result = write(
        &directory,
        "_0",
        "",
        &segment_id,
        &field_infos,
        &per_field,
        1,
    )
    .unwrap();
    let dvm = directory.lock().unwrap().read_file(&result[0]).unwrap();
    // The 4-byte EOF marker sits directly before the footer.
    let eof_pos = dvm.len() - FOOTER_LENGTH - 4;
    assert_eq!(&dvm[eof_pos..eof_pos + 4], &(-1i32).to_le_bytes());
}
// Sanity-checks the prefix/suffix header byte used by add_terms_dict:
// low nibble = capped prefix length, high nibble = capped (suffix length - 1).
#[test]
fn test_terms_dict_prefix_compression_byte() {
    let (prefix_len, suffix_len) = (3usize, 5usize);
    let header =
        ((suffix_len.saturating_sub(1).min(15) as u8) << 4) | (prefix_len.min(15) as u8);
    assert_eq!(header, 0x43);
    // A nibble saturates at 15.
    assert_eq!(15u8, 0x0F);
}
// Two fields (SORTED_SET "path" #0 and SORTED_NUMERIC "modified" #1) written
// into the same segment: field entries appear in field-number order and the
// file still ends with the EOF marker.
#[test]
fn test_two_fields_combined() {
    let fi_path = make_field_info("path", 0, DocValuesType::SortedSet);
    let fi_mod = make_field_info("modified", 1, DocValuesType::SortedNumeric);
    let field_infos = FieldInfos::new(vec![fi_path, fi_mod]);
    let mut per_field = HashMap::new();
    per_field.insert(
        "path".to_string(),
        make_per_field_data_sorted_set(vec![
            (0, vec![BytesRef::from_utf8("/a.txt")]),
            (1, vec![BytesRef::from_utf8("/b.txt")]),
            (2, vec![BytesRef::from_utf8("/c.txt")]),
        ]),
    );
    per_field.insert(
        "modified".to_string(),
        make_per_field_data_sorted_numeric(vec![
            (0, vec![1000]),
            (1, vec![2000]),
            (2, vec![3000]),
        ]),
    );
    let segment_id = [0u8; 16];
    let directory = make_test_directory();
    let result = write(
        &directory,
        "_0",
        "Lucene90_0",
        &segment_id,
        &field_infos,
        &per_field,
        3,
    )
    .unwrap();
    assert_len_eq_x!(&result, 2);
    assert_eq!(result[0], "_0_Lucene90_0.dvm");
    assert_eq!(result[1], "_0_Lucene90_0.dvd");
    let dvm = directory.lock().unwrap().read_file(&result[0]).unwrap();
    let meta_header_len = index_header_length(META_CODEC, "Lucene90_0");
    let entry = &dvm[meta_header_len..];
    // First entry is field #0, type SORTED_SET.
    assert_eq!(&entry[0..4], &0i32.to_le_bytes()); assert_eq!(entry[4], SORTED_SET);
    let eof_pos = dvm.len() - FOOTER_LENGTH - 4;
    assert_eq!(&dvm[eof_pos..eof_pos + 4], &(-1i32).to_le_bytes());
}
// Test-only metadata parser that walks a .dvm byte slice in exactly the order
// the writer emits it, mimicking how the Java Lucene reader consumes each
// entry type. Each read_* method mirrors the corresponding add_*_field /
// write_values layout.
struct DvmReader<'a>(TestDataReader<'a>);
impl<'a> DvmReader<'a> {
    fn new(data: &'a [u8], start: usize) -> Self {
        Self(TestDataReader::new(data, start))
    }
    // Consumes one numeric entry (the write_values payload) and returns
    // num_values so callers can size address tables.
    fn read_numeric(&mut self) -> i64 {
        let _docs_with_field_offset = self.0.read_le_long();
        let _docs_with_field_length = self.0.read_le_long();
        let _jump_table_entry_count = self.0.read_le_short();
        let _dense_rank_power = self.0.read_byte();
        let num_values = self.0.read_le_long();
        // table_size >= 0 means table encoding: the table entries follow.
        let table_size = self.0.read_le_int();
        if table_size >= 0 {
            for _ in 0..table_size {
                let _table_val = self.0.read_le_long();
            }
        }
        let _bits_per_value = self.0.read_byte();
        let _min_value = self.0.read_le_long();
        let _gcd = self.0.read_le_long();
        let _values_offset = self.0.read_le_long();
        let _values_length = self.0.read_le_long();
        let _value_jump_table_offset = self.0.read_le_long();
        num_values
    }
    // Skips DirectMonotonicWriter meta: one (min, avg_inc, offset, bits)
    // record per 2^16-value block.
    fn read_dm_meta(&mut self, num_values: i64, _block_shift: i32) {
        let num_blocks = (num_values + (1 << 16) - 1) / (1 << 16);
        for _ in 0..num_blocks {
            let _min = self.0.read_le_long();
            let _avg_inc = self.0.read_le_int(); let _offset = self.0.read_le_long();
            let _bits_required = self.0.read_byte();
        }
    }
    // Consumes the terms-dictionary metadata written by add_terms_dict,
    // including the trailing reverse-index section from write_terms_index.
    fn read_term_dict(&mut self) {
        let terms_dict_size = self.0.read_vlong();
        let block_shift = self.0.read_le_int();
        // One address per 64-term block.
        let addresses_size = (terms_dict_size + (1i64 << TERMS_DICT_BLOCK_LZ4_SHIFT) - 1)
            >> TERMS_DICT_BLOCK_LZ4_SHIFT;
        self.read_dm_meta(addresses_size, block_shift);
        let _max_term_length = self.0.read_le_int();
        let _max_block_length = self.0.read_le_int();
        let _terms_data_offset = self.0.read_le_long();
        let _terms_data_length = self.0.read_le_long();
        let _terms_addresses_offset = self.0.read_le_long();
        let _terms_addresses_length = self.0.read_le_long();
        // Reverse index: one sampled entry per 1024 terms, plus a trailing one.
        let terms_dict_index_shift = self.0.read_le_int();
        let index_size =
            (terms_dict_size + (1i64 << terms_dict_index_shift) - 1) >> terms_dict_index_shift;
        self.read_dm_meta(1 + index_size, block_shift);
        let _terms_index_offset = self.0.read_le_long();
        let _terms_index_length = self.0.read_le_long();
        let _terms_index_addresses_offset = self.0.read_le_long();
        let _terms_index_addresses_length = self.0.read_le_long();
    }
    // SORTED = ordinal stream + terms dictionary.
    fn read_sorted(&mut self) {
        let _ = self.read_numeric();
        self.read_term_dict();
    }
    // BINARY = data slice + docs-with-field header + lengths (+ addresses
    // only when payload lengths vary).
    fn read_binary(&mut self) {
        let _data_offset = self.0.read_le_long();
        let _data_length = self.0.read_le_long();
        let _docs_with_field_offset = self.0.read_le_long();
        let _docs_with_field_length = self.0.read_le_long();
        let _jump_table_entry_count = self.0.read_le_short();
        let _dense_rank_power = self.0.read_byte();
        let num_docs_with_field = self.0.read_le_int();
        let min_length = self.0.read_le_int();
        let max_length = self.0.read_le_int();
        if max_length > min_length {
            let _addresses_offset = self.0.read_le_long();
            let block_shift = self.0.read_vint();
            let num_addresses = num_docs_with_field as i64 + 1;
            self.read_dm_meta(num_addresses, block_shift);
            let _addresses_length = self.0.read_le_long();
        }
    }
    // SORTED_SET = marker byte, then either the SORTED layout (0) or the
    // SORTED_NUMERIC layout (1), followed by the terms dictionary.
    fn read_sorted_set(&mut self) {
        let multi_valued = self.0.read_byte();
        if multi_valued == 0 {
            let _ = self.read_numeric();
            self.read_term_dict();
        } else {
            self.read_sorted_numeric();
            self.read_term_dict();
        }
    }
    // SORTED_NUMERIC = value stream + doc count (+ addresses only when some
    // document has more than one value).
    fn read_sorted_numeric(&mut self) {
        let num_values = self.read_numeric();
        let num_docs_with_field = self.0.read_le_int();
        if num_values > num_docs_with_field as i64 {
            let _addresses_offset = self.0.read_le_long();
            let block_shift = self.0.read_vint();
            let num_addresses = num_docs_with_field as i64 + 1;
            self.read_dm_meta(num_addresses, block_shift);
            let _addresses_length = self.0.read_le_long();
        }
    }
}
// End-to-end structural check: parse the written .dvm with DvmReader in the
// same order a Java Lucene reader would, and verify both field entries and
// the EOF marker line up with no bytes left over or missing.
#[test]
fn test_two_fields_dvm_parseable_like_java() {
    let fi_path = make_field_info("path", 0, DocValuesType::SortedSet);
    let fi_mod = make_field_info("modified", 1, DocValuesType::SortedNumeric);
    let field_infos = FieldInfos::new(vec![fi_path, fi_mod]);
    let mut per_field = HashMap::new();
    per_field.insert(
        "path".to_string(),
        make_per_field_data_sorted_set(vec![
            (0, vec![BytesRef::from_utf8("/a.txt")]),
            (1, vec![BytesRef::from_utf8("/b.txt")]),
            (2, vec![BytesRef::from_utf8("/c.txt")]),
        ]),
    );
    per_field.insert(
        "modified".to_string(),
        make_per_field_data_sorted_numeric(vec![
            (0, vec![1000]),
            (1, vec![2000]),
            (2, vec![3000]),
        ]),
    );
    let segment_id = [0u8; 16];
    let directory = make_test_directory();
    let result = write(
        &directory,
        "_0",
        "Lucene90_0",
        &segment_id,
        &field_infos,
        &per_field,
        3,
    )
    .unwrap();
    let dvm = directory.lock().unwrap().read_file(&result[0]).unwrap();
    let meta_header_len = index_header_length(META_CODEC, "Lucene90_0");
    let mut reader = DvmReader::new(&dvm, meta_header_len);
    let field0 = reader.0.read_le_int();
    assert_eq!(field0, 0, "first field number should be 0");
    let type0 = reader.0.read_byte();
    assert_eq!(type0, SORTED_SET, "first field type should be SORTED_SET");
    reader.read_sorted_set();
    // If the writer emitted any stray bytes, this read desynchronizes.
    let field1 = reader.0.read_le_int();
    assert_eq!(
        field1, 1,
        "second field number should be 1 (got {field1} — extra bytes written for SORTED_SET?)"
    );
    let type1 = reader.0.read_byte();
    assert_eq!(
        type1, SORTED_NUMERIC,
        "second field type should be SORTED_NUMERIC"
    );
    reader.read_sorted_numeric();
    let eof = reader.0.read_le_int();
    assert_eq!(eof, -1, "expected EOF marker (-1)");
}
/// Values 100/200/300 share a common divisor; the writer must still succeed
/// and produce exactly two output files (.dvm and .dvd).
#[test]
fn test_sorted_numeric_with_gcd() {
    let field_infos = FieldInfos::new(vec![make_field_info(
        "field",
        0,
        DocValuesType::SortedNumeric,
    )]);
    let mut per_field = HashMap::new();
    per_field.insert(
        "field".to_string(),
        make_per_field_data_sorted_numeric(vec![
            (0, vec![100]),
            (1, vec![200]),
            (2, vec![300]),
        ]),
    );
    let directory = make_test_directory();
    let result = write(
        &directory,
        "_0",
        "",
        &[0u8; 16],
        &field_infos,
        &per_field,
        3,
    )
    .unwrap();
    assert_len_eq_x!(&result, 2);
}
/// Every document stores the same single term, so the ordinal dictionary
/// collapses to one entry; writing must still emit both output files.
#[test]
fn test_sorted_set_identical_paths() {
    let field_infos =
        FieldInfos::new(vec![make_field_info("path", 0, DocValuesType::SortedSet)]);
    let mut per_field = HashMap::new();
    per_field.insert(
        "path".to_string(),
        make_per_field_data_sorted_set(vec![
            (0, vec![BytesRef::from_utf8("/same.txt")]),
            (1, vec![BytesRef::from_utf8("/same.txt")]),
            (2, vec![BytesRef::from_utf8("/same.txt")]),
        ]),
    );
    let directory = make_test_directory();
    let result = write(
        &directory,
        "_0",
        "",
        &[0u8; 16],
        &field_infos,
        &per_field,
        3,
    )
    .unwrap();
    assert_len_eq_x!(&result, 2);
}
/// A NUMERIC field where every doc holds the same value (42). Checks the raw
/// byte layout of the metadata entry after the codec header.
#[test]
fn test_numeric_constant() {
    let fi = make_field_info("count", 0, DocValuesType::Numeric);
    let field_infos = FieldInfos::new(vec![fi]);
    let mut per_field = HashMap::new();
    per_field.insert(
        "count".to_string(),
        make_per_field_data_numeric(vec![(0, 42), (1, 42), (2, 42)]),
    );
    let segment_id = [0u8; 16];
    let directory = make_test_directory();
    let result = write(
        &directory,
        "_0",
        "Lucene90_0",
        &segment_id,
        &field_infos,
        &per_field,
        3,
    )
    .unwrap();
    let dvm = directory.lock().unwrap().read_file(&result[0]).unwrap();
    let meta_header_len = index_header_length(META_CODEC, "Lucene90_0");
    let entry = &dvm[meta_header_len..];
    // [0..4]  field number, [4] type byte.
    assert_eq!(&entry[0..4], &0i32.to_le_bytes());
    assert_eq!(entry[4], NUMERIC);
    // [5..13] docsWithFieldOffset = -1, i.e. the field is dense (all docs
    // have a value), so no DISI bitset is written.
    assert_eq!(&entry[5..13], &(-1i64).to_le_bytes());
    // [24..32] numValues = 3 (offset follows 8-byte length + 2-byte jump
    // table count + 1-byte dense rank power after [13]).
    assert_eq!(&entry[24..32], &3i64.to_le_bytes());
    // [32..36] presumably the table-size marker, -1 meaning no table
    // encoding (Lucene90 convention — confirm against the writer).
    assert_eq!(&entry[32..36], &(-1i32).to_le_bytes());
    // [36] bits per value = 0: all values identical, so only the constant
    // itself is stored next.
    assert_eq!(entry[36], 0);
    // [37..45] the constant value (42).
    assert_eq!(&entry[37..45], &42i64.to_le_bytes());
}
/// Three docs with distinct NUMERIC values: both the metadata and data files
/// must be written and be non-empty.
#[test]
fn test_numeric_different_values() {
    let field_infos =
        FieldInfos::new(vec![make_field_info("score", 0, DocValuesType::Numeric)]);
    let mut per_field = HashMap::new();
    per_field.insert(
        "score".to_string(),
        make_per_field_data_numeric(vec![(0, 10), (1, 20), (2, 30)]),
    );
    let directory = make_test_directory();
    let result = write(
        &directory,
        "_0",
        "",
        &[0u8; 16],
        &field_infos,
        &per_field,
        3,
    )
    .unwrap();
    let dvm = directory.lock().unwrap().read_file(&result[0]).unwrap();
    let dvd = directory.lock().unwrap().read_file(&result[1]).unwrap();
    assert_not_empty!(dvm);
    assert_not_empty!(dvd);
}
/// Regression guard: a NUMERIC entry must end exactly where `read_numeric`
/// stops — i.e. NUMERIC does not write a trailing numDocsWithField int the
/// way SORTED_NUMERIC does. If it did, the EOF read below would desync.
#[test]
fn test_numeric_no_num_docs_with_field() {
    let fi = make_field_info("count", 0, DocValuesType::Numeric);
    let field_infos = FieldInfos::new(vec![fi]);
    let mut per_field = HashMap::new();
    per_field.insert(
        "count".to_string(),
        make_per_field_data_numeric(vec![(0, 100), (1, 200), (2, 300)]),
    );
    let segment_id = [0u8; 16];
    let directory = make_test_directory();
    let result = write(
        &directory,
        "_0",
        "Lucene90_0",
        &segment_id,
        &field_infos,
        &per_field,
        3,
    )
    .unwrap();
    let dvm = directory.lock().unwrap().read_file(&result[0]).unwrap();
    let meta_header_len = index_header_length(META_CODEC, "Lucene90_0");
    let mut reader = DvmReader::new(&dvm, meta_header_len);
    let field0 = reader.0.read_le_int();
    assert_eq!(field0, 0);
    let type0 = reader.0.read_byte();
    assert_eq!(type0, NUMERIC);
    reader.read_numeric();
    // The very next int must be the -1 end-of-fields marker.
    let eof = reader.0.read_le_int();
    assert_eq!(
        eof, -1,
        "expected EOF marker — NUMERIC should not write numDocsWithField"
    );
}
/// A SORTED metadata entry must begin with the field number followed by the
/// SORTED type byte.
#[test]
fn test_sorted_field() {
    let field_infos =
        FieldInfos::new(vec![make_field_info("category", 0, DocValuesType::Sorted)]);
    let mut per_field = HashMap::new();
    per_field.insert(
        "category".to_string(),
        make_per_field_data_sorted(vec![
            (0, BytesRef::from_utf8("alpha")),
            (1, BytesRef::from_utf8("beta")),
            (2, BytesRef::from_utf8("alpha")),
        ]),
    );
    let directory = make_test_directory();
    let result = write(
        &directory,
        "_0",
        "Lucene90_0",
        &[0u8; 16],
        &field_infos,
        &per_field,
        3,
    )
    .unwrap();
    let dvm = directory.lock().unwrap().read_file(&result[0]).unwrap();
    let entry = &dvm[index_header_length(META_CODEC, "Lucene90_0")..];
    assert_eq!(&entry[0..4], &0i32.to_le_bytes());
    assert_eq!(entry[4], SORTED);
}
/// Walks a complete SORTED metadata entry and expects the -1 EOF marker
/// immediately afterwards.
#[test]
fn test_sorted_parseable() {
    let field_infos =
        FieldInfos::new(vec![make_field_info("category", 0, DocValuesType::Sorted)]);
    let mut per_field = HashMap::new();
    per_field.insert(
        "category".to_string(),
        make_per_field_data_sorted(vec![
            (0, BytesRef::from_utf8("x")),
            (1, BytesRef::from_utf8("y")),
            (2, BytesRef::from_utf8("z")),
        ]),
    );
    let directory = make_test_directory();
    let result = write(
        &directory,
        "_0",
        "Lucene90_0",
        &[0u8; 16],
        &field_infos,
        &per_field,
        3,
    )
    .unwrap();
    let dvm = directory.lock().unwrap().read_file(&result[0]).unwrap();
    let mut reader = DvmReader::new(&dvm, index_header_length(META_CODEC, "Lucene90_0"));
    assert_eq!(reader.0.read_le_int(), 0);
    assert_eq!(reader.0.read_byte(), SORTED);
    reader.read_sorted();
    assert_eq!(reader.0.read_le_int(), -1, "expected EOF marker");
}
/// Fixed-length (4-byte) binary values: the metadata entry must start with
/// the field number and the BINARY type byte.
#[test]
fn test_binary_fixed_length() {
    let field_infos =
        FieldInfos::new(vec![make_field_info("hash", 0, DocValuesType::Binary)]);
    let mut per_field = HashMap::new();
    per_field.insert(
        "hash".to_string(),
        make_per_field_data_binary(vec![
            (0, vec![0xAA, 0xBB, 0xCC, 0xDD]),
            (1, vec![0x11, 0x22, 0x33, 0x44]),
            (2, vec![0xFF, 0xEE, 0xDD, 0xCC]),
        ]),
    );
    let directory = make_test_directory();
    let result = write(
        &directory,
        "_0",
        "Lucene90_0",
        &[0u8; 16],
        &field_infos,
        &per_field,
        3,
    )
    .unwrap();
    let dvm = directory.lock().unwrap().read_file(&result[0]).unwrap();
    let entry = &dvm[index_header_length(META_CODEC, "Lucene90_0")..];
    assert_eq!(&entry[0..4], &0i32.to_le_bytes());
    assert_eq!(entry[4], BINARY);
}
/// Walks a complete BINARY metadata entry (fixed-length values) and expects
/// the -1 EOF marker immediately afterwards.
#[test]
fn test_binary_parseable() {
    let fi = make_field_info("data", 0, DocValuesType::Binary);
    let field_infos = FieldInfos::new(vec![fi]);
    let mut per_field = HashMap::new();
    per_field.insert(
        "data".to_string(),
        make_per_field_data_binary(vec![
            (0, vec![1, 2, 3]),
            (1, vec![4, 5, 6]),
            (2, vec![7, 8, 9]),
        ]),
    );
    let segment_id = [0u8; 16];
    let directory = make_test_directory();
    let result = write(
        &directory,
        "_0",
        "Lucene90_0",
        &segment_id,
        &field_infos,
        &per_field,
        3,
    )
    .unwrap();
    let dvm = directory.lock().unwrap().read_file(&result[0]).unwrap();
    let meta_header_len = index_header_length(META_CODEC, "Lucene90_0");
    let mut reader = DvmReader::new(&dvm, meta_header_len);
    let field0 = reader.0.read_le_int();
    assert_eq!(field0, 0);
    let type0 = reader.0.read_byte();
    assert_eq!(type0, BINARY);
    reader.read_binary();
    let eof = reader.0.read_le_int();
    assert_eq!(eof, -1, "expected EOF marker");
}
/// Same walk as the fixed-length case, but with values of lengths 1, 3 and 2
/// so the variable-length encoding path (per-doc addresses) is exercised.
#[test]
fn test_binary_variable_length() {
    let fi = make_field_info("payload", 0, DocValuesType::Binary);
    let field_infos = FieldInfos::new(vec![fi]);
    let mut per_field = HashMap::new();
    per_field.insert(
        "payload".to_string(),
        make_per_field_data_binary(vec![(0, vec![1]), (1, vec![2, 3, 4]), (2, vec![5, 6])]),
    );
    let segment_id = [0u8; 16];
    let directory = make_test_directory();
    let result = write(
        &directory,
        "_0",
        "Lucene90_0",
        &segment_id,
        &field_infos,
        &per_field,
        3,
    )
    .unwrap();
    let dvm = directory.lock().unwrap().read_file(&result[0]).unwrap();
    let meta_header_len = index_header_length(META_CODEC, "Lucene90_0");
    let mut reader = DvmReader::new(&dvm, meta_header_len);
    let field0 = reader.0.read_le_int();
    assert_eq!(field0, 0);
    let type0 = reader.0.read_byte();
    assert_eq!(type0, BINARY);
    reader.read_binary();
    let eof = reader.0.read_le_int();
    assert_eq!(
        eof, -1,
        "expected EOF marker — variable-length binary should be parseable"
    );
}
/// One field of every doc-values type in a single segment. Entries must
/// appear in field-number order and each must be fully consumable by the
/// matching `read_*` walker, ending at the -1 EOF marker.
#[test]
fn test_all_dv_types_combined() {
    let fi0 = make_field_info("num", 0, DocValuesType::Numeric);
    let fi1 = make_field_info("bin", 1, DocValuesType::Binary);
    let fi2 = make_field_info("sort", 2, DocValuesType::Sorted);
    let fi3 = make_field_info("sortset", 3, DocValuesType::SortedSet);
    let fi4 = make_field_info("sortnum", 4, DocValuesType::SortedNumeric);
    let field_infos = FieldInfos::new(vec![fi0, fi1, fi2, fi3, fi4]);
    let mut per_field = HashMap::new();
    per_field.insert(
        "num".to_string(),
        make_per_field_data_numeric(vec![(0, 10), (1, 20), (2, 30)]),
    );
    per_field.insert(
        "bin".to_string(),
        make_per_field_data_binary(vec![(0, vec![0xAA]), (1, vec![0xBB]), (2, vec![0xCC])]),
    );
    per_field.insert(
        "sort".to_string(),
        make_per_field_data_sorted(vec![
            (0, BytesRef::from_utf8("a")),
            (1, BytesRef::from_utf8("b")),
            (2, BytesRef::from_utf8("c")),
        ]),
    );
    per_field.insert(
        "sortset".to_string(),
        make_per_field_data_sorted_set(vec![
            (0, vec![BytesRef::from_utf8("x")]),
            (1, vec![BytesRef::from_utf8("y")]),
            (2, vec![BytesRef::from_utf8("z")]),
        ]),
    );
    per_field.insert(
        "sortnum".to_string(),
        make_per_field_data_sorted_numeric(vec![
            (0, vec![100]),
            (1, vec![200]),
            (2, vec![300]),
        ]),
    );
    let segment_id = [0u8; 16];
    let directory = make_test_directory();
    let result = write(
        &directory,
        "_0",
        "Lucene90_0",
        &segment_id,
        &field_infos,
        &per_field,
        3,
    )
    .unwrap();
    let dvm = directory.lock().unwrap().read_file(&result[0]).unwrap();
    let meta_header_len = index_header_length(META_CODEC, "Lucene90_0");
    let mut reader = DvmReader::new(&dvm, meta_header_len);
    // Entries are expected in ascending field-number order: 0..=4.
    assert_eq!(reader.0.read_le_int(), 0);
    assert_eq!(reader.0.read_byte(), NUMERIC);
    reader.read_numeric();
    assert_eq!(reader.0.read_le_int(), 1);
    assert_eq!(reader.0.read_byte(), BINARY);
    reader.read_binary();
    assert_eq!(reader.0.read_le_int(), 2);
    assert_eq!(reader.0.read_byte(), SORTED);
    reader.read_sorted();
    assert_eq!(reader.0.read_le_int(), 3);
    assert_eq!(reader.0.read_byte(), SORTED_SET);
    reader.read_sorted_set();
    assert_eq!(reader.0.read_le_int(), 4);
    assert_eq!(reader.0.read_byte(), SORTED_NUMERIC);
    reader.read_sorted_numeric();
    assert_eq!(reader.0.read_le_int(), -1);
}
/// Docs carrying 1, 2 and 3 values force the multi-valued (addressed)
/// SORTED_NUMERIC layout; the entry must still parse through to EOF.
#[test]
fn test_sorted_numeric_multi_valued() {
    let field_infos = FieldInfos::new(vec![make_field_info(
        "tags",
        0,
        DocValuesType::SortedNumeric,
    )]);
    let mut per_field = HashMap::new();
    per_field.insert(
        "tags".to_string(),
        make_per_field_data_sorted_numeric(vec![
            (0, vec![100]),
            (1, vec![300, 200]),
            (2, vec![600, 400, 500]),
        ]),
    );
    let directory = make_test_directory();
    let result = write(
        &directory,
        "_0",
        "Lucene90_0",
        &[0u8; 16],
        &field_infos,
        &per_field,
        3,
    )
    .unwrap();
    assert_len_eq_x!(&result, 2);
    let dvm = directory.lock().unwrap().read_file(&result[0]).unwrap();
    let mut reader = DvmReader::new(&dvm, index_header_length(META_CODEC, "Lucene90_0"));
    assert_eq!(reader.0.read_le_int(), 0);
    assert_eq!(reader.0.read_byte(), SORTED_NUMERIC);
    reader.read_sorted_numeric();
    assert_eq!(reader.0.read_le_int(), -1);
}
/// Multi-valued SORTED_NUMERIC: verifies the numValues long in the metadata
/// entry counts every value across all docs (1 + 2 + 3 = 6).
#[test]
fn test_sorted_numeric_multi_valued_values_sorted() {
    let fi = make_field_info("nums", 0, DocValuesType::SortedNumeric);
    let field_infos = FieldInfos::new(vec![fi]);
    let mut per_field = HashMap::new();
    per_field.insert(
        "nums".to_string(),
        make_per_field_data_sorted_numeric(vec![
            (0, vec![100]),
            (1, vec![300, 200]),
            (2, vec![600, 400, 500]),
        ]),
    );
    let segment_id = [0u8; 16];
    let directory = make_test_directory();
    let result = write(
        &directory,
        "_0",
        "Lucene90_0",
        &segment_id,
        &field_infos,
        &per_field,
        3,
    )
    .unwrap();
    let dvm = directory.lock().unwrap().read_file(&result[0]).unwrap();
    let meta_header_len = index_header_length(META_CODEC, "Lucene90_0");
    let entry = &dvm[meta_header_len..];
    // Offset 24 = 4 (field) + 1 (type) + 8 (docsWithFieldOffset)
    //            + 8 (docsWithFieldLength) + 2 (jumpTableEntryCount)
    //            + 1 (denseRankPower); the next 8 bytes are numValues.
    let num_values = i64::from_le_bytes(entry[24..32].try_into().unwrap());
    assert_eq!(num_values, 6, "total values across all docs should be 6");
}
/// SORTED_SET followed by a multi-valued SORTED_NUMERIC field: the second
/// entry must start exactly where the first walker stopped, and the stream
/// must end with the -1 EOF marker.
#[test]
fn test_two_fields_multi_valued_sorted_numeric_dvm_parseable() {
    let fi_path = make_field_info("path", 0, DocValuesType::SortedSet);
    let fi_counts = make_field_info("counts", 1, DocValuesType::SortedNumeric);
    let field_infos = FieldInfos::new(vec![fi_path, fi_counts]);
    let mut per_field = HashMap::new();
    per_field.insert(
        "path".to_string(),
        make_per_field_data_sorted_set(vec![
            (0, vec![BytesRef::from_utf8("/a.txt")]),
            (1, vec![BytesRef::from_utf8("/b.txt")]),
            (2, vec![BytesRef::from_utf8("/c.txt")]),
        ]),
    );
    per_field.insert(
        "counts".to_string(),
        make_per_field_data_sorted_numeric(vec![
            (0, vec![10, 20]),
            (1, vec![30]),
            (2, vec![40, 50, 60]),
        ]),
    );
    let segment_id = [0u8; 16];
    let directory = make_test_directory();
    let result = write(
        &directory,
        "_0",
        "Lucene90_0",
        &segment_id,
        &field_infos,
        &per_field,
        3,
    )
    .unwrap();
    let dvm = directory.lock().unwrap().read_file(&result[0]).unwrap();
    let meta_header_len = index_header_length(META_CODEC, "Lucene90_0");
    let mut reader = DvmReader::new(&dvm, meta_header_len);
    assert_eq!(reader.0.read_le_int(), 0);
    assert_eq!(reader.0.read_byte(), SORTED_SET);
    reader.read_sorted_set();
    assert_eq!(reader.0.read_le_int(), 1);
    assert_eq!(reader.0.read_byte(), SORTED_NUMERIC);
    reader.read_sorted_numeric();
    assert_eq!(reader.0.read_le_int(), -1);
}
/// Multi-valued SORTED_SET: per-doc term counts of 1, 2 and 3 exercise the
/// addressed layout; the entry must parse through to the EOF marker.
#[test]
fn test_sorted_set_multi_valued() {
    let field_infos =
        FieldInfos::new(vec![make_field_info("tags", 0, DocValuesType::SortedSet)]);
    let doc0 = vec![BytesRef::from_utf8("alpha")];
    let doc1 = vec![BytesRef::from_utf8("gamma"), BytesRef::from_utf8("beta")];
    let doc2 = vec![
        BytesRef::from_utf8("delta"),
        BytesRef::from_utf8("alpha"),
        BytesRef::from_utf8("gamma"),
    ];
    let mut per_field = HashMap::new();
    per_field.insert(
        "tags".to_string(),
        make_per_field_data_sorted_set(vec![(0, doc0), (1, doc1), (2, doc2)]),
    );
    let directory = make_test_directory();
    let result = write(
        &directory,
        "_0",
        "Lucene90_0",
        &[0u8; 16],
        &field_infos,
        &per_field,
        3,
    )
    .unwrap();
    assert_len_eq_x!(&result, 2);
    let dvm = directory.lock().unwrap().read_file(&result[0]).unwrap();
    let mut reader = DvmReader::new(&dvm, index_header_length(META_CODEC, "Lucene90_0"));
    assert_eq!(reader.0.read_le_int(), 0);
    assert_eq!(reader.0.read_byte(), SORTED_SET);
    reader.read_sorted_set();
    assert_eq!(reader.0.read_le_int(), -1);
}
/// Duplicate terms within a single document ("alpha" twice, "beta" twice):
/// the writer must tolerate them and still produce a parseable entry.
#[test]
fn test_sorted_set_multi_valued_dedup() {
    let fi = make_field_info("tags", 0, DocValuesType::SortedSet);
    let field_infos = FieldInfos::new(vec![fi]);
    let mut per_field = HashMap::new();
    per_field.insert(
        "tags".to_string(),
        make_per_field_data_sorted_set(vec![
            (
                0,
                vec![
                    BytesRef::from_utf8("alpha"),
                    BytesRef::from_utf8("alpha"),
                    BytesRef::from_utf8("beta"),
                ],
            ),
            (
                1,
                vec![BytesRef::from_utf8("beta"), BytesRef::from_utf8("beta")],
            ),
        ]),
    );
    let segment_id = [0u8; 16];
    let directory = make_test_directory();
    // Note: only 2 docs in this segment (num_docs = 2).
    let result = write(
        &directory,
        "_0",
        "Lucene90_0",
        &segment_id,
        &field_infos,
        &per_field,
        2,
    )
    .unwrap();
    assert_len_eq_x!(&result, 2);
    let dvm = directory.lock().unwrap().read_file(&result[0]).unwrap();
    let meta_header_len = index_header_length(META_CODEC, "Lucene90_0");
    let mut reader = DvmReader::new(&dvm, meta_header_len);
    assert_eq!(reader.0.read_le_int(), 0);
    assert_eq!(reader.0.read_byte(), SORTED_SET);
    reader.read_sorted_set();
    assert_eq!(reader.0.read_le_int(), -1);
}
/// Both fields multi-valued (SORTED_SET then SORTED_NUMERIC): the two
/// entries must be consumed back-to-back, ending with the -1 EOF marker.
#[test]
fn test_two_fields_multi_valued_sorted_set_and_sorted_numeric() {
    let fi_tags = make_field_info("tags", 0, DocValuesType::SortedSet);
    let fi_nums = make_field_info("nums", 1, DocValuesType::SortedNumeric);
    let field_infos = FieldInfos::new(vec![fi_tags, fi_nums]);
    let mut per_field = HashMap::new();
    per_field.insert(
        "tags".to_string(),
        make_per_field_data_sorted_set(vec![
            (0, vec![BytesRef::from_utf8("a"), BytesRef::from_utf8("b")]),
            (1, vec![BytesRef::from_utf8("c")]),
            (2, vec![BytesRef::from_utf8("a"), BytesRef::from_utf8("c")]),
        ]),
    );
    per_field.insert(
        "nums".to_string(),
        make_per_field_data_sorted_numeric(vec![
            (0, vec![10, 20]),
            (1, vec![30]),
            (2, vec![40, 50, 60]),
        ]),
    );
    let segment_id = [0u8; 16];
    let directory = make_test_directory();
    let result = write(
        &directory,
        "_0",
        "Lucene90_0",
        &segment_id,
        &field_infos,
        &per_field,
        3,
    )
    .unwrap();
    let dvm = directory.lock().unwrap().read_file(&result[0]).unwrap();
    let meta_header_len = index_header_length(META_CODEC, "Lucene90_0");
    let mut reader = DvmReader::new(&dvm, meta_header_len);
    assert_eq!(reader.0.read_le_int(), 0);
    assert_eq!(reader.0.read_byte(), SORTED_SET);
    reader.read_sorted_set();
    assert_eq!(reader.0.read_le_int(), 1);
    assert_eq!(reader.0.read_byte(), SORTED_NUMERIC);
    reader.read_sorted_numeric();
    assert_eq!(reader.0.read_le_int(), -1);
}
/// Sparse NUMERIC field: only 3 of 10 docs carry a value, so the writer must
/// emit a docs-with-field structure (IndexedDISI-style) instead of the dense
/// -1 offset, and its first block in the .dvd file must describe a
/// cardinality-3 block.
#[test]
fn test_sparse_numeric_field() {
    let fi = make_field_info("score", 0, DocValuesType::Numeric);
    let field_infos = FieldInfos::new(vec![fi]);
    let mut per_field = HashMap::new();
    per_field.insert(
        "score".to_string(),
        make_per_field_data_numeric(vec![(1, 100), (5, 200), (8, 300)]),
    );
    let segment_id = [0u8; 16];
    let dir = make_test_directory();
    let names = write(&dir, "_0", "", &segment_id, &field_infos, &per_field, 10).unwrap();
    let dvm = dir.lock().unwrap().read_file(&names[0]).unwrap();
    let dvd = dir.lock().unwrap().read_file(&names[1]).unwrap();
    let meta_header_len = index_header_length(META_CODEC, "");
    let mut reader = TestDataReader::new(&dvm[meta_header_len..], 0);
    assert_eq!(reader.read_le_int(), 0);
    assert_eq!(reader.read_byte(), NUMERIC);
    // Sparse field => a real (non-negative) offset into the .dvd file.
    let docs_with_field_offset = reader.read_le_long();
    assert_ge!(docs_with_field_offset, 0);
    let docs_with_field_length = reader.read_le_long();
    assert_gt!(docs_with_field_length, 0);
    let jump_table_entry_count = reader.read_le_short();
    assert_ge!(jump_table_entry_count, 0);
    let dense_rank_power = reader.read_byte();
    assert_eq!(dense_rank_power, 9);
    let data_header_len = index_header_length(DATA_CODEC, "");
    let disi_start = docs_with_field_offset as usize;
    // The DISI data must start after the .dvd codec header.
    assert_ge!(disi_start, data_header_len);
    // BUGFIX: the previous assertion compared docs_with_field_length against
    // (disi_start + length) - disi_start, which is trivially true for any
    // value. Assert instead that the recorded slice actually fits inside the
    // data file.
    assert!(
        disi_start + docs_with_field_length as usize <= dvd.len(),
        "docsWithField slice must lie within the dvd file"
    );
    // First DISI block: block id 0, cardinality-1 == 2 (three docs present).
    let block_id = i16::from_le_bytes(dvd[disi_start..disi_start + 2].try_into().unwrap());
    assert_eq!(block_id, 0);
    let card_minus_1 =
        i16::from_le_bytes(dvd[disi_start + 2..disi_start + 4].try_into().unwrap());
    assert_eq!(card_minus_1, 2);
}
/// Sparse BINARY field (2 of 5 docs have a value): the entry must carry a
/// real docs-with-field offset/length and end with numDocsWithField = 2.
#[test]
fn test_sparse_binary_field() {
    let fi = make_field_info("tag", 0, DocValuesType::Binary);
    let field_infos = FieldInfos::new(vec![fi]);
    let mut per_field = HashMap::new();
    per_field.insert(
        "tag".to_string(),
        make_per_field_data_binary(vec![(1, b"hello".to_vec()), (3, b"world".to_vec())]),
    );
    let segment_id = [0u8; 16];
    let dir = make_test_directory();
    let names = write(&dir, "_0", "", &segment_id, &field_infos, &per_field, 5).unwrap();
    let dvm = dir.lock().unwrap().read_file(&names[0]).unwrap();
    let meta_header_len = index_header_length(META_CODEC, "");
    let mut reader = TestDataReader::new(&dvm[meta_header_len..], 0);
    assert_eq!(reader.read_le_int(), 0);
    assert_eq!(reader.read_byte(), BINARY);
    // BINARY writes data offset/length before the docs-with-field fields.
    let _data_offset = reader.read_le_long();
    let _data_length = reader.read_le_long();
    let docs_with_field_offset = reader.read_le_long();
    assert_ge!(docs_with_field_offset, 0);
    let docs_with_field_length = reader.read_le_long();
    assert_gt!(docs_with_field_length, 0);
    let jump_table_entry_count = reader.read_le_short();
    assert_ge!(jump_table_entry_count, 0);
    let dense_rank_power = reader.read_byte();
    assert_eq!(dense_rank_power, 9);
    // numDocsWithField: exactly the two docs inserted above.
    assert_eq!(reader.read_le_int(), 2);
}
/// Sparse SORTED field (2 of 5 docs): the entry must record a real
/// docs-with-field slice and the default dense rank power of 9.
#[test]
fn test_sparse_sorted_field() {
    let field_infos =
        FieldInfos::new(vec![make_field_info("category", 0, DocValuesType::Sorted)]);
    let mut per_field = HashMap::new();
    per_field.insert(
        "category".to_string(),
        make_per_field_data_sorted(vec![
            (0, BytesRef::new(b"alpha".to_vec())),
            (3, BytesRef::new(b"beta".to_vec())),
        ]),
    );
    let dir = make_test_directory();
    let names = write(&dir, "_0", "", &[0u8; 16], &field_infos, &per_field, 5).unwrap();
    let dvm = dir.lock().unwrap().read_file(&names[0]).unwrap();
    let mut reader = TestDataReader::new(&dvm[index_header_length(META_CODEC, "")..], 0);
    assert_eq!(reader.read_le_int(), 0);
    assert_eq!(reader.read_byte(), SORTED);
    let docs_with_field_offset = reader.read_le_long();
    assert_ge!(docs_with_field_offset, 0);
    let docs_with_field_length = reader.read_le_long();
    assert_gt!(docs_with_field_length, 0);
    let _jump_table_entry_count = reader.read_le_short();
    let dense_rank_power = reader.read_byte();
    assert_eq!(dense_rank_power, 9);
}
/// Sparse multi-valued SORTED_NUMERIC field (docs 1 and 4 of 5): the entry
/// must record a real docs-with-field slice and dense rank power 9.
#[test]
fn test_sparse_sorted_numeric_field() {
    let fi = make_field_info("counts", 0, DocValuesType::SortedNumeric);
    let field_infos = FieldInfos::new(vec![fi]);
    let mut per_field = HashMap::new();
    per_field.insert(
        "counts".to_string(),
        make_per_field_data_sorted_numeric(vec![(1, vec![10, 20]), (4, vec![30])]),
    );
    let segment_id = [0u8; 16];
    let dir = make_test_directory();
    let names = write(&dir, "_0", "", &segment_id, &field_infos, &per_field, 5).unwrap();
    let dvm = dir.lock().unwrap().read_file(&names[0]).unwrap();
    let meta_header_len = index_header_length(META_CODEC, "");
    let mut reader = TestDataReader::new(&dvm[meta_header_len..], 0);
    assert_eq!(reader.read_le_int(), 0);
    assert_eq!(reader.read_byte(), SORTED_NUMERIC);
    let docs_with_field_offset = reader.read_le_long();
    assert_ge!(docs_with_field_offset, 0);
    let docs_with_field_length = reader.read_le_long();
    assert_gt!(docs_with_field_length, 0);
    let _jump_table_entry_count = reader.read_le_short();
    let dense_rank_power = reader.read_byte();
    assert_eq!(dense_rank_power, 9);
}
}