use std::collections::HashMap;
use std::io;
use log::debug;
use crate::codecs::codec_util;
use crate::codecs::lucene90::indexed_disi;
use crate::index::FieldInfos;
use crate::index::index_file_names;
use crate::index::indexing_chain::PerFieldData;
use crate::store::{DataOutput, SharedDirectory};
/// File extension used when building the name of the norms data file.
pub(crate) const DATA_EXTENSION: &str = "nvd";
/// File extension used when building the name of the norms metadata file.
pub(crate) const META_EXTENSION: &str = "nvm";
/// Codec name written into the index header of the norms data file.
pub(crate) const DATA_CODEC: &str = "Lucene90NormsData";
/// Codec name written into the index header of the norms metadata file.
pub(crate) const META_CODEC: &str = "Lucene90NormsMetadata";
/// Format version written into the index header of both norms files.
pub(crate) const VERSION: i32 = 0;
/// Writes the norms metadata and data files for one segment.
///
/// Both files start with an index header and end with a codec footer. For every
/// field in `field_infos` that has norms, one fixed-size entry is appended to the
/// metadata file:
///
/// | field                      | type  |
/// |----------------------------|-------|
/// | field number               | int   |
/// | docs-with-field offset     | long  |
/// | docs-with-field length     | long  |
/// | jump-table entry count     | short |
/// | dense rank power           | byte  |
/// | number of docs with a norm | int   |
/// | bytes per norm             | byte  |
/// | constant norm / norms offset | long |
///
/// Three patterns are distinguished per field:
///
/// * **EMPTY** – the field is missing from `per_field` or lists no documents;
///   the entry is produced by `write_empty_norms_metadata` (offset `-2`).
/// * **ALL** – the number of documents with a norm equals `num_docs`; the
///   docs-with-field offset is `-1` and no document set is written.
/// * **SPARSE** – anything else; the document set is written to the data file
///   with `indexed_disi::write_bit_set` and its offset/length are recorded.
///
/// When all norms of a field are equal (`bytes per norm == 0`) the constant is
/// stored in the metadata entry and nothing is written to the data file for the
/// values; otherwise the entry records the data-file offset where the values start.
///
/// The list of entries is terminated by a field number of `-1`.
///
/// Returns the names of the two files created, metadata file first.
///
/// # Errors
///
/// Propagates any I/O error from the directory or the outputs. Returns
/// `io::ErrorKind::InvalidData` when a field lists documents with norms but has
/// no norm values at all.
///
/// # Panics
///
/// Panics if locking `directory` fails.
pub fn write(
    directory: &SharedDirectory,
    segment_name: &str,
    segment_suffix: &str,
    segment_id: &[u8; 16],
    field_infos: &FieldInfos,
    per_field: &HashMap<String, PerFieldData>,
    num_docs: i32,
) -> io::Result<Vec<String>> {
    let nvm_name =
        index_file_names::segment_file_name(segment_name, segment_suffix, META_EXTENSION);
    let nvd_name =
        index_file_names::segment_file_name(segment_name, segment_suffix, DATA_EXTENSION);

    // The directory lock is held only while the two outputs are created.
    let (mut nvm, mut nvd) = {
        let mut dir = directory.lock().unwrap();
        (dir.create_output(&nvm_name)?, dir.create_output(&nvd_name)?)
    };

    codec_util::write_index_header(&mut *nvm, META_CODEC, VERSION, segment_id, segment_suffix)?;
    codec_util::write_index_header(&mut *nvd, DATA_CODEC, VERSION, segment_id, segment_suffix)?;

    for fi in field_infos.iter() {
        if !fi.has_norms() {
            continue;
        }

        // A field with norms enabled but no per-field data gets an EMPTY entry.
        let Some(pfd) = per_field.get(fi.name()) else {
            write_empty_norms_metadata(&mut *nvm, fi.number())?;
            continue;
        };

        let (norms, norms_docs) = (&pfd.norms, &pfd.norms_docs);
        let num_docs_with_value = norms_docs.len() as i32;
        if num_docs_with_value == 0 {
            debug!(
                "norms: field={:?} (#{}) -> EMPTY pattern",
                fi.name(),
                fi.number()
            );
            write_empty_norms_metadata(&mut *nvm, fi.number())?;
            continue;
        }

        // Documents are listed but there is nothing to encode: report it as an
        // error instead of panicking on an empty min/max.
        let (Some(&min), Some(&max)) = (norms.iter().min(), norms.iter().max()) else {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
                format!(
                    "norms: field={:?} (#{}) lists {} docs with norms but has no norm values",
                    fi.name(),
                    fi.number(),
                    num_docs_with_value
                ),
            ));
        };
        let bytes_per_norm = num_bytes_per_value(min, max);

        nvm.write_le_int(fi.number() as i32)?;

        // Docs-with-field section of the entry: offset, length, jump-table
        // entry count and dense rank power.
        if num_docs_with_value == num_docs {
            debug!(
                "norms: field={:?} (#{}) -> ALL pattern, bytes_per_norm={}, min={}, max={}, num_docs_with_field={}",
                fi.name(),
                fi.number(),
                bytes_per_norm,
                min,
                max,
                num_docs_with_value
            );
            nvm.write_le_long(-1)?;
            nvm.write_le_long(0)?;
            nvm.write_le_short(-1)?;
            nvm.write_byte(0xFF)?;
        } else {
            debug!(
                "norms: field={:?} (#{}) -> SPARSE pattern, bytes_per_norm={}, min={}, max={}, num_docs_with_field={}/{}",
                fi.name(),
                fi.number(),
                bytes_per_norm,
                min,
                max,
                num_docs_with_value,
                num_docs
            );
            let disi_offset = nvd.file_pointer() as i64;
            nvm.write_le_long(disi_offset)?;
            let jump_table_entry_count =
                indexed_disi::write_bit_set(norms_docs, num_docs, &mut *nvd)?;
            nvm.write_le_long(nvd.file_pointer() as i64 - disi_offset)?;
            nvm.write_le_short(jump_table_entry_count)?;
            nvm.write_byte(indexed_disi::DEFAULT_DENSE_RANK_POWER as u8)?;
        }

        // Value section of the entry, identical for ALL and SPARSE fields.
        nvm.write_le_int(num_docs_with_value)?;
        nvm.write_byte(bytes_per_norm)?;
        if bytes_per_norm == 0 {
            // Every norm is the same: store the constant in the metadata.
            nvm.write_le_long(min)?;
        } else {
            // The offset must be captured before the values are appended.
            let data_offset = nvd.file_pointer() as i64;
            nvm.write_le_long(data_offset)?;
            write_norm_values(&mut *nvd, norms, bytes_per_norm)?;
        }
    }

    // A field number of -1 terminates the list of metadata entries.
    nvm.write_le_int(-1)?;
    codec_util::write_footer(&mut *nvm)?;
    codec_util::write_footer(&mut *nvd)?;

    Ok(vec![nvm_name, nvd_name])
}
/// Appends the metadata entry for a field that has norms enabled but no
/// document carrying a norm value.
///
/// The entry has the same shape as every other norms metadata entry: field
/// number, docs-with-field offset (`-2`), docs-with-field length (`0`),
/// jump-table entry count (`-1`), dense rank power (`0xFF`), number of docs
/// with a value (`0`), bytes per norm (`0`) and a trailing long of `0`.
///
/// # Errors
///
/// Propagates any error returned by `nvm`.
fn write_empty_norms_metadata(nvm: &mut dyn DataOutput, field_number: u32) -> io::Result<()> {
    const EMPTY_DOCS_WITH_FIELD_OFFSET: i64 = -2;
    const NO_JUMP_TABLE: i16 = -1;
    const NO_DENSE_RANK_POWER: u8 = 0xFF;

    nvm.write_le_int(field_number as i32)?;

    // Docs-with-field section: offset, length, jump-table count, rank power.
    nvm.write_le_long(EMPTY_DOCS_WITH_FIELD_OFFSET)?;
    nvm.write_le_long(0)?;
    nvm.write_le_short(NO_JUMP_TABLE)?;
    nvm.write_byte(NO_DENSE_RANK_POWER)?;

    // Value section: doc count, bytes per norm, trailing long.
    nvm.write_le_int(0)?;
    nvm.write_byte(0)?;
    nvm.write_le_long(0)?;

    Ok(())
}
/// Picks the narrowest signed integer width, in bytes, that can hold every
/// value between `min` and `max`.
///
/// Returns `0` when `min >= max` (no value range to encode), otherwise `1`,
/// `2`, `4` or `8` depending on whether both bounds fit in an `i8`, `i16`,
/// `i32` or only an `i64`.
fn num_bytes_per_value(min: i64, max: i64) -> u8 {
    // (lowest representable, highest representable, width in bytes), narrowest first.
    const SIGNED_RANGES: [(i64, i64, u8); 3] = [
        (i8::MIN as i64, i8::MAX as i64, 1),
        (i16::MIN as i64, i16::MAX as i64, 2),
        (i32::MIN as i64, i32::MAX as i64, 4),
    ];

    if min >= max {
        return 0;
    }

    SIGNED_RANGES
        .iter()
        .find(|&&(lowest, highest, _)| lowest <= min && max <= highest)
        .map_or(8, |&(_, _, width)| width)
}
/// Appends every norm in `norms` to `nvd`, each encoded with `bytes_per_norm`
/// bytes.
///
/// Values are narrowed with an `as` cast to `u8`, `i16` or `i32` for widths
/// 1, 2 and 4, and written unchanged for width 8.
///
/// # Errors
///
/// Propagates the first error returned by `nvd`.
///
/// # Panics
///
/// Panics if `norms` is non-empty and `bytes_per_norm` is not 1, 2, 4 or 8.
fn write_norm_values(
    nvd: &mut dyn DataOutput,
    norms: &[i64],
    bytes_per_norm: u8,
) -> io::Result<()> {
    norms.iter().copied().try_for_each(|value| -> io::Result<()> {
        match bytes_per_norm {
            1 => nvd.write_byte(value as u8)?,
            2 => nvd.write_le_short(value as i16)?,
            4 => nvd.write_le_int(value as i32)?,
            8 => nvd.write_le_long(value)?,
            _ => unreachable!("invalid bytes_per_norm: {}", bytes_per_norm),
        }
        Ok(())
    })
}
// Unit tests for the norms writer. Each test writes into an in-memory
// directory and then inspects the raw bytes of the metadata and/or data file.
#[cfg(test)]
mod tests {
use super::*;
use crate::codecs::codec_util::{FOOTER_LENGTH, index_header_length};
use crate::document::{DocValuesType, IndexOptions};
use crate::index::indexing_chain::PerFieldData;
use crate::index::{FieldInfo, FieldInfos};
use crate::store::{MemoryDirectory, SharedDirectory};
use crate::test_util::{self, TestDataReader};
use assertables::{assert_ge, assert_gt};
use std::collections::HashMap;
// Builds a FieldInfo for the tests. The third argument handed to
// `test_util::make_field_info` is the negation of `has_norms` — presumably an
// "omit norms" flag; confirm against `test_util`.
fn make_field_info(name: &str, number: u32, has_norms: bool) -> FieldInfo {
test_util::make_field_info(
name,
number,
!has_norms,
IndexOptions::DocsAndFreqsAndPositions,
DocValuesType::None,
)
}
// Builds a PerFieldData carrying only the given norm values and the doc ids
// they belong to.
fn make_per_field_data(norms: Vec<i64>, norms_docs: Vec<i32>) -> PerFieldData {
let mut pfd = PerFieldData::new();
pfd.norms = norms;
pfd.norms_docs = norms_docs;
pfd
}
// Fresh in-memory directory for a single test.
fn test_directory() -> SharedDirectory {
SharedDirectory::new(Box::new(MemoryDirectory::new()))
}
// Size in bytes of one metadata entry:
// int(4) + long(8) + long(8) + short(2) + byte(1) + int(4) + byte(1) + long(8).
const META_ENTRY_SIZE: usize = 36;
// All three docs have a norm and the values differ but fit in one byte:
// checks the full metadata entry, the terminator, both footers and the data bytes.
#[test]
fn test_all_pattern_1byte_norms() {
let fi = make_field_info("contents", 2, true);
let field_infos = FieldInfos::new(vec![fi]);
let mut per_field = HashMap::new();
per_field.insert(
"contents".to_string(),
make_per_field_data(vec![12, 8, 10], vec![0, 1, 2]),
);
let segment_id = [0u8; 16];
let dir = test_directory();
let names = write(&dir, "_0", "", &segment_id, &field_infos, &per_field, 3).unwrap();
// NOTE(review): `assert_len_eq_x!` is not listed in this module's
// `use assertables::{...}` — presumably it is brought into scope by a
// crate-level `#[macro_use]`; confirm.
assert_len_eq_x!(&names, 2);
assert_eq!(names[0], "_0.nvm");
assert_eq!(names[1], "_0.nvd");
let nvm = dir.lock().unwrap().read_file(&names[0]).unwrap();
let nvd = dir.lock().unwrap().read_file(&names[1]).unwrap();
// Both files must start with the same four header bytes.
assert_eq!(&nvm[0..4], &[0x3f, 0xd7, 0x6c, 0x17]);
assert_eq!(&nvd[0..4], &[0x3f, 0xd7, 0x6c, 0x17]);
let meta_header_len = index_header_length(META_CODEC, "");
let entry = &nvm[meta_header_len..];
// Field number.
assert_eq!(&entry[0..4], &2i32.to_le_bytes());
// Docs-with-field offset: -1 for the ALL pattern.
assert_eq!(&entry[4..12], &(-1i64).to_le_bytes());
// Docs-with-field length.
assert_eq!(&entry[12..20], &0i64.to_le_bytes());
// Jump-table entry count.
assert_eq!(&entry[20..22], &(-1i16).to_le_bytes());
// Dense rank power.
assert_eq!(entry[22], 0xFF);
// Number of docs with a norm.
assert_eq!(&entry[23..27], &3i32.to_le_bytes());
// Bytes per norm.
assert_eq!(entry[27], 1);
let data_header_len = index_header_length(DATA_CODEC, "");
// The norm values are the first thing after the data file's header.
let expected_offset = data_header_len as i64;
assert_eq!(&entry[28..36], &expected_offset.to_le_bytes());
// The entry list is terminated by a field number of -1.
assert_eq!(
&nvm[meta_header_len + META_ENTRY_SIZE..meta_header_len + META_ENTRY_SIZE + 4],
&(-1i32).to_le_bytes()
);
// First four bytes of the metadata footer.
let nvm_footer_start = nvm.len() - FOOTER_LENGTH;
assert_eq!(
&nvm[nvm_footer_start..nvm_footer_start + 4],
&[0xc0, 0x28, 0x93, 0xe8]
);
// One byte per norm, in input order.
assert_eq!(nvd[data_header_len], 12u8); assert_eq!(nvd[data_header_len + 1], 8u8); assert_eq!(nvd[data_header_len + 2], 10u8);
// First four bytes of the data footer.
let nvd_footer_start = nvd.len() - FOOTER_LENGTH;
assert_eq!(
&nvd[nvd_footer_start..nvd_footer_start + 4],
&[0xc0, 0x28, 0x93, 0xe8]
);
}
// The field is present in `per_field` but lists no documents: EMPTY entry.
#[test]
fn test_empty_pattern() {
let fi = make_field_info("contents", 0, true);
let field_infos = FieldInfos::new(vec![fi]);
let mut per_field = HashMap::new();
per_field.insert("contents".to_string(), make_per_field_data(vec![], vec![]));
let segment_id = [0u8; 16];
let dir = test_directory();
let names = write(&dir, "_0", "", &segment_id, &field_infos, &per_field, 3).unwrap();
let nvm = dir.lock().unwrap().read_file(&names[0]).unwrap();
let meta_header_len = index_header_length(META_CODEC, "");
let entry = &nvm[meta_header_len..];
assert_eq!(&entry[0..4], &0i32.to_le_bytes());
// Docs-with-field offset: -2 for the EMPTY pattern.
assert_eq!(&entry[4..12], &(-2i64).to_le_bytes());
// Zero docs with a norm, zero bytes per norm.
assert_eq!(&entry[23..27], &0i32.to_le_bytes());
assert_eq!(entry[27], 0);
}
// The field has norms enabled but is absent from `per_field`: also EMPTY.
#[test]
fn test_empty_pattern_field_not_in_per_field() {
let fi = make_field_info("missing", 0, true);
let field_infos = FieldInfos::new(vec![fi]);
let per_field = HashMap::new();
let segment_id = [0u8; 16];
let dir = test_directory();
let names = write(&dir, "_0", "", &segment_id, &field_infos, &per_field, 3).unwrap();
let nvm = dir.lock().unwrap().read_file(&names[0]).unwrap();
let meta_header_len = index_header_length(META_CODEC, "");
let entry = &nvm[meta_header_len..];
assert_eq!(&entry[4..12], &(-2i64).to_le_bytes());
}
// Every doc has the same norm: the constant lives in the metadata entry and
// the data file contains nothing but header and footer.
#[test]
fn test_constant_norms() {
let fi = make_field_info("contents", 1, true);
let field_infos = FieldInfos::new(vec![fi]);
let mut per_field = HashMap::new();
per_field.insert(
"contents".to_string(),
make_per_field_data(vec![12, 12, 12], vec![0, 1, 2]),
);
let segment_id = [0u8; 16];
let dir = test_directory();
let names = write(&dir, "_0", "", &segment_id, &field_infos, &per_field, 3).unwrap();
let nvm = dir.lock().unwrap().read_file(&names[0]).unwrap();
let nvd = dir.lock().unwrap().read_file(&names[1]).unwrap();
let meta_header_len = index_header_length(META_CODEC, "");
let entry = &nvm[meta_header_len..];
assert_eq!(&entry[4..12], &(-1i64).to_le_bytes());
assert_eq!(&entry[23..27], &3i32.to_le_bytes());
// Zero bytes per norm, followed by the constant value itself.
assert_eq!(entry[27], 0);
assert_eq!(&entry[28..36], &12i64.to_le_bytes());
let data_header_len = index_header_length(DATA_CODEC, "");
assert_eq!(nvd.len(), data_header_len + FOOTER_LENGTH);
}
// A field without norms produces no metadata entry at all: the first (and
// only) entry belongs to field number 1.
#[test]
fn test_no_norms_fields_skipped() {
let fi_path = make_field_info("path", 0, false); let fi_contents = make_field_info("contents", 1, true); let field_infos = FieldInfos::new(vec![fi_path, fi_contents]);
let mut per_field = HashMap::new();
per_field.insert(
"contents".to_string(),
make_per_field_data(vec![12, 8], vec![0, 1]),
);
let segment_id = [0u8; 16];
let dir = test_directory();
let names = write(&dir, "_0", "", &segment_id, &field_infos, &per_field, 2).unwrap();
let nvm = dir.lock().unwrap().read_file(&names[0]).unwrap();
let meta_header_len = index_header_length(META_CODEC, "");
let entry = &nvm[meta_header_len..];
assert_eq!(&entry[0..4], &1i32.to_le_bytes());
// Exactly one entry precedes the -1 terminator.
assert_eq!(
&nvm[meta_header_len + META_ENTRY_SIZE..meta_header_len + META_ENTRY_SIZE + 4],
&(-1i32).to_le_bytes()
);
}
// A non-empty segment suffix becomes part of both file names.
#[test]
fn test_segment_suffix() {
let fi = make_field_info("f", 0, true);
let field_infos = FieldInfos::new(vec![fi]);
let mut per_field = HashMap::new();
per_field.insert("f".to_string(), make_per_field_data(vec![10], vec![0]));
let segment_id = [0u8; 16];
let dir = test_directory();
let names = write(
&dir,
"_0",
"Lucene90_0",
&segment_id,
&field_infos,
&per_field,
1,
)
.unwrap();
assert_eq!(names[0], "_0_Lucene90_0.nvm");
assert_eq!(names[1], "_0_Lucene90_0.nvd");
}
// Boundary values for every width returned by `num_bytes_per_value`.
#[test]
fn test_num_bytes_per_value() {
assert_eq!(num_bytes_per_value(5, 5), 0);
assert_eq!(num_bytes_per_value(0, 0), 0);
assert_eq!(num_bytes_per_value(0, 127), 1);
assert_eq!(num_bytes_per_value(-128, 127), 1);
assert_eq!(num_bytes_per_value(-128, 0), 1);
assert_eq!(num_bytes_per_value(0, 128), 2);
assert_eq!(num_bytes_per_value(-129, 0), 2);
assert_eq!(num_bytes_per_value(-32768, 32767), 2);
assert_eq!(num_bytes_per_value(0, 32768), 4);
assert_eq!(num_bytes_per_value(i32::MIN as i64, i32::MAX as i64), 4);
assert_eq!(num_bytes_per_value(i32::MIN as i64 - 1, 0), 8);
assert_eq!(num_bytes_per_value(0, i32::MAX as i64 + 1), 8);
}
// Two fields with norms: a constant one (no data bytes) followed by a
// one-byte-per-norm one whose values therefore start right after the data header.
#[test]
fn test_multiple_fields_with_norms() {
let fi_a = make_field_info("alpha", 0, true);
let fi_b = make_field_info("beta", 1, true);
let field_infos = FieldInfos::new(vec![fi_a, fi_b]);
let mut per_field = HashMap::new();
per_field.insert(
"alpha".to_string(),
make_per_field_data(vec![5, 5, 5], vec![0, 1, 2]),
);
per_field.insert(
"beta".to_string(),
make_per_field_data(vec![10, 20, 30], vec![0, 1, 2]),
);
let segment_id = [0u8; 16];
let dir = test_directory();
let names = write(&dir, "_0", "", &segment_id, &field_infos, &per_field, 3).unwrap();
let nvm = dir.lock().unwrap().read_file(&names[0]).unwrap();
let nvd = dir.lock().unwrap().read_file(&names[1]).unwrap();
let meta_header_len = index_header_length(META_CODEC, "");
let entry0 = &nvm[meta_header_len..];
// First entry: field 0, constant norm 5.
assert_eq!(&entry0[0..4], &0i32.to_le_bytes()); assert_eq!(entry0[27], 0); assert_eq!(&entry0[28..36], &5i64.to_le_bytes());
let entry1 = &nvm[meta_header_len + META_ENTRY_SIZE..];
// Second entry: field 1, one byte per norm.
assert_eq!(&entry1[0..4], &1i32.to_le_bytes()); assert_eq!(entry1[27], 1);
let data_header_len = index_header_length(DATA_CODEC, "");
assert_eq!(nvd[data_header_len], 10u8);
assert_eq!(nvd[data_header_len + 1], 20u8);
assert_eq!(nvd[data_header_len + 2], 30u8);
// Terminator follows the second entry.
assert_eq!(
&nvm[meta_header_len + 2 * META_ENTRY_SIZE..meta_header_len + 2 * META_ENTRY_SIZE + 4],
&(-1i32).to_le_bytes()
);
}
// Only 2 of 5 docs have a norm: the entry must reference a document set in
// the data file, and the norm values must start right where that set ends.
#[test]
fn test_sparse_norms() {
let fi = make_field_info("contents", 0, true);
let field_infos = FieldInfos::new(vec![fi]);
let mut per_field = HashMap::new();
per_field.insert(
"contents".to_string(),
make_per_field_data(vec![12, 8], vec![1, 3]),
);
let segment_id = [0u8; 16];
let dir = test_directory();
let names = write(&dir, "_0", "", &segment_id, &field_infos, &per_field, 5).unwrap();
let nvm = dir.lock().unwrap().read_file(&names[0]).unwrap();
let nvd = dir.lock().unwrap().read_file(&names[1]).unwrap();
let meta_header_len = index_header_length(META_CODEC, "");
let mut reader = TestDataReader::new(&nvm[meta_header_len..], 0);
// Field number.
assert_eq!(reader.read_le_int(), 0);
let docs_with_field_offset = reader.read_le_long();
assert_ge!(docs_with_field_offset, 0);
let docs_with_field_length = reader.read_le_long();
assert_gt!(docs_with_field_length, 0);
let jump_table_entry_count = reader.read_le_short();
assert_ge!(jump_table_entry_count, 0);
// Dense rank power: the writer emits `indexed_disi::DEFAULT_DENSE_RANK_POWER`,
// which this test expects to be 9.
assert_eq!(reader.read_byte(), 9);
// Two docs with a norm, one byte per norm.
assert_eq!(reader.read_le_int(), 2);
assert_eq!(reader.read_byte(), 1);
let norms_offset = reader.read_le_long();
let disi_end = docs_with_field_offset + docs_with_field_length;
assert_eq!(norms_offset, disi_end);
assert_eq!(nvd[norms_offset as usize], 12u8);
assert_eq!(nvd[norms_offset as usize + 1], 8u8);
// The first two little-endian shorts of the document set are read here as
// a block id and a "cardinality minus one" (two docs -> 1).
let disi_start = docs_with_field_offset as usize;
let block_id = i16::from_le_bytes(nvd[disi_start..disi_start + 2].try_into().unwrap());
assert_eq!(block_id, 0);
let card_minus_1 =
i16::from_le_bytes(nvd[disi_start + 2..disi_start + 4].try_into().unwrap());
assert_eq!(card_minus_1, 1); }
// Sparse field whose norms are all equal: a document set is still referenced,
// but the constant is stored in the metadata instead of a norms offset.
#[test]
fn test_sparse_constant_norms() {
let fi = make_field_info("title", 0, true);
let field_infos = FieldInfos::new(vec![fi]);
let mut per_field = HashMap::new();
per_field.insert(
"title".to_string(),
make_per_field_data(vec![42, 42, 42], vec![0, 2, 4]),
);
let segment_id = [0u8; 16];
let dir = test_directory();
let names = write(&dir, "_0", "", &segment_id, &field_infos, &per_field, 5).unwrap();
let nvm = dir.lock().unwrap().read_file(&names[0]).unwrap();
let meta_header_len = index_header_length(META_CODEC, "");
let mut reader = TestDataReader::new(&nvm[meta_header_len..], 0);
assert_eq!(reader.read_le_int(), 0);
let docs_with_field_offset = reader.read_le_long();
assert_ge!(docs_with_field_offset, 0);
let _docs_with_field_length = reader.read_le_long();
let _jump_table_entry_count = reader.read_le_short();
assert_eq!(reader.read_byte(), 9);
// Three docs with a norm, zero bytes per norm, constant value 42.
assert_eq!(reader.read_le_int(), 3);
assert_eq!(reader.read_byte(), 0);
assert_eq!(reader.read_le_long(), 42);
}
}