use std::collections::HashMap;
use std::io;
use log::debug;
use crate::codecs::codec_util;
use crate::index::SegmentCommitInfo;
use crate::index::index_file_names;
use crate::store::checksum_input::ChecksumIndexInput;
use crate::store::memory::MemoryIndexOutput;
use crate::store::{DataInput, DataOutput, Directory, SegmentFile};
use crate::util::string_helper;
// Codec name handed to `write_index_header` by `write` and to `check_header` by `read`.
const CODEC_NAME: &str = "segments";
// Format version written into the header. `read` passes it as both version
// bounds to `check_header`.
const VERSION_CURRENT: i32 = 10;
// Version triple that `write` emits as three vints after the header, and again
// before the segment records when at least one segment is present.
// The major component is additionally written as a fourth vint after the first triple.
const LUCENE_VERSION_MAJOR: i32 = 10;
const LUCENE_VERSION_MINOR: i32 = 3;
const LUCENE_VERSION_BUGFIX: i32 = 2;
// Codec name string written into every segment record by `write`.
const SEGMENT_CODEC_NAME: &str = "Lucene103";
/// Serializes a commit point into an in-memory `segments_N` file.
///
/// The file is named `segments_<generation in base 36>` and is laid out, in
/// order, as: the index header, the writer version triple, a fourth vint that
/// repeats the major version, `version` (big-endian long), `counter` (vlong),
/// the segment count (big-endian int), a second version triple when at least
/// one segment is present, one record per segment, the `user_data` map, and
/// the codec footer.
///
/// # Errors
///
/// Returns an error if `generation` is negative, if `segments` holds more
/// entries than fit in an `i32`, or if any write to the output fails.
pub fn write(
    segments: &[&SegmentCommitInfo],
    generation: i64,
    version: i64,
    counter: i64,
    user_data: &HashMap<String, String>,
) -> io::Result<SegmentFile> {
    // A bare `as u64` cast would wrap a negative generation into a huge,
    // meaningless base-36 suffix, so reject it before building the file name.
    if generation < 0 {
        return Err(io::Error::other(format!(
            "invalid segments generation: {generation}"
        )));
    }
    // The count is stored as a 32-bit int; refuse to silently truncate it.
    if segments.len() > i32::MAX as usize {
        return Err(io::Error::other(format!(
            "too many segments: {}",
            segments.len()
        )));
    }

    let gen_suffix = index_file_names::radix36(generation as u64);
    let filename = format!("segments_{gen_suffix}");
    let id = string_helper::random_id();
    let mut out = MemoryIndexOutput::new(filename.clone());

    codec_util::write_index_header(&mut out, CODEC_NAME, VERSION_CURRENT, &id, &gen_suffix)?;

    // Writer version triple, followed by the major version once more. The
    // reader in this file treats that fourth value as the index-created version.
    out.write_vint(LUCENE_VERSION_MAJOR)?;
    out.write_vint(LUCENE_VERSION_MINOR)?;
    out.write_vint(LUCENE_VERSION_BUGFIX)?;
    out.write_vint(LUCENE_VERSION_MAJOR)?;

    out.write_be_long(version)?;
    out.write_vlong(counter)?;
    let num_segments = segments.len() as i32;
    out.write_be_int(num_segments)?;
    debug!(
        "segment_infos: writing segments_{gen_suffix}, version={version}, \
         counter={counter}, num_segments={num_segments}"
    );

    // The second version triple is only present for a non-empty commit; the
    // reader mirrors this condition.
    if !segments.is_empty() {
        out.write_vint(LUCENE_VERSION_MAJOR)?;
        out.write_vint(LUCENE_VERSION_MINOR)?;
        out.write_vint(LUCENE_VERSION_BUGFIX)?;
    }

    for sci in segments {
        let si = &sci.info;
        out.write_string(&si.name)?;
        out.write_bytes(&si.id)?;
        out.write_string(SEGMENT_CODEC_NAME)?;
        out.write_be_long(sci.del_gen)?;
        out.write_be_int(sci.del_count)?;
        out.write_be_long(sci.field_infos_gen)?;
        out.write_be_long(sci.doc_values_gen)?;
        out.write_be_int(sci.soft_del_count)?;
        // Optional commit id: a one-byte presence marker, then the id bytes.
        match &sci.id {
            Some(sci_id) => {
                out.write_byte(1)?;
                out.write_bytes(sci_id)?;
            }
            None => {
                out.write_byte(0)?;
            }
        }
        // Always an empty string set and a zero count here; the reader parses
        // these as the field-infos file set and the doc-values field count.
        out.write_set_of_strings(&[])?;
        out.write_be_int(0)?;
        debug!(
            "segment_infos: segment={} maxDoc={} compound={} delGen={} delCount={}",
            si.name, si.max_doc, si.is_compound_file, sci.del_gen, sci.del_count
        );
    }

    out.write_map_of_strings(user_data)?;
    codec_util::write_footer(&mut out)?;
    Ok(out.into_inner())
}
/// One per-segment record as parsed by `read` from a `segments_N` file.
///
/// Fields appear in the same order in which `read` pulls them off the input.
#[derive(Debug)]
pub struct SegmentEntry {
/// Segment name string (the tests in this file use `"_0"`).
pub name: String,
/// Fixed-length segment id, `codec_util::ID_LENGTH` bytes.
pub id: [u8; codec_util::ID_LENGTH],
/// Codec name string stored for the segment.
pub codec: String,
/// Stored as a big-endian long; presumably the deletes generation — confirm.
pub del_gen: i64,
/// Stored as a big-endian int; presumably the number of deleted documents — confirm.
pub del_count: i32,
/// Stored as a big-endian long; presumably the field-infos generation — confirm.
pub field_infos_gen: i64,
/// Stored as a big-endian long; presumably the doc-values generation — confirm.
pub doc_values_gen: i64,
/// Stored as a big-endian int; presumably the soft-delete count — confirm.
pub soft_del_count: i32,
/// Optional id read after the presence marker byte: `Some` when the marker
/// is 1, `None` when it is 0.
pub sci_id: Option<[u8; codec_util::ID_LENGTH]>,
}
/// Result of parsing a `segments_N` file with `read`.
#[derive(Debug)]
pub struct SegmentInfosRead {
    /// Per-segment records, in the order they appear in the file.
    pub segments: Box<[SegmentEntry]>,
    /// Generation parsed from the segments file name passed to `read`.
    pub generation: i64,
    /// Value stored in the file as a big-endian long after the version vints.
    pub version: i64,
    /// Value stored in the file as a vlong after `version`.
    pub counter: i64,
    /// String map stored after the segment records.
    pub user_data: HashMap<String, String>,
}
/// Extracts the generation encoded in a segments file name.
///
/// The bare name `segments` maps to generation 0. Otherwise the name must be
/// `segments_<suffix>`, where `<suffix>` is an unsigned base-36 number.
///
/// # Errors
///
/// Returns an error if the name does not start with `segments_`, if the
/// suffix carries a `+`/`-` sign, or if the suffix is not a valid base-36
/// number that fits in an `i64`.
pub fn generation_from_segments_file_name(file_name: &str) -> io::Result<i64> {
    if file_name == "segments" {
        return Ok(0);
    }
    let suffix = file_name.strip_prefix("segments_").ok_or_else(|| {
        io::Error::other(format!("fileName \"{file_name}\" is not a segments file"))
    })?;
    // `from_str_radix` tolerates a leading sign, which would let a name such
    // as `segments_-1` yield a negative generation (later cast to `u64`) and
    // `segments_+1` alias generation 1. Neither is a valid segments file name.
    if matches!(suffix.bytes().next(), Some(b'+' | b'-')) {
        return Err(io::Error::other(format!(
            "invalid generation in {file_name}: sign not allowed"
        )));
    }
    i64::from_str_radix(suffix, 36)
        .map_err(|e| io::Error::other(format!("invalid generation in {file_name}: {e}")))
}
/// Picks the segments file with the numerically highest generation.
///
/// Only names starting with `segments_` are considered; every other entry is
/// skipped. Generations are compared as numbers, not as strings, and the
/// returned name is rebuilt from the winning generation.
///
/// # Errors
///
/// Returns an error if a `segments_` name fails to parse, or if no candidate
/// with a non-negative generation exists.
pub fn get_last_commit_segments_file_name(files: &[String]) -> io::Result<String> {
    let mut newest: Option<i64> = None;
    for candidate in files.iter().filter(|name| name.starts_with("segments_")) {
        let parsed = generation_from_segments_file_name(candidate)?;
        // Only non-negative generations can ever be selected.
        let beats_current = match newest {
            Some(best) => parsed > best,
            None => parsed >= 0,
        };
        if beats_current {
            newest = Some(parsed);
        }
    }
    match newest {
        Some(latest) => {
            let suffix = index_file_names::radix36(latest as u64);
            Ok(format!("segments_{suffix}"))
        }
        None => Err(io::Error::other("no segments_N file found in directory")),
    }
}
/// Parses the segments file `segment_file_name` from `directory`.
///
/// The generation is derived from the file name, and the suffix stored in the
/// index header must match it. The version vints are consumed but not
/// otherwise inspected. Per-segment update-file information is read and
/// discarded. The footer is checked after the whole body has been consumed.
///
/// # Errors
///
/// Returns an error if the file name is not a segments file name, if the
/// header, suffix, or footer check fails, if a count or marker in the body is
/// invalid, or if reading from the input fails.
pub fn read(directory: &dyn Directory, segment_file_name: &str) -> io::Result<SegmentInfosRead> {
    // Upper bound on what is preallocated from the unverified segment count.
    // The checksum is only validated at the footer, so a corrupt count must
    // not be allowed to trigger an enormous allocation up front.
    const MAX_PREALLOCATED_SEGMENTS: usize = 1024;

    let generation = generation_from_segments_file_name(segment_file_name)?;
    let expected_suffix = index_file_names::radix36(generation as u64);

    let input = directory.open_input(segment_file_name)?;
    let mut input = ChecksumIndexInput::new(input);

    // Header: codec header, then the object id and the length-prefixed suffix.
    codec_util::check_header(&mut input, CODEC_NAME, VERSION_CURRENT, VERSION_CURRENT)?;
    let mut header_id = [0u8; codec_util::ID_LENGTH];
    input.read_bytes(&mut header_id)?;
    let suffix_len = input.read_byte()? as usize;
    let mut suffix_bytes = vec![0u8; suffix_len];
    input.read_bytes(&mut suffix_bytes)?;
    let suffix = String::from_utf8(suffix_bytes).map_err(|e| io::Error::other(e.to_string()))?;
    if suffix != expected_suffix {
        return Err(io::Error::other(format!(
            "segments suffix mismatch: expected {expected_suffix:?}, got {suffix:?}"
        )));
    }

    // Version triple plus the index-created version; consumed but unused.
    let _major = input.read_vint()?;
    let _minor = input.read_vint()?;
    let _bugfix = input.read_vint()?;
    let _index_created_version = input.read_vint()?;

    let version = input.read_be_long()?;
    let counter = input.read_vlong()?;
    let num_segments = input.read_be_int()?;
    if num_segments < 0 {
        return Err(io::Error::other(format!(
            "invalid segment count: {num_segments}"
        )));
    }

    // The second version triple is only present for a non-empty commit.
    if num_segments > 0 {
        let _min_major = input.read_vint()?;
        let _min_minor = input.read_vint()?;
        let _min_bugfix = input.read_vint()?;
    }

    let mut segments = Vec::with_capacity((num_segments as usize).min(MAX_PREALLOCATED_SEGMENTS));
    for _ in 0..num_segments {
        let seg_name = input.read_string()?;
        let mut seg_id = [0u8; codec_util::ID_LENGTH];
        input.read_bytes(&mut seg_id)?;
        let codec_name = input.read_string()?;
        let del_gen = input.read_be_long()?;
        let del_count = input.read_be_int()?;
        let field_infos_gen = input.read_be_long()?;
        let doc_values_gen = input.read_be_long()?;
        let soft_del_count = input.read_be_int()?;

        // One-byte presence marker, then the optional commit id.
        let sci_id = match input.read_byte()? {
            1 => {
                let mut id = [0u8; codec_util::ID_LENGTH];
                input.read_bytes(&mut id)?;
                Some(id)
            }
            0 => None,
            marker => {
                return Err(io::Error::other(format!("invalid SCI ID marker: {marker}")));
            }
        };

        // Update-file bookkeeping is skipped over rather than retained.
        let _field_infos_files = input.read_set_of_strings()?;
        let num_dv_fields = input.read_be_int()?;
        // A negative count would make the loop below a silent no-op and leave
        // the stream misaligned; treat it as corruption instead.
        if num_dv_fields < 0 {
            return Err(io::Error::other(format!(
                "invalid doc-values field count: {num_dv_fields}"
            )));
        }
        for _ in 0..num_dv_fields {
            let _field_number = input.read_be_int()?;
            let _files = input.read_set_of_strings()?;
        }

        segments.push(SegmentEntry {
            name: seg_name,
            id: seg_id,
            codec: codec_name,
            del_gen,
            del_count,
            field_infos_gen,
            doc_values_gen,
            soft_del_count,
            sci_id,
        });
    }

    let user_data = input.read_map_of_strings()?;
    codec_util::check_footer(&mut input)?;
    debug!(
        "segment_infos: read {segment_file_name}, version={version}, \
         counter={counter}, num_segments={num_segments}"
    );

    Ok(SegmentInfosRead {
        segments: segments.into_boxed_slice(),
        generation,
        version,
        counter,
        user_data,
    })
}
// Unit tests for the segments file writer, reader and file-name helpers.
// The writer tests decode the produced bytes field by field, so the order of
// reads in each test mirrors the order of writes in `write`.
#[cfg(test)]
mod tests {
use super::*;
use crate::codecs::codec_util::{CODEC_MAGIC, FOOTER_LENGTH, FOOTER_MAGIC};
use crate::document::{DocValuesType, IndexOptions};
use crate::index::{FieldInfo, FieldInfos, PointDimensionConfig, SegmentInfo};
use crate::test_util::TestDataReader;
// Builds a `SegmentCommitInfo` with a single "contents" field for use as
// writer input. The writer tests below expect the generation fields of the
// result to serialize as -1 and the counts as 0.
fn make_test_segment_commit_info(
name: &str,
max_doc: i32,
segment_id: [u8; 16],
sci_id: Option<[u8; 16]>,
) -> SegmentCommitInfo {
// NOTE(review): the `true` argument is presumably the compound-file flag
// and the two maps diagnostics/attributes — confirm against `SegmentInfo::new`.
let si = SegmentInfo::new(
name.to_string(),
max_doc,
true,
segment_id,
HashMap::new(),
HashMap::new(),
);
let fis = FieldInfos::new(vec![FieldInfo::new(
"contents".to_string(),
0,
false,
false,
IndexOptions::DocsAndFreqsAndPositions,
DocValuesType::None,
PointDimensionConfig::default(),
)]);
SegmentCommitInfo::new(si, fis, sci_id)
}
// An empty commit: checks header magic, footer magic, and every body field.
#[test]
fn test_write_empty_segments() {
let user_data = HashMap::new();
let file = write(&[], 1, 1, 0, &user_data).unwrap();
assert_eq!(file.name, "segments_1");
let mut r = TestDataReader::new(&file.data, 0);
assert_eq!(r.read_be_int(), CODEC_MAGIC);
// Jump to the footer, which occupies the last FOOTER_LENGTH bytes.
r.pos = file.data.len() - FOOTER_LENGTH;
assert_eq!(r.read_be_int(), FOOTER_MAGIC);
// The int after the footer magic is expected to be zero.
assert_eq!(r.read_be_int(), 0);
// Skip the index header and decode the body.
r.pos = codec_util::index_header_length(CODEC_NAME, "1");
// Version triple followed by the repeated major version.
assert_eq!(r.read_vint(), 10);
assert_eq!(r.read_vint(), 3);
assert_eq!(r.read_vint(), 2);
assert_eq!(r.read_vint(), 10);
// version, counter, segment count.
assert_eq!(r.read_be_long(), 1);
assert_eq!(r.read_vlong(), 0);
assert_eq!(r.read_be_int(), 0);
// Empty user-data map, after which only the footer remains.
assert_eq!(r.read_vint(), 0);
assert_eq!(r.pos, file.data.len() - FOOTER_LENGTH);
}
// A single segment with a commit id: decodes the full segment record.
#[test]
fn test_write_single_segment() {
let seg_id = [0xABu8; 16];
let sci_id = [0xCDu8; 16];
let sci = make_test_segment_commit_info("_0", 3, seg_id, Some(sci_id));
let user_data = HashMap::new();
let file = write(&[&sci], 1, 1, 1, &user_data).unwrap();
assert_eq!(file.name, "segments_1");
let mut r =
TestDataReader::new(&file.data, codec_util::index_header_length(CODEC_NAME, "1"));
// Version triple followed by the repeated major version.
assert_eq!(r.read_vint(), 10);
assert_eq!(r.read_vint(), 3);
assert_eq!(r.read_vint(), 2);
assert_eq!(r.read_vint(), 10);
// version, counter, segment count.
assert_eq!(r.read_be_long(), 1);
assert_eq!(r.read_vlong(), 1);
assert_eq!(r.read_be_int(), 1);
// Second version triple, present because the commit is non-empty.
assert_eq!(r.read_vint(), 10);
assert_eq!(r.read_vint(), 3);
assert_eq!(r.read_vint(), 2);
// Segment record: name, 16-byte id, codec name.
let name = r.read_string();
assert_eq!(name, "_0");
assert_eq!(&file.data[r.pos..r.pos + 16], &[0xABu8; 16]);
r.pos += 16;
let codec = r.read_string();
assert_eq!(codec, "Lucene103");
// del_gen, del_count, field_infos_gen, doc_values_gen, soft_del_count.
assert_eq!(r.read_be_long(), -1);
assert_eq!(r.read_be_int(), 0);
assert_eq!(r.read_be_long(), -1);
assert_eq!(r.read_be_long(), -1);
assert_eq!(r.read_be_int(), 0);
// Presence marker 1 followed by the 16-byte commit id.
assert_eq!(file.data[r.pos], 1);
r.pos += 1;
assert_eq!(&file.data[r.pos..r.pos + 16], &[0xCDu8; 16]);
r.pos += 16;
// Empty string set, zero int, then the empty user-data map.
assert_eq!(r.read_vint(), 0);
assert_eq!(r.read_be_int(), 0);
assert_eq!(r.read_vint(), 0);
assert_eq!(r.pos, file.data.len() - FOOTER_LENGTH);
}
// A segment without a commit id: the presence marker must be 0 and no id
// bytes may follow it.
#[test]
fn test_write_segment_no_sci_id() {
let seg_id = [0x11u8; 16];
let sci = make_test_segment_commit_info("_0", 1, seg_id, None);
let user_data = HashMap::new();
let file = write(&[&sci], 1, 1, 1, &user_data).unwrap();
let mut r =
TestDataReader::new(&file.data, codec_util::index_header_length(CODEC_NAME, "1"));
// Skip everything up to the presence marker without asserting on it.
for _ in 0..4 {
r.read_vint();
}
r.read_be_long();
r.read_vlong();
r.read_be_int();
for _ in 0..3 {
r.read_vint();
}
r.read_string();
r.pos += 16;
r.read_string();
r.read_be_long();
r.read_be_int();
r.read_be_long();
r.read_be_long();
r.read_be_int();
assert_eq!(file.data[r.pos], 0);
r.pos += 1;
// The empty string set and zero int follow the marker directly.
assert_eq!(r.read_vint(), 0);
assert_eq!(r.read_be_int(), 0);
}
// Write then read an empty commit through an in-memory directory.
#[test]
fn test_read_roundtrip_empty() {
let user_data = HashMap::new();
let file = write(&[], 1, 1, 0, &user_data).unwrap();
let mut dir = crate::store::MemoryDirectory::new();
dir.write_file(&file.name, &file.data).unwrap();
let result = read(&dir, &file.name).unwrap();
assert_is_empty!(&result.segments);
assert_eq!(result.version, 1);
assert_eq!(result.counter, 0);
assert_is_empty!(&result.user_data);
}
// Write then read a one-segment commit and compare the parsed entry.
#[test]
fn test_read_roundtrip_single_segment() {
let seg_id = [0xABu8; 16];
let sci_id = [0xCDu8; 16];
let sci = make_test_segment_commit_info("_0", 3, seg_id, Some(sci_id));
let user_data = HashMap::new();
let file = write(&[&sci], 1, 1, 1, &user_data).unwrap();
let mut dir = crate::store::MemoryDirectory::new();
dir.write_file(&file.name, &file.data).unwrap();
let result = read(&dir, &file.name).unwrap();
assert_len_eq_x!(&result.segments, 1);
assert_eq!(result.segments[0].name, "_0");
assert_eq!(result.segments[0].id, seg_id);
assert_eq!(result.segments[0].codec, "Lucene103");
assert_eq!(result.segments[0].sci_id, Some(sci_id));
assert_eq!(result.version, 1);
assert_eq!(result.counter, 1);
}
// Generation 36 must be rendered in base 36 as "10".
#[test]
fn test_write_generation_suffix() {
let user_data = HashMap::new();
let file = write(&[], 36, 36, 0, &user_data).unwrap();
assert_eq!(file.name, "segments_10");
}
// The version long must be stored most-significant byte first.
#[test]
fn test_byte_order_correctness() {
let seg_id = [0x00u8; 16];
let sci = make_test_segment_commit_info("_0", 5, seg_id, Some([0x00; 16]));
let user_data = HashMap::new();
let file = write(&[&sci], 1, 0x0102030405060708, 1, &user_data).unwrap();
let mut r =
TestDataReader::new(&file.data, codec_util::index_header_length(CODEC_NAME, "1"));
// Skip the four version vints to land on the version long.
for _ in 0..4 {
r.read_vint();
}
assert_eq!(file.data[r.pos], 0x01);
assert_eq!(file.data[r.pos + 1], 0x02);
assert_eq!(file.data[r.pos + 2], 0x03);
assert_eq!(file.data[r.pos + 3], 0x04);
assert_eq!(file.data[r.pos + 4], 0x05);
assert_eq!(file.data[r.pos + 5], 0x06);
assert_eq!(file.data[r.pos + 6], 0x07);
assert_eq!(file.data[r.pos + 7], 0x08);
let ver = r.read_be_long();
assert_eq!(ver, 0x0102030405060708);
}
// The bare name "segments" maps to generation 0.
#[test]
fn test_generation_bare_segments() {
assert_eq!(generation_from_segments_file_name("segments").unwrap(), 0);
}
#[test]
fn test_generation_single_digit() {
assert_eq!(generation_from_segments_file_name("segments_1").unwrap(), 1);
assert_eq!(generation_from_segments_file_name("segments_9").unwrap(), 9);
}
// Letters a..z are the base-36 digits 10..35.
#[test]
fn test_generation_base36_letters() {
assert_eq!(
generation_from_segments_file_name("segments_a").unwrap(),
10
);
assert_eq!(
generation_from_segments_file_name("segments_z").unwrap(),
35
);
}
#[test]
fn test_generation_base36_multi_char() {
assert_eq!(
generation_from_segments_file_name("segments_10").unwrap(),
36
);
assert_eq!(
generation_from_segments_file_name("segments_1a").unwrap(),
46
);
}
// Names without the "segments_" prefix are rejected.
#[test]
fn test_generation_invalid_filename() {
assert!(generation_from_segments_file_name("_0.cfs").is_err());
assert!(generation_from_segments_file_name("not_segments").is_err());
}
#[test]
fn test_last_commit_single_file() {
let files = vec!["segments_1".to_string()];
assert_eq!(
get_last_commit_segments_file_name(&files).unwrap(),
"segments_1"
);
}
// "segments_z" sorts after "segments_10" as a string, but generation 36
// ("10") is numerically larger than 35 ("z").
#[test]
fn test_last_commit_numeric_max_not_lexicographic() {
let files = vec!["segments_z".to_string(), "segments_10".to_string()];
assert_eq!(
get_last_commit_segments_file_name(&files).unwrap(),
"segments_10"
);
}
// Files that do not start with "segments_" are skipped, not errors.
#[test]
fn test_last_commit_ignores_non_segments() {
let files = vec![
"_0.cfs".to_string(),
"_0.si".to_string(),
"segments_3".to_string(),
"write.lock".to_string(),
];
assert_eq!(
get_last_commit_segments_file_name(&files).unwrap(),
"segments_3"
);
}
#[test]
fn test_last_commit_no_segments_files() {
let files = vec!["_0.cfs".to_string(), "write.lock".to_string()];
assert!(get_last_commit_segments_file_name(&files).is_err());
}
#[test]
fn test_last_commit_empty() {
let files: Vec<String> = vec![];
assert!(get_last_commit_segments_file_name(&files).is_err());
}
// The maximum is chosen regardless of the order of the listing.
#[test]
fn test_last_commit_multiple_generations() {
let files = vec![
"segments_1".to_string(),
"segments_5".to_string(),
"segments_3".to_string(),
];
assert_eq!(
get_last_commit_segments_file_name(&files).unwrap(),
"segments_5"
);
}
}