use std::io;
use log::debug;
use crate::codecs::codec_util;
use crate::codecs::lucene90::compressing_stored_fields_reader::FieldsIndexReader;
use crate::codecs::lucene90::term_vectors::{
DATA_CODEC, INDEX_CODEC_IDX, INDEX_CODEC_META, INDEX_EXTENSION, META_EXTENSION,
VECTORS_EXTENSION, VERSION,
};
use crate::index::index_file_names;
use crate::store::checksum_input::ChecksumIndexInput;
use crate::store::{DataInput, Directory, IndexInput};
/// Reader for per-document term vectors stored in the Lucene90
/// compressing format (`.tvd` data / `.tvm` metadata / `.tvx` index files).
///
/// Constructed via [`CompressingTermVectorsReader::open`], which validates
/// all codec headers, checksums, and metadata invariants before returning.
/// Most fields are currently retained only for future use (hence the
/// `#[expect(dead_code)]` markers).
pub struct CompressingTermVectorsReader {
    // Open handle on the `.tvd` data file holding the compressed chunks.
    #[expect(dead_code)]
    vectors_stream: Box<dyn IndexInput>,
    // Chunk index built from the `.tvm` metadata and `.tvx` index files.
    #[expect(dead_code)]
    index_reader: FieldsIndexReader,
    // Format version read from the data file's index header.
    #[expect(dead_code)]
    version: i32,
    // Packed-ints encoding version, read as a vint from the metadata file.
    #[expect(dead_code)]
    packed_ints_version: i32,
    // Chunk size, read as a vint from the metadata file.
    #[expect(dead_code)]
    chunk_size: i32,
    // Total number of chunks in the data file (exposed via `num_chunks()`).
    num_chunks: i64,
    // Count of "dirty" chunks from metadata; `open` enforces it is <= num_chunks.
    // NOTE(review): presumably chunks flushed before filling — confirm with writer.
    #[expect(dead_code)]
    num_dirty_chunks: i64,
    // Count of docs in dirty chunks; `open` enforces it is >= num_dirty_chunks
    // and zero exactly when num_dirty_chunks is zero.
    #[expect(dead_code)]
    num_dirty_docs: i64,
}
impl CompressingTermVectorsReader {
    /// Opens the term-vectors files for `segment_name` (with `segment_suffix`)
    /// in `directory` and validates them before constructing the reader.
    ///
    /// Three files are opened and header-checked against `segment_id` and
    /// `segment_suffix`: the `.tvd` data file, the `.tvm` metadata file
    /// (read under a checksum wrapper so its footer can be verified), and
    /// the `.tvx` index file, from which the chunk index is built.
    ///
    /// # Errors
    /// Returns an I/O error if any file cannot be opened, a codec header or
    /// checksum check fails, or the dirty-chunk counters in the metadata are
    /// inconsistent with each other.
    pub fn open(
        directory: &dyn Directory,
        segment_name: &str,
        segment_suffix: &str,
        segment_id: &[u8; codec_util::ID_LENGTH],
    ) -> io::Result<Self> {
        // Data file (.tvd): open and verify its index header. VERSION is passed
        // as both min and max, so only the exact current version is accepted.
        let tvd_name =
            index_file_names::segment_file_name(segment_name, segment_suffix, VECTORS_EXTENSION);
        let mut vectors_stream = directory.open_input(&tvd_name)?;
        let version = codec_util::check_index_header(
            vectors_stream.as_mut(),
            DATA_CODEC,
            VERSION,
            VERSION,
            segment_id,
            segment_suffix,
        )?;
        // Metadata file (.tvm): wrapped in a checksumming input so the
        // check_footer call below can verify everything read from it.
        let tvm_name =
            index_file_names::segment_file_name(segment_name, segment_suffix, META_EXTENSION);
        let meta_input = directory.open_input(&tvm_name)?;
        let mut meta_in = ChecksumIndexInput::new(meta_input);
        codec_util::check_index_header(
            &mut meta_in,
            INDEX_CODEC_META,
            VERSION,
            VERSION,
            segment_id,
            segment_suffix,
        )?;
        // The read order below must match the writer exactly.
        let packed_ints_version = meta_in.read_vint()?;
        let chunk_size = meta_in.read_vint()?;
        // Fetch the stored checksum from the data file's footer; this also
        // validates the footer structure without reading the whole file.
        codec_util::retrieve_checksum(vectors_stream.as_mut())?;
        // Index file (.tvx): header-checked, then handed to FieldsIndexReader
        // together with the metadata stream to build the chunk index.
        let tvx_name =
            index_file_names::segment_file_name(segment_name, segment_suffix, INDEX_EXTENSION);
        let mut tvx_input = directory.open_input(&tvx_name)?;
        codec_util::check_index_header(
            tvx_input.as_mut(),
            INDEX_CODEC_IDX,
            VERSION,
            VERSION,
            segment_id,
            segment_suffix,
        )?;
        let index_reader = FieldsIndexReader::open(&mut meta_in, tvx_input.as_ref())?;
        // Trailing metadata: chunk statistics, followed by invariant checks.
        let num_chunks = meta_in.read_vlong()?;
        let num_dirty_chunks = meta_in.read_vlong()?;
        let num_dirty_docs = meta_in.read_vlong()?;
        // Dirty chunks are a subset of all chunks.
        if num_dirty_chunks > num_chunks {
            return Err(io::Error::other(format!(
                "invalid numDirtyChunks: dirty={num_dirty_chunks} total={num_chunks}"
            )));
        }
        // The two dirty counters must be zero together or nonzero together.
        if (num_dirty_chunks == 0) != (num_dirty_docs == 0) {
            return Err(io::Error::other(format!(
                "dirty chunks/docs mismatch: dirtyChunks={num_dirty_chunks} dirtyDocs={num_dirty_docs}"
            )));
        }
        // Every dirty chunk holds at least one doc.
        if num_dirty_docs < num_dirty_chunks {
            return Err(io::Error::other(format!(
                "numDirtyDocs < numDirtyChunks: dirtyDocs={num_dirty_docs} dirtyChunks={num_dirty_chunks}"
            )));
        }
        // Verify the metadata file's footer checksum against everything read.
        codec_util::check_footer(&mut meta_in)?;
        debug!("term_vectors_reader: {num_chunks} chunks for segment {segment_name}");
        Ok(Self {
            vectors_stream,
            index_reader,
            version,
            packed_ints_version,
            chunk_size,
            num_chunks,
            num_dirty_chunks,
            num_dirty_docs,
        })
    }
    /// Total number of chunks recorded in the segment's metadata file.
    pub fn num_chunks(&self) -> i64 {
        self.num_chunks
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::codecs::lucene90::term_vectors;
    use crate::index::indexing_chain::TermVectorDoc;
    use crate::store::{MemoryDirectory, SharedDirectory};
    use assertables::*;

    /// Fresh in-memory directory wrapped for shared access.
    fn test_directory() -> SharedDirectory {
        SharedDirectory::new(Box::new(MemoryDirectory::new()))
    }

    /// Builds a one-field document where every term occurs once at
    /// position 0, with offsets spanning the whole term text.
    fn make_tv_doc(field_number: u32, terms: Vec<&str>) -> TermVectorDoc {
        use crate::index::indexing_chain::{OffsetBuffers, TermVectorField, TermVectorTerm};
        let mut tv_terms = Vec::with_capacity(terms.len());
        for word in terms {
            let end = word.len() as i32;
            tv_terms.push(TermVectorTerm {
                term: word.to_string(),
                freq: 1,
                positions: vec![0],
                offsets: Some(Box::new(OffsetBuffers {
                    start_offsets: vec![0],
                    end_offsets: vec![end],
                })),
            });
        }
        TermVectorDoc {
            fields: vec![TermVectorField {
                field_number,
                has_positions: true,
                has_offsets: true,
                has_payloads: false,
                terms: tv_terms,
            }],
        }
    }

    /// Writes `tv_docs` as segment "_0" into a fresh directory and opens
    /// a reader over the result, panicking on any failure.
    fn write_and_read(tv_docs: &[TermVectorDoc], num_docs: i32) -> CompressingTermVectorsReader {
        let segment_id = [0u8; 16];
        let dir = test_directory();
        term_vectors::write(&dir, "_0", "", &segment_id, tv_docs, num_docs).unwrap();
        let locked = dir.lock().unwrap();
        CompressingTermVectorsReader::open(locked.as_ref(), "_0", "", &segment_id).unwrap()
    }

    #[test]
    fn test_single_doc() {
        let reader = write_and_read(&[make_tv_doc(2, vec!["hello", "world"])], 1);
        assert_eq!(reader.num_chunks(), 1);
    }

    #[test]
    fn test_multiple_docs_one_chunk() {
        // Ten small docs should still land in a single chunk.
        let mut docs: Vec<TermVectorDoc> = Vec::new();
        for _ in 0..10 {
            docs.push(make_tv_doc(2, vec!["term"]));
        }
        let reader = write_and_read(&docs, 10);
        assert_eq!(reader.num_chunks(), 1);
    }

    #[test]
    fn test_multiple_docs_multiple_fields() {
        use crate::index::indexing_chain::{TermVectorField, TermVectorTerm};
        // Positions only, no offsets, one single-occurrence term per field.
        let field = |number: u32, word: &str| TermVectorField {
            field_number: number,
            has_positions: true,
            has_offsets: false,
            has_payloads: false,
            terms: vec![TermVectorTerm {
                term: word.to_string(),
                freq: 1,
                positions: vec![0],
                offsets: None,
            }],
        };
        let docs: Vec<TermVectorDoc> = (0..5)
            .map(|_| TermVectorDoc {
                fields: vec![field(0, "alpha"), field(1, "beta")],
            })
            .collect();
        let reader = write_and_read(&docs, 5);
        assert_ge!(reader.num_chunks(), 1);
    }
}