use std::io;
use log::debug;
use crate::codecs::codec_file_handle::{CodecFileHandle, IndexFile};
use crate::codecs::codec_headers;
use crate::codecs::lucene90::stored_fields_reader::FieldsIndexReader;
use crate::store::{Directory, FileBacking};
/// State captured when a segment's term vectors files are opened via
/// `TermVectorsReader::open`.
///
/// Most fields are populated by `open` but are currently unread, as the
/// `#[expect(dead_code)]` attributes indicate; only `num_chunks` is exposed
/// through an accessor.
pub struct TermVectorsReader {
/// Backing taken from the handle opened with `IndexFile::TermVectorsData`.
#[expect(dead_code)]
tvd: FileBacking,
/// Backing taken from the handle opened with `IndexFile::TermVectorsIndex`.
#[expect(dead_code)]
tvx: FileBacking,
/// Index reader built from the metadata stream by `FieldsIndexReader::open`.
#[expect(dead_code)]
index_reader: FieldsIndexReader,
/// Version reported by the term vectors data file handle.
#[expect(dead_code)]
version: i32,
/// First vint read from the metadata body.
#[expect(dead_code)]
packed_ints_version: i32,
/// Second vint read from the metadata body.
#[expect(dead_code)]
chunk_size: i32,
/// Chunk count read from the metadata body; returned by `num_chunks()`.
num_chunks: i64,
/// Dirty-chunk count read from the metadata body; `open` rejects values
/// greater than `num_chunks`.
#[expect(dead_code)]
num_dirty_chunks: i64,
/// Dirty-document count read from the metadata body; `open` requires it to
/// be zero exactly when `num_dirty_chunks` is zero, and never smaller than
/// `num_dirty_chunks`.
#[expect(dead_code)]
num_dirty_docs: i64,
}
impl TermVectorsReader {
pub fn open(
directory: &dyn Directory,
segment_name: &str,
segment_suffix: &str,
segment_id: &[u8; codec_headers::ID_LENGTH],
) -> io::Result<Self> {
let tvd = CodecFileHandle::open(
directory,
IndexFile::TermVectorsData,
segment_name,
segment_id,
segment_suffix,
)?;
let tvm = CodecFileHandle::open(
directory,
IndexFile::TermVectorsMeta,
segment_name,
segment_id,
segment_suffix,
)?;
let tvx = CodecFileHandle::open(
directory,
IndexFile::TermVectorsIndex,
segment_name,
segment_id,
segment_suffix,
)?;
let version = tvd.version();
let mut meta_in = tvm.body();
let packed_ints_version = meta_in.read_vint()?;
let chunk_size = meta_in.read_vint()?;
let index_reader = FieldsIndexReader::open(&mut meta_in)?;
let num_chunks = meta_in.read_vlong()?;
let num_dirty_chunks = meta_in.read_vlong()?;
let num_dirty_docs = meta_in.read_vlong()?;
if num_dirty_chunks > num_chunks {
return Err(io::Error::other(format!(
"invalid numDirtyChunks: dirty={num_dirty_chunks} total={num_chunks}"
)));
}
if (num_dirty_chunks == 0) != (num_dirty_docs == 0) {
return Err(io::Error::other(format!(
"dirty chunks/docs mismatch: dirtyChunks={num_dirty_chunks} dirtyDocs={num_dirty_docs}"
)));
}
if num_dirty_docs < num_dirty_chunks {
return Err(io::Error::other(format!(
"numDirtyDocs < numDirtyChunks: dirtyDocs={num_dirty_docs} dirtyChunks={num_dirty_chunks}"
)));
}
debug!("term_vectors_reader: {num_chunks} chunks for segment {segment_name}");
Ok(Self {
tvd: tvd.into_backing(),
tvx: tvx.into_backing(),
index_reader,
version,
packed_ints_version,
chunk_size,
num_chunks,
num_dirty_chunks,
num_dirty_docs,
})
}
pub fn num_chunks(&self) -> i64 {
self.num_chunks
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::codecs::lucene90::term_vectors::{CompressingTermVectorsWriter, TermVectorsWriter};
    use crate::store::{MemoryDirectory, SharedDirectory};
    use assertables::*;

    /// Segment name shared by the writer and the reader in every test.
    const SEGMENT: &str = "_0";
    /// Segment suffix shared by the writer and the reader in every test.
    const SUFFIX: &str = "";

    /// Creates a fresh in-memory directory for a single test.
    fn test_directory() -> SharedDirectory {
        MemoryDirectory::create()
    }

    /// All-zero segment id used for both writing and reading.
    fn segment_id() -> [u8; 16] {
        [0u8; 16]
    }

    /// Writes term vectors with `build_fn`, finishes the writer for
    /// `num_docs` documents, then opens a reader over the same directory.
    fn write_and_read(
        num_docs: i32,
        build_fn: impl FnOnce(&mut CompressingTermVectorsWriter),
    ) -> TermVectorsReader {
        let dir = test_directory();
        let id = segment_id();
        {
            // Scope the writer so it is gone before the reader opens the files.
            let mut writer =
                CompressingTermVectorsWriter::new(&dir, SEGMENT, SUFFIX, &id).unwrap();
            build_fn(&mut writer);
            writer.finish(num_docs).unwrap();
        }
        TermVectorsReader::open(&dir, SEGMENT, SUFFIX, &id).unwrap()
    }

    /// One document with a single two-term field is reported as one chunk.
    #[test]
    fn test_single_doc() {
        let reader = write_and_read(1, |writer| {
            writer.start_document(1);
            writer.start_field(2, 2, false, false, false);
            writer.start_term(b"hello", 1);
            writer.finish_term();
            writer.start_term(b"world", 1);
            writer.finish_term();
            writer.finish_field();
            writer.finish_document().unwrap();
        });
        assert_eq!(reader.num_chunks(), 1);
    }

    /// Ten small single-field documents are reported as one chunk.
    #[test]
    fn test_multiple_docs_one_chunk() {
        const DOCS: i32 = 10;
        let reader = write_and_read(DOCS, |writer| {
            for _ in 0..DOCS {
                writer.start_document(1);
                writer.start_field(2, 1, false, false, false);
                writer.start_term(b"term", 1);
                writer.finish_term();
                writer.finish_field();
                writer.finish_document().unwrap();
            }
        });
        assert_eq!(reader.num_chunks(), 1);
    }

    /// Five documents with two fields each yield at least one chunk.
    #[test]
    fn test_multiple_docs_multiple_fields() {
        const DOCS: i32 = 5;
        let reader = write_and_read(DOCS, |writer| {
            for _ in 0..DOCS {
                writer.start_document(2);

                writer.start_field(0, 1, false, false, false);
                writer.start_term(b"alpha", 1);
                writer.finish_term();
                writer.finish_field();

                writer.start_field(1, 1, false, false, false);
                writer.start_term(b"beta", 1);
                writer.finish_term();
                writer.finish_field();

                writer.finish_document().unwrap();
            }
        });
        assert_ge!(reader.num_chunks(), 1);
    }
}