Struct AsyncSegmentReader

Source

pub struct AsyncSegmentReader { /* private fields */ }

Expand description

Async segment reader with lazy loading

Term dictionary: only index loaded, blocks loaded on-demand
Postings: loaded on-demand per term via HTTP range requests
Document store: only index loaded, blocks loaded on-demand via HTTP range requests

Implementations§

Source §

impl AsyncSegmentReader

Source

pub async fn open<D: Directory>( dir: &D, segment_id: SegmentId, schema: Arc<Schema>, doc_id_offset: DocId, cache_blocks: usize, ) -> Result<Self>

Open a segment with lazy loading

Source

pub fn meta(&self) -> &SegmentMeta

Source

pub fn num_docs(&self) -> u32

Source

pub fn avg_field_len(&self, field: Field) -> f32

Get average field length for BM25F scoring

Source

pub fn doc_id_offset(&self) -> DocId

Source

pub fn set_doc_id_offset(&mut self, offset: DocId)

Set the doc_id_offset (used for parallel segment loading)

Source

pub fn schema(&self) -> &Schema

Source

pub fn sparse_indexes(&self) -> &FxHashMap<u32, SparseIndex>

Get sparse indexes for all fields

Source

pub fn vector_indexes(&self) -> &FxHashMap<u32, VectorIndex>

Get vector indexes for all fields

Source

pub fn flat_vectors(&self) -> &FxHashMap<u32, LazyFlatVectorData>

Get lazy flat vectors for all fields (for reranking and merge)

Source

pub fn term_dict_stats(&self) -> SSTableStats

Get term dictionary stats for debugging

Source

pub fn memory_stats(&self) -> SegmentMemoryStats

Estimate memory usage of this segment reader

Source

pub async fn get_postings( &self, field: Field, term: &[u8], ) -> Result<Option<BlockPostingList>>

Get posting list for a term (async - loads on demand)

For small posting lists (1-3 docs), the data is inlined in the term dictionary and no additional I/O is needed. For larger lists, the data is read from the .post file.

Source

pub async fn doc(&self, local_doc_id: DocId) -> Result<Option<Document>>

Get document by local doc_id (async - loads on demand).

Dense vector fields are hydrated from LazyFlatVectorData (not stored in .store). Uses binary search on sorted doc_ids for O(log N) lookup.

Source

pub async fn doc_with_fields( &self, local_doc_id: DocId, fields: Option<&FxHashSet<u32>>, ) -> Result<Option<Document>>

Get document by local doc_id, hydrating only the specified fields.

If fields is None, all fields (including dense vectors) are hydrated. If fields is Some(set), only dense vector fields in the set are hydrated, skipping expensive mmap reads + dequantization for unrequested vector fields.

Source

pub async fn prefetch_terms( &self, field: Field, start_term: &[u8], end_term: &[u8], ) -> Result<()>

Prefetch term dictionary blocks for a key range

Source

pub fn store_has_dict(&self) -> bool

Check if store uses dictionary compression (incompatible with raw merging)

Source

pub fn store(&self) -> &AsyncStoreReader

Get store reference for merge operations

Source

pub fn store_raw_blocks(&self) -> Vec<RawStoreBlock>

Get raw store blocks for optimized merging

Source

pub fn store_data_slice(&self) -> &LazyFileSlice

Get store data slice for raw block access

Source

pub async fn all_terms(&self) -> Result<Vec<(Vec<u8>, TermInfo)>>

Get all terms from this segment (for merge)

Source

pub async fn all_terms_with_stats(&self) -> Result<Vec<(Field, String, u32)>>

Get all terms with parsed field and term string (for statistics aggregation)

Returns (field, term_string, doc_freq) for each term in the dictionary. Skips terms that aren’t valid UTF-8.

Source

pub fn term_dict_iter(&self) -> AsyncSSTableIterator<'_, TermInfo>

Get streaming iterator over term dictionary (for memory-efficient merge)

Source

pub async fn prefetch_term_dict(&self) -> Result<()>

Prefetch all term dictionary blocks in a single bulk I/O call.

Call before merge iteration to eliminate per-block cache misses.

Source

pub async fn read_postings(&self, offset: u64, len: u32) -> Result<Vec<u8>>

Read raw posting bytes at offset

Source

pub async fn read_position_bytes( &self, offset: u64, len: u32, ) -> Result<Option<Vec<u8>>>

Read raw position bytes at offset (for merge)

Source

pub fn has_positions_file(&self) -> bool

Check if this segment has a positions file

Source

pub async fn search_dense_vector( &self, field: Field, query: &[f32], k: usize, nprobe: usize, rerank_factor: usize, combiner: MultiValueCombiner, ) -> Result<Vec<VectorSearchResult>>

Search dense vectors using RaBitQ

Returns VectorSearchResult with ordinal tracking for multi-value fields. The doc_ids are adjusted by doc_id_offset for this segment. For multi-valued documents, scores are combined using the specified combiner.

Source

pub fn has_dense_vector_index(&self, field: Field) -> bool

Check if this segment has dense vectors for the given field

Source

pub fn get_dense_vector_index(&self, field: Field) -> Option<Arc<RaBitQIndex>>

Get the dense vector index for a field (if available)

Source

pub fn get_ivf_vector_index( &self, field: Field, ) -> Option<(Arc<IVFRaBitQIndex>, Arc<RaBitQCodebook>)>

Get the IVF vector index for a field (if available)

Source

pub fn coarse_centroids(&self, field_id: u32) -> Option<&Arc<CoarseCentroids>>

Get coarse centroids for a field

Source

pub fn set_coarse_centroids( &mut self, centroids: FxHashMap<u32, Arc<CoarseCentroids>>, )

Set per-field coarse centroids from index-level trained structures

Source

pub fn get_scann_vector_index( &self, field: Field, ) -> Option<(Arc<IVFPQIndex>, Arc<PQCodebook>)>

Get the ScaNN vector index for a field (if available)

Source

pub fn get_vector_index(&self, field: Field) -> Option<&VectorIndex>

Get the vector index type for a field

Source

pub async fn search_sparse_vector( &self, field: Field, vector: &[(u32, f32)], limit: usize, combiner: MultiValueCombiner, heap_factor: f32, ) -> Result<Vec<VectorSearchResult>>

Search for similar sparse vectors using dedicated sparse posting lists

Uses shared WandExecutor with SparseTermScorer for efficient top-k retrieval. Optimizations (via WandExecutor):

MaxScore pruning: Dimensions sorted by max contribution
Block-Max WAND: Skips blocks where max contribution < threshold
Top-K heap: Efficient score collection

Returns VectorSearchResult with ordinal tracking for multi-value fields.

Source

pub async fn get_positions( &self, field: Field, term: &[u8], ) -> Result<Option<PositionPostingList>>

Get positions for a term (for phrase queries)

Position offsets are now embedded in TermInfo, so we first look up the term to get its TermInfo, then use position_info() to get the offset.

Source

pub fn has_positions(&self, field: Field) -> bool

Check if positions are available for a field

Auto Trait Implementations§

§

impl !UnwindSafe for AsyncSegmentReader

Blanket Implementations§

Source §

impl<T> Any for T
where T: 'static + ?Sized,

Source §

fn type_id(&self) -> TypeId

Gets the TypeId of self. Read more

Source §

impl<T> Borrow<T> for T
where T: ?Sized,

Source §

fn borrow(&self) -> &T

Immutably borrows from an owned value. Read more

Source §

impl<T> BorrowMut<T> for T
where T: ?Sized,

Source §

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value. Read more

Source §

impl<T> From<T> for T

Source §

fn from(t: T) -> T

Returns the argument unchanged.

Source §

impl<T, U> Into<U> for T
where U: From<T>,

Source §

fn into(self) -> U

Calls U::from(self).

That is, this conversion is whatever the implementation of From<T> for U chooses to do.

Source §

impl<T> IntoEither for T

Source §

fn into_either(self, into_left: bool) -> Either<Self, Self>

Converts self into a Left variant of Either<Self, Self> if into_left is true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more

Source §

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
where F: FnOnce(&Self) -> bool,

Converts self into a Left variant of Either<Self, Self> if into_left(&self) returns true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more

Source §