hermes_core/segment/
reader.rs

1//! Async segment reader with lazy loading
2
3use std::sync::Arc;
4
5use rustc_hash::FxHashMap;
6
7use crate::directories::{AsyncFileRead, Directory, LazyFileHandle, LazyFileSlice};
8use crate::dsl::{Document, Field, Schema};
9use crate::structures::{
10    AsyncSSTableReader, BlockPostingList, BlockSparsePostingList, CoarseCentroids, IVFPQIndex,
11    IVFRaBitQIndex, PQCodebook, RaBitQCodebook, RaBitQIndex, SSTableStats, TermInfo,
12};
13use crate::{DocId, Error, Result};
14
15use super::store::{AsyncStoreReader, RawStoreBlock};
16use super::types::{SegmentFiles, SegmentId, SegmentMeta};
17use super::vector_data::FlatVectorData;
18
/// Dense vector index stored for a single field.
///
/// One of four layouts: Flat, RaBitQ, IVF-RaBitQ, or ScaNN (IVF-PQ).
/// All payloads are held behind `Arc`, so cloning a `VectorIndex` only bumps
/// reference counts.
#[derive(Clone)]
// Variant names such as `IVF` are intentionally spelled as acronyms.
#[allow(clippy::upper_case_acronyms)]
pub enum VectorIndex {
    /// Flat - brute-force search over raw vectors (accumulating state)
    Flat(Arc<FlatVectorData>),
    /// RaBitQ - binary quantization, good for small datasets
    RaBitQ(Arc<RaBitQIndex>),
    /// IVF-RaBitQ - inverted file with RaBitQ, good for medium datasets
    IVF {
        /// The inverted-file index itself.
        index: Arc<IVFRaBitQIndex>,
        /// RaBitQ codebook passed to the index at search time.
        codebook: Arc<RaBitQCodebook>,
    },
    /// ScaNN (IVF-PQ) - product quantization with OPQ, best for large datasets
    ScaNN {
        /// The IVF-PQ index itself.
        index: Arc<IVFPQIndex>,
        /// PQ codebook passed to the index at search time.
        codebook: Arc<PQCodebook>,
    },
}
38
/// Sparse vector index for a field: direct-indexed by dimension ID
///
/// The dimension ID is used as the position in `postings`, so a lookup is a
/// plain bounds-checked vector access.
#[derive(Clone)]
pub struct SparseIndex {
    /// Posting lists indexed directly by dimension ID (O(1) lookup)
    /// None means dimension not present in index
    pub postings: Vec<Option<Arc<BlockSparsePostingList>>>,
    /// Total document count in this segment (for IDF computation)
    pub total_docs: u32,
}
48
49impl SparseIndex {
50    /// Compute IDF (inverse document frequency) for a dimension
51    ///
52    /// IDF = log(N / df) where N = total docs, df = docs containing dimension
53    /// Returns 0.0 if dimension not present
54    #[inline]
55    pub fn idf(&self, dim_id: u32) -> f32 {
56        if let Some(Some(pl)) = self.postings.get(dim_id as usize) {
57            let df = pl.doc_count() as f32;
58            if df > 0.0 {
59                (self.total_docs as f32 / df).ln()
60            } else {
61                0.0
62            }
63        } else {
64            0.0
65        }
66    }
67
68    /// Get IDF weights for multiple dimensions
69    pub fn idf_weights(&self, dim_ids: &[u32]) -> Vec<f32> {
70        dim_ids.iter().map(|&d| self.idf(d)).collect()
71    }
72}
73
/// Async segment reader with lazy loading
///
/// - Term dictionary: only index loaded, blocks loaded on-demand
/// - Postings: loaded on-demand per term via range reads (e.g. HTTP range requests)
/// - Document store: only index loaded, blocks loaded on-demand via range reads
/// - Dense and sparse vector indexes: loaded when the segment is opened
pub struct AsyncSegmentReader {
    /// Segment metadata (document count, per-field statistics)
    meta: SegmentMeta,
    /// Term dictionary with lazy block loading
    term_dict: Arc<AsyncSSTableReader<TermInfo>>,
    /// Postings file handle - fetches ranges on demand
    postings_handle: LazyFileHandle,
    /// Document store with lazy block loading
    store: Arc<AsyncStoreReader>,
    /// Schema shared with the rest of the index
    schema: Arc<Schema>,
    /// Base doc_id offset for this segment
    doc_id_offset: DocId,
    /// Dense vector indexes per field id (Flat, RaBitQ, IVF-RaBitQ, or ScaNN)
    vector_indexes: FxHashMap<u32, VectorIndex>,
    /// Shared coarse centroids for IVF / ScaNN search (loaded once)
    coarse_centroids: Option<Arc<CoarseCentroids>>,
    /// Sparse vector indexes per field id
    sparse_indexes: FxHashMap<u32, SparseIndex>,
    /// Position file handle for phrase queries (lazy loading);
    /// `None` when the positions file could not be opened
    positions_handle: Option<LazyFileHandle>,
}
99
100impl AsyncSegmentReader {
101    /// Open a segment with lazy loading
102    pub async fn open<D: Directory>(
103        dir: &D,
104        segment_id: SegmentId,
105        schema: Arc<Schema>,
106        doc_id_offset: DocId,
107        cache_blocks: usize,
108    ) -> Result<Self> {
109        let files = SegmentFiles::new(segment_id.0);
110
111        // Read metadata (small, always loaded)
112        let meta_slice = dir.open_read(&files.meta).await?;
113        let meta_bytes = meta_slice.read_bytes().await?;
114        let meta = SegmentMeta::deserialize(meta_bytes.as_slice())?;
115        debug_assert_eq!(meta.id, segment_id.0);
116
117        // Open term dictionary with lazy loading (fetches ranges on demand)
118        let term_dict_handle = dir.open_lazy(&files.term_dict).await?;
119        let term_dict = AsyncSSTableReader::open(term_dict_handle, cache_blocks).await?;
120
121        // Get postings file handle (lazy - fetches ranges on demand)
122        let postings_handle = dir.open_lazy(&files.postings).await?;
123
124        // Open store with lazy loading
125        let store_handle = dir.open_lazy(&files.store).await?;
126        let store = AsyncStoreReader::open(store_handle, cache_blocks).await?;
127
128        // Load dense vector indexes from unified .vectors file
129        let (vector_indexes, coarse_centroids) =
130            Self::load_vectors_file(dir, &files, &schema).await?;
131
132        // Load sparse vector indexes from .sparse file
133        let sparse_indexes = Self::load_sparse_file(dir, &files, meta.num_docs).await?;
134
135        // Open positions file handle (if exists) - offsets are now in TermInfo
136        let positions_handle = Self::open_positions_file(dir, &files).await?;
137
138        Ok(Self {
139            meta,
140            term_dict: Arc::new(term_dict),
141            postings_handle,
142            store: Arc::new(store),
143            schema,
144            doc_id_offset,
145            vector_indexes,
146            coarse_centroids,
147            sparse_indexes,
148            positions_handle,
149        })
150    }
151
    /// Get the segment metadata that was read when the segment was opened
    pub fn meta(&self) -> &SegmentMeta {
        &self.meta
    }
155
    /// Get the number of documents recorded in the segment metadata
    pub fn num_docs(&self) -> u32 {
        self.meta.num_docs
    }
159
    /// Get average field length for BM25F scoring
    ///
    /// Delegates to the segment metadata.
    pub fn avg_field_len(&self, field: Field) -> f32 {
        self.meta.avg_field_len(field)
    }
164
    /// Get the base doc_id offset this reader was opened with
    pub fn doc_id_offset(&self) -> DocId {
        self.doc_id_offset
    }
168
    /// Get the schema this reader was opened with
    pub fn schema(&self) -> &Schema {
        &self.schema
    }
172
    /// Get sparse indexes for all fields, keyed by field id
    pub fn sparse_indexes(&self) -> &FxHashMap<u32, SparseIndex> {
        &self.sparse_indexes
    }
177
    /// Get dense vector indexes for all fields, keyed by field id
    pub fn vector_indexes(&self) -> &FxHashMap<u32, VectorIndex> {
        &self.vector_indexes
    }
182
    /// Get term dictionary stats for debugging
    ///
    /// Delegates to the term dictionary reader.
    pub fn term_dict_stats(&self) -> SSTableStats {
        self.term_dict.stats()
    }
187
188    /// Get posting list for a term (async - loads on demand)
189    ///
190    /// For small posting lists (1-3 docs), the data is inlined in the term dictionary
191    /// and no additional I/O is needed. For larger lists, reads from .post file.
192    pub async fn get_postings(
193        &self,
194        field: Field,
195        term: &[u8],
196    ) -> Result<Option<BlockPostingList>> {
197        log::debug!(
198            "SegmentReader::get_postings field={} term_len={}",
199            field.0,
200            term.len()
201        );
202
203        // Build key: field_id + term
204        let mut key = Vec::with_capacity(4 + term.len());
205        key.extend_from_slice(&field.0.to_le_bytes());
206        key.extend_from_slice(term);
207
208        // Look up in term dictionary
209        let term_info = match self.term_dict.get(&key).await? {
210            Some(info) => {
211                log::debug!("SegmentReader::get_postings found term_info");
212                info
213            }
214            None => {
215                log::debug!("SegmentReader::get_postings term not found");
216                return Ok(None);
217            }
218        };
219
220        // Check if posting list is inlined
221        if let Some((doc_ids, term_freqs)) = term_info.decode_inline() {
222            // Build BlockPostingList from inline data (no I/O needed!)
223            let mut posting_list = crate::structures::PostingList::with_capacity(doc_ids.len());
224            for (doc_id, tf) in doc_ids.into_iter().zip(term_freqs.into_iter()) {
225                posting_list.push(doc_id, tf);
226            }
227            let block_list = BlockPostingList::from_posting_list(&posting_list)?;
228            return Ok(Some(block_list));
229        }
230
231        // External posting list - read from postings file handle (lazy - HTTP range request)
232        let (posting_offset, posting_len) = term_info.external_info().ok_or_else(|| {
233            Error::Corruption("TermInfo has neither inline nor external data".to_string())
234        })?;
235
236        let start = posting_offset;
237        let end = start + posting_len as u64;
238
239        if end > self.postings_handle.len() {
240            return Err(Error::Corruption(
241                "Posting offset out of bounds".to_string(),
242            ));
243        }
244
245        let posting_bytes = self.postings_handle.read_bytes_range(start..end).await?;
246        let block_list = BlockPostingList::deserialize(&mut posting_bytes.as_slice())?;
247
248        Ok(Some(block_list))
249    }
250
    /// Get document by local doc_id (async - loads on demand)
    ///
    /// Delegates to the store reader, passing this segment's schema; store
    /// errors are converted into the crate's `Error`.
    pub async fn doc(&self, local_doc_id: DocId) -> Result<Option<Document>> {
        self.store
            .get(local_doc_id, &self.schema)
            .await
            .map_err(Error::from)
    }
258
259    /// Prefetch term dictionary blocks for a key range
260    pub async fn prefetch_terms(
261        &self,
262        field: Field,
263        start_term: &[u8],
264        end_term: &[u8],
265    ) -> Result<()> {
266        let mut start_key = Vec::with_capacity(4 + start_term.len());
267        start_key.extend_from_slice(&field.0.to_le_bytes());
268        start_key.extend_from_slice(start_term);
269
270        let mut end_key = Vec::with_capacity(4 + end_term.len());
271        end_key.extend_from_slice(&field.0.to_le_bytes());
272        end_key.extend_from_slice(end_term);
273
274        self.term_dict.prefetch_range(&start_key, &end_key).await?;
275        Ok(())
276    }
277
    /// Check if store uses dictionary compression (incompatible with raw merging)
    ///
    /// Delegates to the store reader.
    pub fn store_has_dict(&self) -> bool {
        self.store.has_dict()
    }
282
    /// Get raw store blocks for optimized merging
    ///
    /// Delegates to the store reader.
    pub fn store_raw_blocks(&self) -> Vec<RawStoreBlock> {
        self.store.raw_blocks()
    }
287
    /// Get store data slice for raw block access
    ///
    /// Delegates to the store reader.
    pub fn store_data_slice(&self) -> &LazyFileSlice {
        self.store.data_slice()
    }
292
    /// Get all terms from this segment (for merge)
    ///
    /// Returns every `(key, TermInfo)` entry of the term dictionary in one
    /// vector; the keys are the raw dictionary keys, not parsed terms.
    pub async fn all_terms(&self) -> Result<Vec<(Vec<u8>, TermInfo)>> {
        self.term_dict.all_entries().await.map_err(Error::from)
    }
297
298    /// Get all terms with parsed field and term string (for statistics aggregation)
299    ///
300    /// Returns (field, term_string, doc_freq) for each term in the dictionary.
301    /// Skips terms that aren't valid UTF-8.
302    pub async fn all_terms_with_stats(&self) -> Result<Vec<(Field, String, u32)>> {
303        let entries = self.term_dict.all_entries().await?;
304        let mut result = Vec::with_capacity(entries.len());
305
306        for (key, term_info) in entries {
307            // Key format: field_id (4 bytes little-endian) + term bytes
308            if key.len() > 4 {
309                let field_id = u32::from_le_bytes([key[0], key[1], key[2], key[3]]);
310                let term_bytes = &key[4..];
311                if let Ok(term_str) = std::str::from_utf8(term_bytes) {
312                    result.push((Field(field_id), term_str.to_string(), term_info.doc_freq()));
313                }
314            }
315        }
316
317        Ok(result)
318    }
319
    /// Get streaming iterator over term dictionary (for memory-efficient merge)
    ///
    /// The iterator borrows from this reader.
    pub fn term_dict_iter(&self) -> crate::structures::AsyncSSTableIterator<'_, TermInfo> {
        self.term_dict.iter()
    }
324
325    /// Read raw posting bytes at offset
326    pub async fn read_postings(&self, offset: u64, len: u32) -> Result<Vec<u8>> {
327        let start = offset;
328        let end = start + len as u64;
329        let bytes = self.postings_handle.read_bytes_range(start..end).await?;
330        Ok(bytes.to_vec())
331    }
332
333    /// Search dense vectors using RaBitQ
334    ///
335    /// Returns (doc_id, score) pairs sorted by score (descending).
336    /// The doc_ids are adjusted by doc_id_offset for this segment.
337    /// If mrl_dim is configured, the query vector is automatically trimmed.
338    /// For multi-valued documents, scores are combined using the specified combiner.
339    pub fn search_dense_vector(
340        &self,
341        field: Field,
342        query: &[f32],
343        k: usize,
344        rerank_factor: usize,
345        combiner: crate::query::MultiValueCombiner,
346    ) -> Result<Vec<(DocId, f32)>> {
347        use crate::query::MultiValueCombiner;
348        let index = self
349            .vector_indexes
350            .get(&field.0)
351            .ok_or_else(|| Error::Schema(format!("No dense vector index for field {}", field.0)))?;
352
353        // Get mrl_dim from config to trim query vector if needed
354        let mrl_dim = self
355            .schema
356            .get_field_entry(field)
357            .and_then(|e| e.dense_vector_config.as_ref())
358            .and_then(|c| c.mrl_dim);
359
360        // Trim query vector if mrl_dim is set
361        let query_vec: Vec<f32>;
362        let effective_query = if let Some(trim_dim) = mrl_dim {
363            if trim_dim < query.len() {
364                query_vec = query[..trim_dim].to_vec();
365                query_vec.as_slice()
366            } else {
367                query
368            }
369        } else {
370            query
371        };
372
373        let results: Vec<(u32, f32)> = match index {
374            VectorIndex::Flat(flat_data) => {
375                // Brute-force search over raw vectors using SIMD-accelerated distance
376                use crate::structures::simd::squared_euclidean_distance;
377
378                let mut candidates: Vec<(u32, f32)> = flat_data
379                    .vectors
380                    .iter()
381                    .zip(flat_data.doc_ids.iter())
382                    .map(|(vec, &doc_id)| {
383                        let dist = squared_euclidean_distance(effective_query, vec);
384                        (doc_id, dist)
385                    })
386                    .collect();
387                candidates
388                    .sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal));
389                candidates.truncate(k);
390                candidates
391            }
392            VectorIndex::RaBitQ(rabitq) => rabitq
393                .search(effective_query, k, rerank_factor)
394                .into_iter()
395                .map(|(idx, dist)| (idx as u32, dist))
396                .collect(),
397            VectorIndex::IVF { index, codebook } => {
398                let centroids = self.coarse_centroids.as_ref().ok_or_else(|| {
399                    Error::Schema("IVF index requires coarse centroids".to_string())
400                })?;
401                let nprobe = rerank_factor.max(32); // Use rerank_factor as nprobe hint
402                index.search(centroids, codebook, effective_query, k, Some(nprobe))
403            }
404            VectorIndex::ScaNN { index, codebook } => {
405                let centroids = self.coarse_centroids.as_ref().ok_or_else(|| {
406                    Error::Schema("ScaNN index requires coarse centroids".to_string())
407                })?;
408                let nprobe = rerank_factor.max(32);
409                index.search(centroids, codebook, effective_query, k, Some(nprobe))
410            }
411        };
412
413        // Convert distance to score (smaller distance = higher score)
414        // and adjust doc_ids by segment offset
415        let raw_results: Vec<(DocId, f32)> = results
416            .into_iter()
417            .map(|(idx, dist)| {
418                let doc_id = idx as DocId + self.doc_id_offset;
419                let score = 1.0 / (1.0 + dist); // Convert distance to similarity score
420                (doc_id, score)
421            })
422            .collect();
423
424        // Combine scores for duplicate doc_ids (multi-valued documents)
425        let mut combined: rustc_hash::FxHashMap<DocId, (f32, u32)> =
426            rustc_hash::FxHashMap::default();
427        for (doc_id, score) in raw_results {
428            combined
429                .entry(doc_id)
430                .and_modify(|(acc_score, count)| match combiner {
431                    MultiValueCombiner::Sum => *acc_score += score,
432                    MultiValueCombiner::Max => *acc_score = acc_score.max(score),
433                    MultiValueCombiner::Avg => {
434                        *acc_score += score;
435                        *count += 1;
436                    }
437                })
438                .or_insert((score, 1));
439        }
440
441        // Finalize averages and collect results
442        let mut final_results: Vec<(DocId, f32)> = combined
443            .into_iter()
444            .map(|(doc_id, (score, count))| {
445                let final_score = if combiner == MultiValueCombiner::Avg {
446                    score / count as f32
447                } else {
448                    score
449                };
450                (doc_id, final_score)
451            })
452            .collect();
453
454        // Sort by score descending and take top k
455        final_results.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
456        final_results.truncate(k);
457
458        Ok(final_results)
459    }
460
    /// Check if this segment has a dense vector index (of any type) for the given field
    pub fn has_dense_vector_index(&self, field: Field) -> bool {
        self.vector_indexes.contains_key(&field.0)
    }
465
466    /// Get the dense vector index for a field (if available)
467    pub fn get_dense_vector_index(&self, field: Field) -> Option<Arc<RaBitQIndex>> {
468        match self.vector_indexes.get(&field.0) {
469            Some(VectorIndex::RaBitQ(idx)) => Some(idx.clone()),
470            _ => None,
471        }
472    }
473
474    /// Get the IVF vector index for a field (if available)
475    pub fn get_ivf_vector_index(&self, field: Field) -> Option<Arc<IVFRaBitQIndex>> {
476        match self.vector_indexes.get(&field.0) {
477            Some(VectorIndex::IVF { index, .. }) => Some(index.clone()),
478            _ => None,
479        }
480    }
481
482    /// Get the ScaNN vector index for a field (if available)
483    pub fn get_scann_vector_index(
484        &self,
485        field: Field,
486    ) -> Option<(Arc<IVFPQIndex>, Arc<PQCodebook>)> {
487        match self.vector_indexes.get(&field.0) {
488            Some(VectorIndex::ScaNN { index, codebook }) => Some((index.clone(), codebook.clone())),
489            _ => None,
490        }
491    }
492
    /// Get the dense vector index for a field, whatever its type
    ///
    /// Returns `None` when the field has no dense vector index in this segment.
    pub fn get_vector_index(&self, field: Field) -> Option<&VectorIndex> {
        self.vector_indexes.get(&field.0)
    }
497
498    /// Search for similar sparse vectors using dedicated sparse posting lists
499    ///
500    /// Uses shared `WandExecutor` with `SparseTermScorer` for efficient top-k retrieval.
501    /// Optimizations (via WandExecutor):
502    /// 1. **MaxScore pruning**: Dimensions sorted by max contribution
503    /// 2. **Block-Max WAND**: Skips blocks where max contribution < threshold
504    /// 3. **Top-K heap**: Efficient score collection
505    ///
506    /// Returns (doc_id, score) pairs sorted by score descending.
507    pub async fn search_sparse_vector(
508        &self,
509        field: Field,
510        vector: &[(u32, f32)],
511        limit: usize,
512        combiner: crate::query::MultiValueCombiner,
513    ) -> Result<Vec<(u32, f32)>> {
514        use crate::query::{MultiValueCombiner, SparseTermScorer, WandExecutor};
515
516        // Get sparse index for this field
517        let sparse_index = match self.sparse_indexes.get(&field.0) {
518            Some(idx) => idx,
519            None => return Ok(Vec::new()),
520        };
521
522        // Build scorers for each dimension that exists in the index
523        let scorers: Vec<SparseTermScorer> = vector
524            .iter()
525            .filter_map(|&(dim_id, query_weight)| {
526                // Direct indexing: O(1) lookup
527                sparse_index
528                    .postings
529                    .get(dim_id as usize)
530                    .and_then(|opt| opt.as_ref())
531                    .map(|pl| SparseTermScorer::from_arc(pl, query_weight))
532            })
533            .collect();
534
535        if scorers.is_empty() {
536            return Ok(Vec::new());
537        }
538
539        // Use shared WandExecutor for top-k retrieval
540        // Note: For multi-valued fields, same doc_id may appear multiple times
541        // with different scores that need to be combined
542        let raw_results = WandExecutor::new(scorers, limit * 2).execute(); // Over-fetch for combining
543
544        // Combine scores for duplicate doc_ids based on combiner strategy
545        let mut combined: rustc_hash::FxHashMap<u32, (f32, u32)> = rustc_hash::FxHashMap::default();
546        for r in raw_results {
547            combined
548                .entry(r.doc_id)
549                .and_modify(|(score, count)| match combiner {
550                    MultiValueCombiner::Sum => *score += r.score,
551                    MultiValueCombiner::Max => *score = score.max(r.score),
552                    MultiValueCombiner::Avg => {
553                        *score += r.score;
554                        *count += 1;
555                    }
556                })
557                .or_insert((r.score, 1));
558        }
559
560        // Finalize averages and collect results
561        let mut results: Vec<(u32, f32)> = combined
562            .into_iter()
563            .map(|(doc_id, (score, count))| {
564                let final_score = if combiner == MultiValueCombiner::Avg {
565                    score / count as f32
566                } else {
567                    score
568                };
569                (doc_id, final_score)
570            })
571            .collect();
572
573        // Sort by score descending and take top limit
574        results.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
575        results.truncate(limit);
576
577        Ok(results)
578    }
579
580    /// Load dense vector indexes from unified .vectors file
581    ///
582    /// Supports RaBitQ (type 0), IVF-RaBitQ (type 1), and ScaNN (type 2).
583    /// Also loads coarse centroids and PQ codebook as needed.
584    ///
585    /// Memory optimization: Uses lazy range reads to load each index separately,
586    /// avoiding loading the entire vectors file into memory at once.
587    async fn load_vectors_file<D: Directory>(
588        dir: &D,
589        files: &SegmentFiles,
590        schema: &Schema,
591    ) -> Result<(FxHashMap<u32, VectorIndex>, Option<Arc<CoarseCentroids>>)> {
592        use byteorder::{LittleEndian, ReadBytesExt};
593        use std::io::Cursor;
594
595        let mut indexes = FxHashMap::default();
596        let mut coarse_centroids: Option<Arc<CoarseCentroids>> = None;
597
598        // Skip loading vectors file if schema has no dense vector fields
599        let has_dense_vectors = schema
600            .fields()
601            .any(|(_, entry)| entry.dense_vector_config.is_some());
602        if !has_dense_vectors {
603            return Ok((indexes, None));
604        }
605
606        // Try to open vectors file (may not exist if no vectors were indexed)
607        let handle = match dir.open_lazy(&files.vectors).await {
608            Ok(h) => h,
609            Err(_) => return Ok((indexes, None)),
610        };
611
612        // Read only the header first (4 bytes for num_fields)
613        let header_bytes = match handle.read_bytes_range(0..4).await {
614            Ok(b) => b,
615            Err(_) => return Ok((indexes, None)),
616        };
617
618        if header_bytes.is_empty() {
619            return Ok((indexes, None));
620        }
621
622        let mut cursor = Cursor::new(header_bytes.as_slice());
623        let num_fields = cursor.read_u32::<LittleEndian>()?;
624
625        if num_fields == 0 {
626            return Ok((indexes, None));
627        }
628
629        // Read field entries header: (field_id: 4, index_type: 1, offset: 8, length: 8) = 21 bytes per field
630        let entries_size = num_fields as u64 * 21;
631        let entries_bytes = handle.read_bytes_range(4..4 + entries_size).await?;
632        let mut cursor = Cursor::new(entries_bytes.as_slice());
633
634        // Read field entries (field_id, index_type, offset, length)
635        let mut entries = Vec::with_capacity(num_fields as usize);
636        for _ in 0..num_fields {
637            let field_id = cursor.read_u32::<LittleEndian>()?;
638            // Try to read index_type - if this fails, assume old format without type
639            let index_type = cursor.read_u8().unwrap_or(255); // 255 = unknown/legacy
640            let offset = cursor.read_u64::<LittleEndian>()?;
641            let length = cursor.read_u64::<LittleEndian>()?;
642            entries.push((field_id, index_type, offset, length));
643        }
644
645        // Load each index on-demand using range reads (memory efficient)
646        for (field_id, index_type, offset, length) in entries {
647            // Read only this index's data
648            let data = handle.read_bytes_range(offset..offset + length).await?;
649            let _field = crate::dsl::Field(field_id);
650
651            match index_type {
652                3 => {
653                    // Flat (brute-force) - raw vectors for accumulating state
654                    if let Ok(flat_data) = serde_json::from_slice::<FlatVectorData>(data.as_slice())
655                    {
656                        indexes.insert(field_id, VectorIndex::Flat(Arc::new(flat_data)));
657                    }
658                }
659                2 => {
660                    // ScaNN (IVF-PQ) with embedded centroids and codebook
661                    use super::vector_data::ScaNNIndexData;
662                    if let Ok(scann_data) = ScaNNIndexData::from_bytes(data.as_slice()) {
663                        coarse_centroids = Some(Arc::new(scann_data.centroids));
664                        indexes.insert(
665                            field_id,
666                            VectorIndex::ScaNN {
667                                index: Arc::new(scann_data.index),
668                                codebook: Arc::new(scann_data.codebook),
669                            },
670                        );
671                    }
672                }
673                1 => {
674                    // IVF-RaBitQ with embedded centroids and codebook
675                    use super::vector_data::IVFRaBitQIndexData;
676                    if let Ok(ivf_data) = IVFRaBitQIndexData::from_bytes(data.as_slice()) {
677                        coarse_centroids = Some(Arc::new(ivf_data.centroids));
678                        indexes.insert(
679                            field_id,
680                            VectorIndex::IVF {
681                                index: Arc::new(ivf_data.index),
682                                codebook: Arc::new(ivf_data.codebook),
683                            },
684                        );
685                    }
686                }
687                0 => {
688                    // RaBitQ (standalone)
689                    if let Ok(rabitq_index) = serde_json::from_slice::<RaBitQIndex>(data.as_slice())
690                    {
691                        indexes.insert(field_id, VectorIndex::RaBitQ(Arc::new(rabitq_index)));
692                    }
693                }
694                _ => {
695                    // Unknown type - try Flat first (most common in new indexes)
696                    if let Ok(flat_data) = serde_json::from_slice::<FlatVectorData>(data.as_slice())
697                    {
698                        indexes.insert(field_id, VectorIndex::Flat(Arc::new(flat_data)));
699                    } else if let Ok(rabitq_index) =
700                        serde_json::from_slice::<RaBitQIndex>(data.as_slice())
701                    {
702                        indexes.insert(field_id, VectorIndex::RaBitQ(Arc::new(rabitq_index)));
703                    }
704                }
705            }
706        }
707
708        Ok((indexes, coarse_centroids))
709    }
710
711    /// Load sparse vector indexes from .sparse file
712    ///
713    /// File format (direct-indexed table for O(1) dimension lookup):
714    /// - Header: num_fields (u32)
715    /// - For each field:
716    ///   - field_id (u32)
717    ///   - quantization (u8)
718    ///   - max_dim_id (u32)          ← table size
719    ///   - table: [(offset: u64, length: u32)] × max_dim_id  ← direct indexed
720    ///     (offset=0, length=0 means dimension not present)
721    /// - Data: concatenated serialized BlockSparsePostingList
722    async fn load_sparse_file<D: Directory>(
723        dir: &D,
724        files: &SegmentFiles,
725        total_docs: u32,
726    ) -> Result<FxHashMap<u32, SparseIndex>> {
727        use byteorder::{LittleEndian, ReadBytesExt};
728        use std::io::Cursor;
729
730        let mut indexes = FxHashMap::default();
731
732        // Try to open sparse file (may not exist if no sparse vectors were indexed)
733        let handle = match dir.open_lazy(&files.sparse).await {
734            Ok(h) => h,
735            Err(_) => return Ok(indexes),
736        };
737
738        // Read the entire file (sparse files are typically small enough)
739        let data = match handle.read_bytes().await {
740            Ok(d) => d,
741            Err(_) => return Ok(indexes),
742        };
743
744        if data.len() < 4 {
745            return Ok(indexes);
746        }
747
748        let mut cursor = Cursor::new(data.as_slice());
749        let num_fields = cursor.read_u32::<LittleEndian>()?;
750
751        if num_fields == 0 {
752            return Ok(indexes);
753        }
754
755        // Read field entries and build indexes
756        for _ in 0..num_fields {
757            let field_id = cursor.read_u32::<LittleEndian>()?;
758            let _quantization = cursor.read_u8()?; // Already stored in each BlockSparsePostingList
759            let max_dim_id = cursor.read_u32::<LittleEndian>()?;
760
761            // Read direct-indexed table
762            let mut postings: Vec<Option<Arc<BlockSparsePostingList>>> =
763                vec![None; max_dim_id as usize];
764
765            for dim_id in 0..max_dim_id {
766                let offset = cursor.read_u64::<LittleEndian>()?;
767                let length = cursor.read_u32::<LittleEndian>()?;
768
769                // offset=0, length=0 means dimension not present
770                if length > 0 {
771                    let start = offset as usize;
772                    let end = start + length as usize;
773                    if end <= data.len() {
774                        let posting_data = &data.as_slice()[start..end];
775                        if let Ok(posting_list) =
776                            BlockSparsePostingList::deserialize(&mut Cursor::new(posting_data))
777                        {
778                            postings[dim_id as usize] = Some(Arc::new(posting_list));
779                        }
780                    }
781                }
782            }
783
784            indexes.insert(
785                field_id,
786                SparseIndex {
787                    postings,
788                    total_docs,
789                },
790            );
791        }
792
793        Ok(indexes)
794    }
795
796    /// Load position index header from .pos file
797    ///
798    /// File format:
799    /// Open positions file handle (no header parsing needed - offsets are in TermInfo)
800    async fn open_positions_file<D: Directory>(
801        dir: &D,
802        files: &SegmentFiles,
803    ) -> Result<Option<LazyFileHandle>> {
804        // Try to open positions file (may not exist if no positions were indexed)
805        match dir.open_lazy(&files.positions).await {
806            Ok(h) => Ok(Some(h)),
807            Err(_) => Ok(None),
808        }
809    }
810
811    /// Get positions for a term (for phrase queries)
812    ///
813    /// Position offsets are now embedded in TermInfo, so we first look up
814    /// the term to get its TermInfo, then use position_info() to get the offset.
815    pub async fn get_positions(
816        &self,
817        field: Field,
818        term: &[u8],
819    ) -> Result<Option<crate::structures::PositionPostingList>> {
820        use std::io::Cursor;
821
822        // Get positions handle
823        let handle = match &self.positions_handle {
824            Some(h) => h,
825            None => return Ok(None),
826        };
827
828        // Build key: field_id + term
829        let mut key = Vec::with_capacity(4 + term.len());
830        key.extend_from_slice(&field.0.to_le_bytes());
831        key.extend_from_slice(term);
832
833        // Look up term in dictionary to get TermInfo with position offset
834        let term_info = match self.term_dict.get(&key).await? {
835            Some(info) => info,
836            None => return Ok(None),
837        };
838
839        // Get position offset from TermInfo
840        let (offset, length) = match term_info.position_info() {
841            Some((o, l)) => (o, l),
842            None => return Ok(None),
843        };
844
845        // Read the position data
846        let slice = handle.slice(offset..offset + length as u64);
847        let data = slice.read_bytes().await?;
848
849        // Deserialize
850        let mut cursor = Cursor::new(data.as_slice());
851        let pos_list = crate::structures::PositionPostingList::deserialize(&mut cursor)?;
852
853        Ok(Some(pos_list))
854    }
855
856    /// Check if positions are available for a field
857    pub fn has_positions(&self, field: Field) -> bool {
858        // Check schema for position mode on this field
859        if let Some(entry) = self.schema.get_field_entry(field) {
860            entry.positions.is_some()
861        } else {
862            false
863        }
864    }
865}
866
/// Alias for AsyncSegmentReader, the only segment reader type defined in this file.
pub type SegmentReader = AsyncSegmentReader;
hermes_core/segment/reader.rs

hermes_core/segment/
reader.rs