Skip to main content

hermes_core/segment/
mod.rs

1pub(crate) mod ann_build;
2#[cfg(feature = "native")]
3mod builder;
4pub(crate) mod format;
5#[cfg(feature = "native")]
6mod merger;
7mod reader;
8mod store;
9#[cfg(feature = "native")]
10mod tracker;
11mod types;
12mod vector_data;
13
14#[cfg(feature = "native")]
15pub use builder::{MemoryBreakdown, SegmentBuilder, SegmentBuilderConfig, SegmentBuilderStats};
16#[cfg(feature = "native")]
17pub use merger::{MergeStats, SegmentMerger, delete_segment};
18pub(crate) use reader::BmpIndex;
19pub(crate) use reader::bmp::BMP_SUPERBLOCK_SIZE;
20pub(crate) use reader::bmp::{
21    accumulate_u4_weighted, block_term_postings, compute_block_masks_4bit, find_dim_in_block_data,
22};
23pub(crate) use reader::combine_ordinal_results;
24pub use reader::{SegmentReader, SparseIndex, VectorIndex, VectorSearchResult};
25pub use store::*;
26#[cfg(feature = "native")]
27pub use tracker::{SegmentSnapshot, SegmentTracker};
28pub use types::{FieldStats, SegmentFiles, SegmentId, SegmentMeta, TrainedVectorStructures};
29pub use vector_data::{
30    FlatVectorData, IVFRaBitQIndexData, LazyFlatVectorData, ScaNNIndexData, dequantize_raw,
31};
32
33/// Format byte count as human-readable string
34#[cfg(feature = "native")]
///
/// Picks the largest 1024-based unit that `bytes` reaches (GB, MB, KB) and
/// prints the scaled value with two decimals; values below 1024 are printed
/// as a whole number of bytes with a ` B` suffix.
pub(crate) fn format_bytes(bytes: usize) -> String {
    const KB: usize = 1024;
    const MB: usize = KB * 1024;
    const GB: usize = MB * 1024;

    // Scale the count into the chosen unit as a floating-point value.
    let scaled = |unit: usize| bytes as f64 / unit as f64;

    match bytes {
        b if b >= GB => format!("{:.2} GB", scaled(GB)),
        b if b >= MB => format!("{:.2} MB", scaled(MB)),
        b if b >= KB => format!("{:.2} KB", scaled(KB)),
        b => format!("{} B", b),
    }
}
46
/// Byte-counting wrapper around a boxed streaming writer.
///
/// This is a concrete, `Sized` type, so it can be passed to generic
/// `serialize<W: Write>` functions; a bare `dyn StreamingWriter` cannot,
/// because it isn't `Sized`.
#[cfg(feature = "native")]
pub(crate) struct OffsetWriter {
    /// Destination writer that receives every byte.
    inner: Box<dyn crate::directories::StreamingWriter>,
    /// Running total of bytes written through this adapter.
    offset: u64,
}

#[cfg(feature = "native")]
impl OffsetWriter {
    /// Wraps `inner`, starting the byte counter at zero.
    pub(crate) fn new(inner: Box<dyn crate::directories::StreamingWriter>) -> Self {
        OffsetWriter { inner, offset: 0 }
    }

    /// Total number of bytes written so far, i.e. the current write position.
    pub(crate) fn offset(&self) -> u64 {
        self.offset
    }

    /// Consumes the adapter and finalizes the wrapped streaming writer,
    /// returning whatever its `finish` call reports.
    pub(crate) fn finish(self) -> std::io::Result<()> {
        // The byte counter is no longer needed once the stream is finalized.
        let OffsetWriter { inner, .. } = self;
        inner.finish()
    }
}
73
#[cfg(feature = "native")]
impl std::io::Write for OffsetWriter {
    /// Forwards `buf` to the wrapped writer and advances the offset by the
    /// number of bytes that writer reports as written. Errors are returned
    /// unchanged and leave the offset untouched.
    fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
        self.inner.write(buf).map(|written| {
            self.offset += written as u64;
            written
        })
    }

    /// Flushes the wrapped writer; the offset is not affected.
    fn flush(&mut self) -> std::io::Result<()> {
        self.inner.flush()
    }
}
86
87#[cfg(test)]
88#[cfg(feature = "native")]
89mod tests {
90    use super::*;
91    use crate::directories::RamDirectory;
92    use crate::dsl::SchemaBuilder;
93    use std::sync::Arc;
94
95    #[tokio::test]
96    async fn test_async_segment_reader() {
97        let mut schema_builder = SchemaBuilder::default();
98        let title = schema_builder.add_text_field("title", true, true);
99        let schema = Arc::new(schema_builder.build());
100
101        let dir = RamDirectory::new();
102        let segment_id = SegmentId::new();
103
104        // Build segment using sync builder
105        let config = SegmentBuilderConfig::default();
106        let mut builder = SegmentBuilder::new(Arc::clone(&schema), config).unwrap();
107
108        let mut doc = crate::dsl::Document::new();
109        doc.add_text(title, "Hello World");
110        builder.add_document(doc).unwrap();
111
112        let mut doc = crate::dsl::Document::new();
113        doc.add_text(title, "Goodbye World");
114        builder.add_document(doc).unwrap();
115
116        builder.build(&dir, segment_id, None).await.unwrap();
117
118        // Open with async reader
119        let reader = SegmentReader::open(&dir, segment_id, schema.clone(), 16)
120            .await
121            .unwrap();
122
123        assert_eq!(reader.num_docs(), 2);
124
125        // Test postings lookup
126        let postings = reader.get_postings(title, b"hello").await.unwrap();
127        assert!(postings.is_some());
128        assert_eq!(postings.unwrap().doc_count(), 1);
129
130        let postings = reader.get_postings(title, b"world").await.unwrap();
131        assert!(postings.is_some());
132        assert_eq!(postings.unwrap().doc_count(), 2);
133
134        // Test document retrieval
135        let doc = reader.doc(0).await.unwrap().unwrap();
136        assert_eq!(doc.get_first(title).unwrap().as_text(), Some("Hello World"));
137    }
138
139    #[tokio::test]
140    async fn test_dense_vector_ordinal_tracking() {
141        use crate::query::MultiValueCombiner;
142
143        let mut schema_builder = SchemaBuilder::default();
144        // Use simple add method - defaults to Flat index
145        let embedding = schema_builder.add_dense_vector_field("embedding", 4, true, true);
146        let schema = Arc::new(schema_builder.build());
147
148        let dir = RamDirectory::new();
149        let segment_id = SegmentId::new();
150
151        let config = SegmentBuilderConfig::default();
152        let mut builder = SegmentBuilder::new(Arc::clone(&schema), config).unwrap();
153
154        // Doc 0: single vector
155        let mut doc = crate::dsl::Document::new();
156        doc.add_dense_vector(embedding, vec![1.0, 0.0, 0.0, 0.0]);
157        builder.add_document(doc).unwrap();
158
159        // Doc 1: multi-valued vectors (2 vectors)
160        let mut doc = crate::dsl::Document::new();
161        doc.add_dense_vector(embedding, vec![0.0, 1.0, 0.0, 0.0]);
162        doc.add_dense_vector(embedding, vec![0.0, 0.0, 1.0, 0.0]);
163        builder.add_document(doc).unwrap();
164
165        // Doc 2: single vector
166        let mut doc = crate::dsl::Document::new();
167        doc.add_dense_vector(embedding, vec![0.0, 0.0, 0.0, 1.0]);
168        builder.add_document(doc).unwrap();
169
170        builder.build(&dir, segment_id, None).await.unwrap();
171
172        let reader = SegmentReader::open(&dir, segment_id, schema.clone(), 16)
173            .await
174            .unwrap();
175
176        // Query close to doc 1's first vector
177        let query = vec![0.0, 0.9, 0.1, 0.0];
178        let results = reader
179            .search_dense_vector(embedding, &query, 10, 0, 1.0, MultiValueCombiner::Max)
180            .await
181            .unwrap();
182
183        // Doc 1 should be in results with ordinal tracking
184        let doc1_result = results.iter().find(|r| r.doc_id == 1);
185        assert!(doc1_result.is_some(), "Doc 1 should be in results");
186
187        let doc1 = doc1_result.unwrap();
188        // Should have 2 ordinals (0 and 1) for the two vectors
189        assert!(
190            doc1.ordinals.len() <= 2,
191            "Doc 1 should have at most 2 ordinals, got {}",
192            doc1.ordinals.len()
193        );
194
195        // Check ordinals are valid (0 or 1)
196        for (ordinal, _score) in &doc1.ordinals {
197            assert!(*ordinal <= 1, "Ordinal should be 0 or 1, got {}", ordinal);
198        }
199    }
200
201    #[tokio::test]
202    async fn test_sparse_vector_ordinal_tracking() {
203        use crate::query::MultiValueCombiner;
204
205        let mut schema_builder = SchemaBuilder::default();
206        let sparse = schema_builder.add_sparse_vector_field("sparse", true, true);
207        let schema = Arc::new(schema_builder.build());
208
209        let dir = RamDirectory::new();
210        let segment_id = SegmentId::new();
211
212        let config = SegmentBuilderConfig::default();
213        let mut builder = SegmentBuilder::new(Arc::clone(&schema), config).unwrap();
214
215        // Doc 0: single sparse vector
216        let mut doc = crate::dsl::Document::new();
217        doc.add_sparse_vector(sparse, vec![(0, 1.0), (1, 0.5)]);
218        builder.add_document(doc).unwrap();
219
220        // Doc 1: multi-valued sparse vectors (2 vectors)
221        let mut doc = crate::dsl::Document::new();
222        doc.add_sparse_vector(sparse, vec![(0, 0.8), (2, 0.3)]);
223        doc.add_sparse_vector(sparse, vec![(1, 0.9), (3, 0.4)]);
224        builder.add_document(doc).unwrap();
225
226        // Doc 2: single sparse vector
227        let mut doc = crate::dsl::Document::new();
228        doc.add_sparse_vector(sparse, vec![(2, 1.0), (3, 0.5)]);
229        builder.add_document(doc).unwrap();
230
231        builder.build(&dir, segment_id, None).await.unwrap();
232
233        let reader = SegmentReader::open(&dir, segment_id, schema.clone(), 16)
234            .await
235            .unwrap();
236
237        // Query matching dimension 0 via SparseVectorQuery
238        let query = crate::query::SparseVectorQuery::new(sparse, vec![(0, 1.0)])
239            .with_combiner(MultiValueCombiner::Sum);
240        let mut collector = crate::query::TopKCollector::new(10);
241        crate::query::collect_segment(&reader, &query, &mut collector)
242            .await
243            .unwrap();
244        let top_docs = collector.into_sorted_results();
245
246        // Both doc 0 and doc 1 have dimension 0
247        assert!(top_docs.len() >= 2, "Should have at least 2 results");
248    }
249}