Skip to main content

hermes_core/segment/
mod.rs

1pub(crate) mod ann_build;
2#[cfg(feature = "native")]
3mod builder;
4pub(crate) mod format;
5#[cfg(feature = "native")]
6mod merger;
7mod reader;
8mod store;
9#[cfg(feature = "native")]
10mod tracker;
11mod types;
12mod vector_data;
13
14#[cfg(feature = "native")]
15pub use builder::simhash::{majority_simhash, simhash_from_sparse_vector};
16#[cfg(feature = "native")]
17pub use builder::{MemoryBreakdown, SegmentBuilder, SegmentBuilderConfig, SegmentBuilderStats};
18#[cfg(feature = "native")]
19pub use merger::{MergeStats, SegmentMerger, delete_segment};
20pub(crate) use reader::BmpIndex;
21pub(crate) use reader::bmp::BMP_SUPERBLOCK_SIZE;
22pub(crate) use reader::bmp::{
23    accumulate_u4_weighted, accumulate_u8_weighted, block_term_postings, compute_block_masks_4bit,
24    find_dim_in_block_data,
25};
26pub(crate) use reader::combine_ordinal_results;
27pub use reader::{SegmentReader, SparseIndex, VectorIndex, VectorSearchResult};
28pub use store::*;
29#[cfg(feature = "native")]
30pub use tracker::{SegmentSnapshot, SegmentTracker};
31pub use types::{FieldStats, SegmentFiles, SegmentId, SegmentMeta, TrainedVectorStructures};
32pub use vector_data::{
33    FlatVectorData, IVFRaBitQIndexData, LazyFlatVectorData, ScaNNIndexData, dequantize_raw,
34};
35
36/// Format byte count as human-readable string
37#[cfg(feature = "native")]
pub(crate) fn format_bytes(bytes: usize) -> String {
    // Unit sizes are powers of 1024 even though the labels read KB/MB/GB.
    const KIB: usize = 1024;
    const MIB: usize = KIB * 1024;
    const GIB: usize = MIB * 1024;

    // Pick the largest unit that `bytes` reaches. Values below 1024 are
    // printed as an exact integer with no fractional part.
    let (unit, label) = match bytes {
        b if b >= GIB => (GIB, "GB"),
        b if b >= MIB => (MIB, "MB"),
        b if b >= KIB => (KIB, "KB"),
        _ => return format!("{} B", bytes),
    };

    // Scaled values always carry two decimal places.
    format!("{:.2} {}", bytes as f64 / unit as f64, label)
}
49
/// Byte-counting wrapper around a boxed streaming writer.
///
/// This is a concrete, `Sized` type, so it can be handed to generic
/// `serialize<W: Write>` functions. A bare `dyn StreamingWriter` cannot be
/// passed there because it is not `Sized`.
#[cfg(feature = "native")]
pub(crate) struct OffsetWriter {
    /// Destination that receives every write.
    inner: Box<dyn crate::directories::StreamingWriter>,
    /// Running total of bytes the destination has reported as written.
    offset: u64,
}
59
#[cfg(feature = "native")]
impl OffsetWriter {
    /// Wraps `inner` with the byte counter starting at zero.
    pub(crate) fn new(inner: Box<dyn crate::directories::StreamingWriter>) -> Self {
        let offset = 0;
        Self { inner, offset }
    }

    /// Number of bytes written through this adapter so far.
    pub(crate) fn offset(&self) -> u64 {
        self.offset
    }

    /// Consumes the adapter and finalizes the wrapped streaming writer,
    /// returning whatever result the wrapped writer's `finish` produces.
    pub(crate) fn finish(self) -> std::io::Result<()> {
        // Only the wrapped writer is needed; the counter is dropped here.
        let Self { inner, .. } = self;
        inner.finish()
    }
}
76
#[cfg(feature = "native")]
impl std::io::Write for OffsetWriter {
    /// Forwards `buf` to the wrapped writer and advances the offset by the
    /// number of bytes the wrapped writer reports as accepted.
    ///
    /// On error the offset is left untouched and the error is returned as-is.
    fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
        self.inner.write(buf).map(|accepted| {
            // Count only what was actually accepted, which may be less than
            // `buf.len()`.
            self.offset += accepted as u64;
            accepted
        })
    }

    /// Delegates flushing to the wrapped writer; the offset does not change.
    fn flush(&mut self) -> std::io::Result<()> {
        self.inner.flush()
    }
}
89
#[cfg(test)]
#[cfg(feature = "native")]
mod tests {
    use super::*;
    use crate::directories::RamDirectory;
    use crate::dsl::SchemaBuilder;
    use std::sync::Arc;

    /// Builds a two-document text segment in a `RamDirectory`, reopens it with
    /// `SegmentReader`, and checks document count, postings and stored docs.
    #[tokio::test]
    async fn test_async_segment_reader() {
        let mut schema_builder = SchemaBuilder::default();
        let title = schema_builder.add_text_field("title", true, true);
        let schema = Arc::new(schema_builder.build());

        let dir = RamDirectory::new();
        let segment_id = SegmentId::new();

        // Build segment using sync builder
        let config = SegmentBuilderConfig::default();
        let mut builder = SegmentBuilder::new(Arc::clone(&schema), config).unwrap();

        let mut doc = crate::dsl::Document::new();
        doc.add_text(title, "Hello World");
        builder.add_document(doc).unwrap();

        let mut doc = crate::dsl::Document::new();
        doc.add_text(title, "Goodbye World");
        builder.add_document(doc).unwrap();

        builder.build(&dir, segment_id, None).await.unwrap();

        // Open with async reader
        let reader = SegmentReader::open(&dir, segment_id, Arc::clone(&schema), 16)
            .await
            .unwrap();

        assert_eq!(reader.num_docs(), 2);

        // Test postings lookup: "hello" appears in one document, "world" in both.
        let hello = reader
            .get_postings(title, b"hello")
            .await
            .unwrap()
            .expect("term `hello` should have postings");
        assert_eq!(hello.doc_count(), 1);

        let world = reader
            .get_postings(title, b"world")
            .await
            .unwrap()
            .expect("term `world` should have postings");
        assert_eq!(world.doc_count(), 2);

        // Test document retrieval
        let doc = reader.doc(0).await.unwrap().unwrap();
        assert_eq!(doc.get_first(title).unwrap().as_text(), Some("Hello World"));
    }

    /// Indexes three documents, one of which carries two dense vectors, and
    /// checks that the multi-valued document is returned with valid ordinals.
    #[tokio::test]
    async fn test_dense_vector_ordinal_tracking() {
        use crate::query::MultiValueCombiner;

        let mut schema_builder = SchemaBuilder::default();
        // Use simple add method - defaults to Flat index
        let embedding = schema_builder.add_dense_vector_field("embedding", 4, true, true);
        let schema = Arc::new(schema_builder.build());

        let dir = RamDirectory::new();
        let segment_id = SegmentId::new();

        let config = SegmentBuilderConfig::default();
        let mut builder = SegmentBuilder::new(Arc::clone(&schema), config).unwrap();

        // Doc 0: single vector
        let mut doc = crate::dsl::Document::new();
        doc.add_dense_vector(embedding, vec![1.0, 0.0, 0.0, 0.0]);
        builder.add_document(doc).unwrap();

        // Doc 1: multi-valued vectors (2 vectors)
        let mut doc = crate::dsl::Document::new();
        doc.add_dense_vector(embedding, vec![0.0, 1.0, 0.0, 0.0]);
        doc.add_dense_vector(embedding, vec![0.0, 0.0, 1.0, 0.0]);
        builder.add_document(doc).unwrap();

        // Doc 2: single vector
        let mut doc = crate::dsl::Document::new();
        doc.add_dense_vector(embedding, vec![0.0, 0.0, 0.0, 1.0]);
        builder.add_document(doc).unwrap();

        builder.build(&dir, segment_id, None).await.unwrap();

        let reader = SegmentReader::open(&dir, segment_id, Arc::clone(&schema), 16)
            .await
            .unwrap();

        // Query close to doc 1's first vector
        let query = vec![0.0, 0.9, 0.1, 0.0];
        let results = reader
            .search_dense_vector(embedding, &query, 10, 0, 1.0, MultiValueCombiner::Max)
            .await
            .unwrap();

        // Doc 1 should be in results with ordinal tracking
        let doc1 = results
            .iter()
            .find(|r| r.doc_id == 1)
            .expect("Doc 1 should be in results");

        // Doc 1 holds two vectors, so at most 2 ordinals can be reported for it.
        assert!(
            doc1.ordinals.len() <= 2,
            "Doc 1 should have at most 2 ordinals, got {}",
            doc1.ordinals.len()
        );

        // Check ordinals are valid (0 or 1)
        for (ordinal, _score) in &doc1.ordinals {
            assert!(*ordinal <= 1, "Ordinal should be 0 or 1, got {}", ordinal);
        }
    }

    /// Indexes three documents with sparse vectors (one multi-valued) and
    /// checks that a query on dimension 0 returns at least the two documents
    /// that contain that dimension.
    #[tokio::test]
    async fn test_sparse_vector_ordinal_tracking() {
        use crate::query::MultiValueCombiner;

        let mut schema_builder = SchemaBuilder::default();
        let sparse = schema_builder.add_sparse_vector_field("sparse", true, true);
        let schema = Arc::new(schema_builder.build());

        let dir = RamDirectory::new();
        let segment_id = SegmentId::new();

        let config = SegmentBuilderConfig::default();
        let mut builder = SegmentBuilder::new(Arc::clone(&schema), config).unwrap();

        // Doc 0: single sparse vector
        let mut doc = crate::dsl::Document::new();
        doc.add_sparse_vector(sparse, vec![(0, 1.0), (1, 0.5)]);
        builder.add_document(doc).unwrap();

        // Doc 1: multi-valued sparse vectors (2 vectors)
        let mut doc = crate::dsl::Document::new();
        doc.add_sparse_vector(sparse, vec![(0, 0.8), (2, 0.3)]);
        doc.add_sparse_vector(sparse, vec![(1, 0.9), (3, 0.4)]);
        builder.add_document(doc).unwrap();

        // Doc 2: single sparse vector
        let mut doc = crate::dsl::Document::new();
        doc.add_sparse_vector(sparse, vec![(2, 1.0), (3, 0.5)]);
        builder.add_document(doc).unwrap();

        builder.build(&dir, segment_id, None).await.unwrap();

        let reader = SegmentReader::open(&dir, segment_id, Arc::clone(&schema), 16)
            .await
            .unwrap();

        // Query matching dimension 0 via SparseVectorQuery
        let query = crate::query::SparseVectorQuery::new(sparse, vec![(0, 1.0)])
            .with_combiner(MultiValueCombiner::Sum);
        let mut collector = crate::query::TopKCollector::new(10);
        crate::query::collect_segment(&reader, &query, &mut collector)
            .await
            .unwrap();
        let top_docs = collector.into_sorted_results();

        // Both doc 0 and doc 1 have dimension 0
        assert!(top_docs.len() >= 2, "Should have at least 2 results");
    }
}