Skip to main content

hermes_core/segment/
mod.rs

1pub(crate) mod ann_build;
2#[cfg(feature = "native")]
3mod builder;
4pub(crate) mod format;
5#[cfg(feature = "native")]
6mod merger;
7pub(crate) mod reader;
8#[cfg(feature = "native")]
9pub(crate) mod reorder;
10mod store;
11#[cfg(feature = "native")]
12mod tracker;
13mod types;
14mod vector_data;
15
16#[cfg(feature = "native")]
17pub use builder::{MemoryBreakdown, SegmentBuilder, SegmentBuilderConfig, SegmentBuilderStats};
18#[cfg(feature = "native")]
19pub use merger::{MergeStats, SegmentMerger, delete_segment};
20pub(crate) use reader::BmpIndex;
21pub(crate) use reader::bmp::BMP_SUPERBLOCK_SIZE;
22pub(crate) use reader::bmp::{
23    accumulate_u4_weighted, block_term_postings, compute_block_masks_4bit, find_dim_in_block_data,
24};
25pub(crate) use reader::combine_ordinal_results;
26pub use reader::{SegmentReader, SparseIndex, VectorIndex, VectorSearchResult};
27pub use store::*;
28#[cfg(feature = "native")]
29pub use tracker::{SegmentSnapshot, SegmentTracker};
30pub use types::{FieldStats, SegmentFiles, SegmentId, SegmentMeta, TrainedVectorStructures};
31pub use vector_data::{
32    FlatVectorData, IVFRaBitQIndexData, LazyFlatVectorData, ScaNNIndexData, dequantize_raw,
33};
34
35/// Format byte count as human-readable string
36#[cfg(feature = "native")]
pub(crate) fn format_bytes(bytes: usize) -> String {
    // Binary unit thresholds: 2^10, 2^20 and 2^30 bytes.
    const KIB: usize = 1 << 10;
    const MIB: usize = 1 << 20;
    const GIB: usize = 1 << 30;

    // Express `bytes` as a fractional multiple of the chosen unit.
    let in_units_of = |unit: usize| bytes as f64 / unit as f64;

    // Pick the largest unit that `bytes` reaches; scaled values get two decimals.
    match bytes {
        b if b >= GIB => format!("{:.2} GB", in_units_of(GIB)),
        b if b >= MIB => format!("{:.2} MB", in_units_of(MIB)),
        b if b >= KIB => format!("{:.2} KB", in_units_of(KIB)),
        // Below 1024 bytes the exact integer count is printed, without decimals.
        b => format!("{} B", b),
    }
}
48
/// A [`std::io::Write`] adapter that counts the bytes it forwards.
///
/// This is a concrete, sized type, so it can be handed to generic
/// `serialize<W: Write>` functions — something a bare `dyn StreamingWriter`
/// cannot do because it isn't `Sized`.
#[cfg(feature = "native")]
pub(crate) struct OffsetWriter {
    /// Destination writer that receives every byte.
    inner: Box<dyn crate::directories::StreamingWriter>,
    /// Running total of bytes the destination has accepted.
    offset: u64,
}
58
#[cfg(feature = "native")]
impl OffsetWriter {
    /// Wraps `inner`, starting the byte counter at zero.
    pub(crate) fn new(inner: Box<dyn crate::directories::StreamingWriter>) -> Self {
        let offset = 0;
        OffsetWriter { inner, offset }
    }

    /// Number of bytes written through this adapter so far.
    pub(crate) fn offset(&self) -> u64 {
        self.offset
    }

    /// Consumes the adapter and finalizes the wrapped streaming writer,
    /// returning whatever result the writer's `finish` produces.
    pub(crate) fn finish(self) -> std::io::Result<()> {
        let OffsetWriter { inner, .. } = self;
        inner.finish()
    }
}
75
#[cfg(feature = "native")]
impl std::io::Write for OffsetWriter {
    /// Forwards `buf` to the wrapped writer and, on success, advances the
    /// offset by the number of bytes the writer reports as accepted.
    ///
    /// On error the offset is left unchanged and the error is returned as-is.
    fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
        self.inner.write(buf).map(|accepted| {
            self.offset += accepted as u64;
            accepted
        })
    }

    /// Flushes the wrapped writer; the tracked offset is not touched.
    fn flush(&mut self) -> std::io::Result<()> {
        self.inner.flush()
    }
}
88
#[cfg(all(test, feature = "native"))]
mod tests {
    use super::*;
    use crate::directories::RamDirectory;
    use crate::dsl::SchemaBuilder;
    use std::sync::Arc;

    /// Builds a two-document text segment in a `RamDirectory`, reopens it with
    /// `SegmentReader`, and checks the doc count, postings lookups and
    /// stored-document retrieval.
    #[tokio::test]
    async fn test_async_segment_reader() {
        let mut schema_builder = SchemaBuilder::default();
        let title = schema_builder.add_text_field("title", true, true);
        let schema = Arc::new(schema_builder.build());

        let dir = RamDirectory::new();
        let segment_id = SegmentId::new();

        // Build segment using sync builder
        let config = SegmentBuilderConfig::default();
        let mut builder = SegmentBuilder::new(Arc::clone(&schema), config).unwrap();

        // One document per title, added in this order.
        for text in ["Hello World", "Goodbye World"] {
            let mut doc = crate::dsl::Document::new();
            doc.add_text(title, text);
            builder.add_document(doc).unwrap();
        }

        builder.build(&dir, segment_id, None).await.unwrap();

        // Open with async reader
        let reader = SegmentReader::open(&dir, segment_id, Arc::clone(&schema), 16)
            .await
            .unwrap();

        assert_eq!(reader.num_docs(), 2);

        // Test postings lookup: "hello" is expected in one document, "world" in both.
        for (term, expected_docs) in [(b"hello", 1), (b"world", 2)] {
            let postings = reader.get_postings(title, term).await.unwrap();
            assert!(postings.is_some());
            assert_eq!(postings.unwrap().doc_count(), expected_docs);
        }

        // Test document retrieval
        let first_doc = reader.doc(0).await.unwrap().unwrap();
        assert_eq!(
            first_doc.get_first(title).unwrap().as_text(),
            Some("Hello World")
        );
    }

    /// Indexes three documents with dense vectors (the middle one carrying two
    /// vectors) and checks that the search result for doc 1 reports at most two
    /// ordinals, each of them 0 or 1.
    #[tokio::test]
    async fn test_dense_vector_ordinal_tracking() {
        use crate::query::MultiValueCombiner;

        let mut schema_builder = SchemaBuilder::default();
        // Use simple add method - defaults to Flat index
        let embedding = schema_builder.add_dense_vector_field("embedding", 4, true, true);
        let schema = Arc::new(schema_builder.build());

        let dir = RamDirectory::new();
        let segment_id = SegmentId::new();

        let config = SegmentBuilderConfig::default();
        let mut builder = SegmentBuilder::new(Arc::clone(&schema), config).unwrap();

        // One inner list per document, in insertion order:
        // doc 0 has a single vector, doc 1 is multi-valued (2 vectors),
        // doc 2 has a single vector.
        let vectors_per_doc = [
            vec![vec![1.0, 0.0, 0.0, 0.0]],
            vec![vec![0.0, 1.0, 0.0, 0.0], vec![0.0, 0.0, 1.0, 0.0]],
            vec![vec![0.0, 0.0, 0.0, 1.0]],
        ];
        for vectors in vectors_per_doc {
            let mut doc = crate::dsl::Document::new();
            for vector in vectors {
                doc.add_dense_vector(embedding, vector);
            }
            builder.add_document(doc).unwrap();
        }

        builder.build(&dir, segment_id, None).await.unwrap();

        let reader = SegmentReader::open(&dir, segment_id, Arc::clone(&schema), 16)
            .await
            .unwrap();

        // Query close to doc 1's first vector
        let query = vec![0.0, 0.9, 0.1, 0.0];
        let hits = reader
            .search_dense_vector(embedding, &query, 10, 0, 1.0, MultiValueCombiner::Max)
            .await
            .unwrap();

        // Doc 1 should be in results with ordinal tracking
        let doc1 = hits.iter().find(|hit| hit.doc_id == 1);
        assert!(doc1.is_some(), "Doc 1 should be in results");
        let doc1 = doc1.unwrap();

        // Doc 1 holds two vectors, so at most 2 ordinals may be reported
        let ordinal_count = doc1.ordinals.len();
        assert!(
            ordinal_count <= 2,
            "Doc 1 should have at most 2 ordinals, got {}",
            ordinal_count
        );

        // Check ordinals are valid (0 or 1)
        for (ord, _) in &doc1.ordinals {
            assert!(*ord <= 1, "Ordinal should be 0 or 1, got {}", ord);
        }
    }

    /// Indexes three documents with sparse vectors (the middle one carrying two
    /// vectors) and checks that a query on dimension 0 yields at least two results.
    #[tokio::test]
    async fn test_sparse_vector_ordinal_tracking() {
        use crate::query::{MultiValueCombiner, SparseVectorQuery, TopKCollector, collect_segment};

        let mut schema_builder = SchemaBuilder::default();
        let sparse = schema_builder.add_sparse_vector_field("sparse", true, true);
        let schema = Arc::new(schema_builder.build());

        let dir = RamDirectory::new();
        let segment_id = SegmentId::new();

        let config = SegmentBuilderConfig::default();
        let mut builder = SegmentBuilder::new(Arc::clone(&schema), config).unwrap();

        // One inner list per document, in insertion order:
        // doc 0 has a single sparse vector, doc 1 is multi-valued (2 vectors),
        // doc 2 has a single sparse vector.
        let vectors_per_doc = [
            vec![vec![(0, 1.0), (1, 0.5)]],
            vec![vec![(0, 0.8), (2, 0.3)], vec![(1, 0.9), (3, 0.4)]],
            vec![vec![(2, 1.0), (3, 0.5)]],
        ];
        for vectors in vectors_per_doc {
            let mut doc = crate::dsl::Document::new();
            for vector in vectors {
                doc.add_sparse_vector(sparse, vector);
            }
            builder.add_document(doc).unwrap();
        }

        builder.build(&dir, segment_id, None).await.unwrap();

        let reader = SegmentReader::open(&dir, segment_id, Arc::clone(&schema), 16)
            .await
            .unwrap();

        // Query matching dimension 0 via SparseVectorQuery
        let query =
            SparseVectorQuery::new(sparse, vec![(0, 1.0)]).with_combiner(MultiValueCombiner::Sum);
        let mut collector = TopKCollector::new(10);
        collect_segment(&reader, &query, &mut collector)
            .await
            .unwrap();
        let top_docs = collector.into_sorted_results();

        // Both doc 0 and doc 1 have dimension 0
        assert!(top_docs.len() >= 2, "Should have at least 2 results");
    }
}