Skip to main content

hermes_core/segment/
mod.rs

1pub(crate) mod ann_build;
2#[cfg(feature = "native")]
3mod builder;
4pub(crate) mod format;
5#[cfg(feature = "native")]
6mod merger;
7mod reader;
8mod store;
9#[cfg(feature = "native")]
10mod tracker;
11mod types;
12mod vector_data;
13
14#[cfg(feature = "native")]
15pub use builder::{MemoryBreakdown, SegmentBuilder, SegmentBuilderConfig, SegmentBuilderStats};
16#[cfg(feature = "native")]
17pub use merger::{MergeStats, SegmentMerger, delete_segment};
18pub(crate) use reader::combine_ordinal_results;
19pub use reader::{SegmentReader, SparseIndex, VectorIndex, VectorSearchResult};
20pub use store::*;
21#[cfg(feature = "native")]
22pub use tracker::{SegmentSnapshot, SegmentTracker};
23pub use types::{FieldStats, SegmentFiles, SegmentId, SegmentMeta, TrainedVectorStructures};
24pub use vector_data::{
25    FlatVectorData, IVFRaBitQIndexData, LazyFlatVectorData, ScaNNIndexData, dequantize_raw,
26};
27
28/// Format byte count as human-readable string
29#[cfg(feature = "native")]
///
/// The value is scaled by the largest binary unit (GB, MB or KB, each a power
/// of 1024) that `bytes` reaches and printed with two decimal places. Anything
/// below 1024 is printed as an exact integer followed by `B`.
pub(crate) fn format_bytes(bytes: usize) -> String {
    const KB: usize = 1024;
    const MB: usize = KB * 1024;
    const GB: usize = MB * 1024;

    // Arms are ordered from the largest unit down so the first match wins.
    match bytes {
        b if b >= GB => format!("{:.2} GB", b as f64 / GB as f64),
        b if b >= MB => format!("{:.2} MB", b as f64 / MB as f64),
        b if b >= KB => format!("{:.2} KB", b as f64 / KB as f64),
        b => format!("{} B", b),
    }
}
41
42/// Write adapter that wraps a boxed `StreamingWriter` and counts the bytes written through it.
43///
44/// Concrete type so it works with generic `serialize<W: Write>` functions
45/// (unlike `dyn StreamingWriter` which isn't `Sized`).
46#[cfg(feature = "native")]
47pub(crate) struct OffsetWriter {
48    inner: Box<dyn crate::directories::StreamingWriter>, // wrapped writer that receives every write and flush
49    offset: u64, // running total of the byte counts returned by `inner.write`; starts at 0
50}
51
#[cfg(feature = "native")]
impl OffsetWriter {
    /// Wrap `inner` in a new adapter whose byte counter starts at zero.
    pub(crate) fn new(inner: Box<dyn crate::directories::StreamingWriter>) -> Self {
        OffsetWriter { offset: 0, inner }
    }

    /// Number of bytes written through this adapter so far, i.e. the
    /// position at which the next write will land.
    pub(crate) fn offset(&self) -> u64 {
        let OffsetWriter { offset, .. } = self;
        *offset
    }

    /// Consume the adapter and finalize the wrapped streaming writer,
    /// returning whatever result its `finish` reports.
    pub(crate) fn finish(self) -> std::io::Result<()> {
        let OffsetWriter { inner, .. } = self;
        inner.finish()
    }
}
68
#[cfg(feature = "native")]
impl std::io::Write for OffsetWriter {
    /// Forward `buf` to the wrapped writer and advance the tracked offset by
    /// the number of bytes it reports as written (which may be fewer than
    /// `buf.len()`). On error the offset is left unchanged.
    fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
        self.inner.write(buf).map(|written| {
            self.offset += written as u64;
            written
        })
    }

    /// Flush the wrapped writer; the tracked offset is not affected.
    fn flush(&mut self) -> std::io::Result<()> {
        self.inner.flush()
    }
}
81
#[cfg(test)]
#[cfg(feature = "native")]
mod tests {
    use super::*;
    use crate::directories::RamDirectory;
    use crate::dsl::{Document, SchemaBuilder};
    use crate::query::MultiValueCombiner;
    use std::sync::Arc;

    /// Builds a two-document text segment in a `RamDirectory`, reopens it with
    /// `SegmentReader`, and checks the doc count, postings lookups and stored
    /// document retrieval.
    #[tokio::test]
    async fn test_async_segment_reader() {
        let mut fields = SchemaBuilder::default();
        let title = fields.add_text_field("title", true, true);
        let schema = Arc::new(fields.build());

        let dir = RamDirectory::new();
        let segment_id = SegmentId::new();

        // Index one document per title using the sync builder.
        let mut builder =
            SegmentBuilder::new(Arc::clone(&schema), SegmentBuilderConfig::default()).unwrap();
        for text in ["Hello World", "Goodbye World"] {
            let mut doc = Document::new();
            doc.add_text(title, text);
            builder.add_document(doc).unwrap();
        }
        builder.build(&dir, segment_id, None).await.unwrap();

        // Reopen the freshly written segment through the async reader.
        let reader = SegmentReader::open(&dir, segment_id, Arc::clone(&schema), 16)
            .await
            .unwrap();

        assert_eq!(reader.num_docs(), 2);

        // Postings lookup: "hello" occurs in one document, "world" in both.
        for (term, expected_docs) in [(b"hello", 1), (b"world", 2)] {
            let postings = reader.get_postings(title, term).await.unwrap();
            assert!(postings.is_some());
            assert_eq!(postings.unwrap().doc_count(), expected_docs);
        }

        // Stored document retrieval for doc id 0.
        let first = reader.doc(0).await.unwrap().unwrap();
        assert_eq!(
            first.get_first(title).unwrap().as_text(),
            Some("Hello World")
        );
    }

    /// Indexes three documents with dense vectors (the middle one holding two
    /// vectors) and checks that the search hit for doc 1 carries sane ordinals.
    #[tokio::test]
    async fn test_dense_vector_ordinal_tracking() {
        let mut fields = SchemaBuilder::default();
        // Simple add method; per the original note it defaults to a Flat index.
        let embedding = fields.add_dense_vector_field("embedding", 4, true, true);
        let schema = Arc::new(fields.build());

        let dir = RamDirectory::new();
        let segment_id = SegmentId::new();

        let mut builder =
            SegmentBuilder::new(Arc::clone(&schema), SegmentBuilderConfig::default()).unwrap();

        // One entry per document, in doc-id order: doc 0 and doc 2 hold a
        // single vector, doc 1 is multi-valued with two vectors.
        let vectors_per_doc = [
            vec![vec![1.0, 0.0, 0.0, 0.0]],
            vec![vec![0.0, 1.0, 0.0, 0.0], vec![0.0, 0.0, 1.0, 0.0]],
            vec![vec![0.0, 0.0, 0.0, 1.0]],
        ];
        for vectors in vectors_per_doc {
            let mut doc = Document::new();
            for vector in vectors {
                doc.add_dense_vector(embedding, vector);
            }
            builder.add_document(doc).unwrap();
        }

        builder.build(&dir, segment_id, None).await.unwrap();

        let reader = SegmentReader::open(&dir, segment_id, Arc::clone(&schema), 16)
            .await
            .unwrap();

        // The query lies closest to the first vector stored for doc 1.
        let query = vec![0.0, 0.9, 0.1, 0.0];
        let results = reader
            .search_dense_vector(embedding, &query, 10, 0, 1.0, MultiValueCombiner::Max)
            .await
            .unwrap();

        // Doc 1 must be among the hits.
        let doc1 = results
            .iter()
            .find(|hit| hit.doc_id == 1)
            .expect("Doc 1 should be in results");

        // Doc 1 stored two vectors, so it can report at most two ordinals.
        let ordinal_count = doc1.ordinals.len();
        assert!(
            ordinal_count <= 2,
            "Doc 1 should have at most 2 ordinals, got {}",
            ordinal_count
        );

        // Every reported ordinal must address one of those two vectors.
        for entry in &doc1.ordinals {
            let ordinal = entry.0;
            assert!(ordinal <= 1, "Ordinal should be 0 or 1, got {}", ordinal);
        }
    }

    /// Indexes three documents with sparse vectors (the middle one holding two
    /// vectors) and checks that a query on dimension 0 returns at least the two
    /// documents that contain it.
    #[tokio::test]
    async fn test_sparse_vector_ordinal_tracking() {
        let mut fields = SchemaBuilder::default();
        let sparse = fields.add_sparse_vector_field("sparse", true, true);
        let schema = Arc::new(fields.build());

        let dir = RamDirectory::new();
        let segment_id = SegmentId::new();

        let mut builder =
            SegmentBuilder::new(Arc::clone(&schema), SegmentBuilderConfig::default()).unwrap();

        // One entry per document, in doc-id order: doc 0 and doc 2 hold a
        // single sparse vector, doc 1 is multi-valued with two.
        let sparse_per_doc = [
            vec![vec![(0, 1.0), (1, 0.5)]],
            vec![vec![(0, 0.8), (2, 0.3)], vec![(1, 0.9), (3, 0.4)]],
            vec![vec![(2, 1.0), (3, 0.5)]],
        ];
        for vectors in sparse_per_doc {
            let mut doc = Document::new();
            for vector in vectors {
                doc.add_sparse_vector(sparse, vector);
            }
            builder.add_document(doc).unwrap();
        }

        builder.build(&dir, segment_id, None).await.unwrap();

        let reader = SegmentReader::open(&dir, segment_id, Arc::clone(&schema), 16)
            .await
            .unwrap();

        // Query on dimension 0 only, expressed as a SparseVectorQuery.
        let query = crate::query::SparseVectorQuery::new(sparse, vec![(0, 1.0)])
            .with_combiner(MultiValueCombiner::Sum);
        let mut collector = crate::query::TopKCollector::new(10);
        crate::query::collect_segment(&reader, &query, &mut collector)
            .await
            .unwrap();
        let hit_count = collector.into_sorted_results().len();

        // Doc 0 and doc 1 both carry a weight on dimension 0.
        assert!(hit_count >= 2, "Should have at least 2 results");
    }
}