// hermes_core/index/metadata.rs

//! Unified index metadata - segments list + vector index state
//!
//! This module manages all index-level metadata in a single `metadata.json` file:
//! - List of committed segments
//! - Vector index state per field (Flat/Built)
//! - Paths to trained centroids/codebooks
//!
//! The workflow is:
//! 1. During accumulation: segments store Flat vectors, state is Flat
//! 2. When the threshold is crossed: train ONCE, update state to Built
//! 3. On index open: load metadata, skip re-training if already built

13use serde::{Deserialize, Serialize};
14use std::collections::HashMap;
15use std::path::Path;
16
17use crate::dsl::VectorIndexType;
18use crate::error::{Error, Result};
19use crate::schema::Schema;
20
/// Name of the index-level metadata file.
///
/// `IndexMetadata::load` and `IndexMetadata::save` hand this bare relative path to the
/// directory implementation they are given.
pub const INDEX_META_FILENAME: &str = "metadata.json";
23
24/// State of vector index for a field
25#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)]
26pub enum VectorIndexState {
27    /// Accumulating vectors - using Flat (brute-force) search
28    #[default]
29    Flat,
30    /// Index structures built - using ANN search
31    Built {
32        /// Total vector count when training happened
33        vector_count: usize,
34        /// Number of clusters used
35        num_clusters: usize,
36    },
37}
38
39/// Per-segment metadata stored in index metadata
40/// This allows merge decisions without loading segment files
41#[derive(Debug, Clone, Serialize, Deserialize)]
42pub struct SegmentMetaInfo {
43    /// Number of documents in this segment
44    pub num_docs: u32,
45}
46
47/// Per-field vector index metadata
48#[derive(Debug, Clone, Serialize, Deserialize)]
49pub struct FieldVectorMeta {
50    /// Field ID
51    pub field_id: u32,
52    /// Configured index type (target type when built)
53    pub index_type: VectorIndexType,
54    /// Current state
55    pub state: VectorIndexState,
56    /// Path to centroids file (relative to index dir)
57    #[serde(skip_serializing_if = "Option::is_none")]
58    pub centroids_file: Option<String>,
59    /// Path to codebook file (relative to index dir, for ScaNN)
60    #[serde(skip_serializing_if = "Option::is_none")]
61    pub codebook_file: Option<String>,
62}
63
64/// Unified index metadata - single source of truth for index state
65#[derive(Debug, Clone, Serialize, Deserialize)]
66pub struct IndexMetadata {
67    /// Version for compatibility
68    pub version: u32,
69    /// Index schema
70    pub schema: Schema,
71    /// Segment metadata: segment_id -> info (doc count, etc.)
72    /// Using HashMap allows O(1) lookup and stores doc counts for merge decisions
73    #[serde(default)]
74    pub segment_metas: HashMap<String, SegmentMetaInfo>,
75    /// Per-field vector index metadata
76    #[serde(default)]
77    pub vector_fields: HashMap<u32, FieldVectorMeta>,
78    /// Total vectors across all segments (updated on commit)
79    #[serde(default)]
80    pub total_vectors: usize,
81}
82
83impl IndexMetadata {
84    /// Create new metadata with schema
85    pub fn new(schema: Schema) -> Self {
86        Self {
87            version: 1,
88            schema,
89            segment_metas: HashMap::new(),
90            vector_fields: HashMap::new(),
91            total_vectors: 0,
92        }
93    }
94
95    /// Get segment IDs as a sorted Vec (deterministic ordering for doc_id_offset assignment)
96    pub fn segment_ids(&self) -> Vec<String> {
97        let mut ids: Vec<String> = self.segment_metas.keys().cloned().collect();
98        ids.sort();
99        ids
100    }
101
102    /// Add or update a segment with its doc count
103    pub fn add_segment(&mut self, segment_id: String, num_docs: u32) {
104        self.segment_metas
105            .insert(segment_id, SegmentMetaInfo { num_docs });
106    }
107
108    /// Remove a segment
109    pub fn remove_segment(&mut self, segment_id: &str) {
110        self.segment_metas.remove(segment_id);
111    }
112
113    /// Check if segment exists
114    pub fn has_segment(&self, segment_id: &str) -> bool {
115        self.segment_metas.contains_key(segment_id)
116    }
117
118    /// Get segment doc count
119    pub fn segment_doc_count(&self, segment_id: &str) -> Option<u32> {
120        self.segment_metas.get(segment_id).map(|m| m.num_docs)
121    }
122
123    /// Check if a field has been built
124    pub fn is_field_built(&self, field_id: u32) -> bool {
125        self.vector_fields
126            .get(&field_id)
127            .map(|f| matches!(f.state, VectorIndexState::Built { .. }))
128            .unwrap_or(false)
129    }
130
131    /// Get field metadata
132    pub fn get_field_meta(&self, field_id: u32) -> Option<&FieldVectorMeta> {
133        self.vector_fields.get(&field_id)
134    }
135
136    /// Initialize field metadata (called when field is first seen)
137    pub fn init_field(&mut self, field_id: u32, index_type: VectorIndexType) {
138        self.vector_fields
139            .entry(field_id)
140            .or_insert(FieldVectorMeta {
141                field_id,
142                index_type,
143                state: VectorIndexState::Flat,
144                centroids_file: None,
145                codebook_file: None,
146            });
147    }
148
149    /// Mark field as built with trained structures
150    pub fn mark_field_built(
151        &mut self,
152        field_id: u32,
153        vector_count: usize,
154        num_clusters: usize,
155        centroids_file: String,
156        codebook_file: Option<String>,
157    ) {
158        if let Some(field) = self.vector_fields.get_mut(&field_id) {
159            field.state = VectorIndexState::Built {
160                vector_count,
161                num_clusters,
162            };
163            field.centroids_file = Some(centroids_file);
164            field.codebook_file = codebook_file;
165        }
166    }
167
168    /// Check if field should be built based on threshold
169    pub fn should_build_field(&self, field_id: u32, threshold: usize) -> bool {
170        // Don't build if already built
171        if self.is_field_built(field_id) {
172            return false;
173        }
174        // Build if we have enough vectors
175        self.total_vectors >= threshold
176    }
177
178    /// Load from directory
179    pub async fn load<D: crate::directories::Directory>(dir: &D) -> Result<Self> {
180        let path = Path::new(INDEX_META_FILENAME);
181        let slice = dir.open_read(path).await?;
182        let bytes = slice.read_bytes().await?;
183        serde_json::from_slice(bytes.as_slice()).map_err(|e| Error::Serialization(e.to_string()))
184    }
185
186    /// Save to directory
187    pub async fn save<D: crate::directories::DirectoryWriter>(&self, dir: &D) -> Result<()> {
188        let path = Path::new(INDEX_META_FILENAME);
189        let bytes =
190            serde_json::to_vec_pretty(self).map_err(|e| Error::Serialization(e.to_string()))?;
191        dir.write(path, &bytes).await.map_err(Error::Io)
192    }
193
194    /// Load trained centroids and codebooks from index-level files
195    ///
196    /// Returns (centroids_map, codebooks_map) for fields that are Built
197    pub async fn load_trained_structures<D: crate::directories::Directory>(
198        &self,
199        dir: &D,
200    ) -> (
201        rustc_hash::FxHashMap<u32, std::sync::Arc<crate::structures::CoarseCentroids>>,
202        rustc_hash::FxHashMap<u32, std::sync::Arc<crate::structures::PQCodebook>>,
203    ) {
204        use std::sync::Arc;
205
206        let mut centroids = rustc_hash::FxHashMap::default();
207        let mut codebooks = rustc_hash::FxHashMap::default();
208
209        for (field_id, field_meta) in &self.vector_fields {
210            if !matches!(field_meta.state, VectorIndexState::Built { .. }) {
211                continue;
212            }
213
214            // Load centroids
215            if let Some(ref file) = field_meta.centroids_file
216                && let Ok(slice) = dir.open_read(Path::new(file)).await
217                && let Ok(bytes) = slice.read_bytes().await
218                && let Ok(c) =
219                    serde_json::from_slice::<crate::structures::CoarseCentroids>(bytes.as_slice())
220            {
221                centroids.insert(*field_id, Arc::new(c));
222            }
223
224            // Load codebook (for ScaNN)
225            if let Some(ref file) = field_meta.codebook_file
226                && let Ok(slice) = dir.open_read(Path::new(file)).await
227                && let Ok(bytes) = slice.read_bytes().await
228                && let Ok(c) =
229                    serde_json::from_slice::<crate::structures::PQCodebook>(bytes.as_slice())
230            {
231                codebooks.insert(*field_id, Arc::new(c));
232            }
233        }
234
235        (centroids, codebooks)
236    }
237}
238
#[cfg(test)]
mod tests {
    use super::*;

    /// Schema used by every test; the default schema is sufficient here.
    fn test_schema() -> Schema {
        Schema::default()
    }

    /// Fresh metadata is empty, and initializing a field leaves it un-built.
    #[test]
    fn test_metadata_init() {
        let mut meta = IndexMetadata::new(test_schema());
        assert_eq!(meta.total_vectors, 0);
        assert!(meta.segment_metas.is_empty());
        assert!(!meta.is_field_built(0));

        meta.init_field(0, VectorIndexType::IvfRaBitQ);
        assert!(!meta.is_field_built(0));
        assert!(meta.vector_fields.contains_key(&0));
    }

    /// Segments can be added, overwritten, listed in sorted order and removed.
    #[test]
    fn test_metadata_segments() {
        let mut meta = IndexMetadata::new(test_schema());
        // Inserted out of order on purpose to exercise the sorted listing below.
        meta.add_segment("def456".to_string(), 100);
        meta.add_segment("abc123".to_string(), 50);
        assert_eq!(meta.segment_metas.len(), 2);
        assert_eq!(meta.segment_doc_count("abc123"), Some(50));
        assert_eq!(meta.segment_doc_count("def456"), Some(100));
        assert_eq!(
            meta.segment_ids(),
            vec!["abc123".to_string(), "def456".to_string()]
        );

        // Overwrites existing
        meta.add_segment("abc123".to_string(), 75);
        assert_eq!(meta.segment_metas.len(), 2);
        assert_eq!(meta.segment_doc_count("abc123"), Some(75));

        meta.remove_segment("abc123");
        assert_eq!(meta.segment_metas.len(), 1);
        assert!(meta.has_segment("def456"));
        assert!(!meta.has_segment("abc123"));
        assert_eq!(meta.segment_doc_count("abc123"), None);
    }

    /// Marking an initialized field as built records the state and the file paths.
    #[test]
    fn test_mark_field_built() {
        let mut meta = IndexMetadata::new(test_schema());
        meta.init_field(0, VectorIndexType::IvfRaBitQ);
        meta.total_vectors = 10000;

        assert!(!meta.is_field_built(0));

        meta.mark_field_built(0, 10000, 256, "field_0_centroids.bin".to_string(), None);

        assert!(meta.is_field_built(0));
        let field = meta.get_field_meta(0).unwrap();
        assert_eq!(
            field.state,
            VectorIndexState::Built {
                vector_count: 10000,
                num_clusters: 256,
            }
        );
        assert_eq!(
            field.centroids_file.as_deref(),
            Some("field_0_centroids.bin")
        );
        assert!(field.codebook_file.is_none());
    }

    /// Marking a field that was never initialized leaves the metadata unchanged.
    #[test]
    fn test_mark_field_built_unknown_field() {
        let mut meta = IndexMetadata::new(test_schema());

        meta.mark_field_built(7, 10, 2, "centroids.bin".to_string(), None);

        assert!(meta.get_field_meta(7).is_none());
        assert!(!meta.is_field_built(7));
    }

    /// The build decision follows the threshold and never fires again once built.
    #[test]
    fn test_should_build_field() {
        let mut meta = IndexMetadata::new(test_schema());
        meta.init_field(0, VectorIndexType::IvfRaBitQ);

        // Below threshold
        meta.total_vectors = 500;
        assert!(!meta.should_build_field(0, 1000));

        // Above threshold
        meta.total_vectors = 1500;
        assert!(meta.should_build_field(0, 1000));

        // Already built - should not build again
        meta.mark_field_built(0, 1500, 256, "centroids.bin".to_string(), None);
        assert!(!meta.should_build_field(0, 1000));
    }

    /// A JSON round trip preserves segments, vector fields and the vector total.
    #[test]
    fn test_serialization() {
        let mut meta = IndexMetadata::new(test_schema());
        meta.add_segment("seg1".to_string(), 100);
        meta.init_field(0, VectorIndexType::IvfRaBitQ);
        meta.total_vectors = 5000;

        let json = serde_json::to_string_pretty(&meta).unwrap();
        let loaded: IndexMetadata = serde_json::from_str(&json).unwrap();

        assert_eq!(loaded.segment_ids().len(), meta.segment_ids().len());
        assert_eq!(loaded.segment_doc_count("seg1"), Some(100));
        assert_eq!(loaded.total_vectors, meta.total_vectors);
        assert!(loaded.vector_fields.contains_key(&0));
    }
}