Skip to main content

hermes_core/index/
metadata.rs

1//! Unified index metadata - segments list + vector index state
2//!
3//! This module manages all index-level metadata in a single `metadata.json` file:
4//! - List of committed segments
5//! - Vector index state per field (Flat/Built)
6//! - Trained centroids/codebooks paths
7//!
8//! The workflow is:
9//! 1. During accumulation: segments store Flat vectors, state is Flat
10//! 2. When threshold crossed: train ONCE, update state to Built
11//! 3. On index open: load metadata, skip re-training if already built
12
13use serde::{Deserialize, Serialize};
14use std::collections::HashMap;
15use std::path::Path;
16
17use crate::dsl::VectorIndexType;
18use crate::error::{Error, Result};
19use crate::schema::Schema;
20
/// Name of the index-level metadata file.
///
/// [`IndexMetadata::load`] and [`IndexMetadata::save`] both pass this name,
/// as a path, to the directory they are given.
pub const INDEX_META_FILENAME: &str = "metadata.json";
23
/// State of the vector index for a single field.
///
/// `Flat` is the `Default` and is what [`IndexMetadata::init_field`] assigns;
/// [`IndexMetadata::mark_field_built`] switches a registered field to `Built`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)]
pub enum VectorIndexState {
    /// Still accumulating vectors; searches use Flat (brute-force) scanning.
    #[default]
    Flat,
    /// Index structures have been built; searches use ANN.
    Built {
        /// Total vector count at the time training happened.
        vector_count: usize,
        /// Number of clusters used for the build.
        num_clusters: usize,
    },
}
38
/// Vector index metadata tracked for one field.
///
/// Both file paths are optional and are left out of the serialized form
/// entirely while they are `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FieldVectorMeta {
    /// ID of the field this entry describes.
    pub field_id: u32,
    /// Configured index type (the target type once the field is built).
    pub index_type: VectorIndexType,
    /// Current state (`Flat` or `Built`).
    pub state: VectorIndexState,
    /// Path to the centroids file, relative to the index dir.
    /// Set by `IndexMetadata::mark_field_built`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub centroids_file: Option<String>,
    /// Path to the codebook file, relative to the index dir (used for ScaNN).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub codebook_file: Option<String>,
}
55
/// Unified index metadata - the single source of truth for index state.
///
/// Serialized as JSON by `save` and read back by `load`. `vector_fields` and
/// `total_vectors` fall back to their defaults (empty map, zero) when they
/// are absent from the stored document.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct IndexMetadata {
    /// Format version for compatibility; `new` writes 1.
    /// NOTE(review): `load` deserializes without inspecting this value.
    pub version: u32,
    /// Index schema.
    pub schema: Schema,
    /// Committed segment IDs (hex strings).
    pub segments: Vec<String>,
    /// Per-field vector index metadata, keyed by field ID.
    #[serde(default)]
    pub vector_fields: HashMap<u32, FieldVectorMeta>,
    /// Total vectors across all segments (updated on commit).
    #[serde(default)]
    pub total_vectors: usize,
}
72
73impl IndexMetadata {
74    /// Create new metadata with schema
75    pub fn new(schema: Schema) -> Self {
76        Self {
77            version: 1,
78            schema,
79            segments: Vec::new(),
80            vector_fields: HashMap::new(),
81            total_vectors: 0,
82        }
83    }
84
85    /// Check if a field has been built
86    pub fn is_field_built(&self, field_id: u32) -> bool {
87        self.vector_fields
88            .get(&field_id)
89            .map(|f| matches!(f.state, VectorIndexState::Built { .. }))
90            .unwrap_or(false)
91    }
92
93    /// Get field metadata
94    pub fn get_field_meta(&self, field_id: u32) -> Option<&FieldVectorMeta> {
95        self.vector_fields.get(&field_id)
96    }
97
98    /// Initialize field metadata (called when field is first seen)
99    pub fn init_field(&mut self, field_id: u32, index_type: VectorIndexType) {
100        self.vector_fields
101            .entry(field_id)
102            .or_insert(FieldVectorMeta {
103                field_id,
104                index_type,
105                state: VectorIndexState::Flat,
106                centroids_file: None,
107                codebook_file: None,
108            });
109    }
110
111    /// Mark field as built with trained structures
112    pub fn mark_field_built(
113        &mut self,
114        field_id: u32,
115        vector_count: usize,
116        num_clusters: usize,
117        centroids_file: String,
118        codebook_file: Option<String>,
119    ) {
120        if let Some(field) = self.vector_fields.get_mut(&field_id) {
121            field.state = VectorIndexState::Built {
122                vector_count,
123                num_clusters,
124            };
125            field.centroids_file = Some(centroids_file);
126            field.codebook_file = codebook_file;
127        }
128    }
129
130    /// Check if field should be built based on threshold
131    pub fn should_build_field(&self, field_id: u32, threshold: usize) -> bool {
132        // Don't build if already built
133        if self.is_field_built(field_id) {
134            return false;
135        }
136        // Build if we have enough vectors
137        self.total_vectors >= threshold
138    }
139
140    /// Add a segment
141    pub fn add_segment(&mut self, segment_id: String) {
142        if !self.segments.contains(&segment_id) {
143            self.segments.push(segment_id);
144        }
145    }
146
147    /// Remove segments
148    pub fn remove_segments(&mut self, to_remove: &[String]) {
149        self.segments.retain(|s| !to_remove.contains(s));
150    }
151
152    /// Load from directory
153    pub async fn load<D: crate::directories::Directory>(dir: &D) -> Result<Self> {
154        let path = Path::new(INDEX_META_FILENAME);
155        let slice = dir.open_read(path).await?;
156        let bytes = slice.read_bytes().await?;
157        serde_json::from_slice(bytes.as_slice()).map_err(|e| Error::Serialization(e.to_string()))
158    }
159
160    /// Save to directory
161    pub async fn save<D: crate::directories::DirectoryWriter>(&self, dir: &D) -> Result<()> {
162        let path = Path::new(INDEX_META_FILENAME);
163        let bytes =
164            serde_json::to_vec_pretty(self).map_err(|e| Error::Serialization(e.to_string()))?;
165        dir.write(path, &bytes).await.map_err(Error::Io)
166    }
167
168    /// Load trained centroids and codebooks from index-level files
169    ///
170    /// Returns (centroids_map, codebooks_map) for fields that are Built
171    pub async fn load_trained_structures<D: crate::directories::Directory>(
172        &self,
173        dir: &D,
174    ) -> (
175        rustc_hash::FxHashMap<u32, std::sync::Arc<crate::structures::CoarseCentroids>>,
176        rustc_hash::FxHashMap<u32, std::sync::Arc<crate::structures::PQCodebook>>,
177    ) {
178        use std::sync::Arc;
179
180        let mut centroids = rustc_hash::FxHashMap::default();
181        let mut codebooks = rustc_hash::FxHashMap::default();
182
183        for (field_id, field_meta) in &self.vector_fields {
184            if !matches!(field_meta.state, VectorIndexState::Built { .. }) {
185                continue;
186            }
187
188            // Load centroids
189            if let Some(ref file) = field_meta.centroids_file
190                && let Ok(slice) = dir.open_read(Path::new(file)).await
191                && let Ok(bytes) = slice.read_bytes().await
192                && let Ok(c) =
193                    serde_json::from_slice::<crate::structures::CoarseCentroids>(bytes.as_slice())
194            {
195                centroids.insert(*field_id, Arc::new(c));
196            }
197
198            // Load codebook (for ScaNN)
199            if let Some(ref file) = field_meta.codebook_file
200                && let Ok(slice) = dir.open_read(Path::new(file)).await
201                && let Ok(bytes) = slice.read_bytes().await
202                && let Ok(c) =
203                    serde_json::from_slice::<crate::structures::PQCodebook>(bytes.as_slice())
204            {
205                codebooks.insert(*field_id, Arc::new(c));
206            }
207        }
208
209        (centroids, codebooks)
210    }
211}
212
#[cfg(test)]
mod tests {
    use super::*;

    /// Fresh, empty metadata over the default schema; every test starts here.
    fn empty_meta() -> IndexMetadata {
        IndexMetadata::new(Schema::default())
    }

    /// New metadata is empty, and registering a field does not mark it built.
    #[test]
    fn test_metadata_init() {
        let mut meta = empty_meta();
        assert!(meta.segments.is_empty());
        assert_eq!(meta.total_vectors, 0);
        assert!(!meta.is_field_built(0));

        meta.init_field(0, VectorIndexType::IvfRaBitQ);
        assert!(meta.vector_fields.contains_key(&0));
        assert!(!meta.is_field_built(0));
    }

    /// Segments can be added (without duplicates) and removed.
    #[test]
    fn test_metadata_segments() {
        let mut meta = empty_meta();
        for id in ["abc123", "def456"] {
            meta.add_segment(id.to_string());
        }
        assert_eq!(meta.segments.len(), 2);

        // Re-adding an existing ID must not create a duplicate.
        meta.add_segment("abc123".to_string());
        assert_eq!(meta.segments.len(), 2);

        let doomed = ["abc123".to_string()];
        meta.remove_segments(&doomed);
        assert_eq!(meta.segments.len(), 1);
        assert_eq!(meta.segments[0], "def456");
    }

    /// Marking a registered field as built flips its state and stores the file path.
    #[test]
    fn test_mark_field_built() {
        let mut meta = empty_meta();
        meta.init_field(0, VectorIndexType::IvfRaBitQ);
        meta.total_vectors = 10000;
        assert!(!meta.is_field_built(0));

        meta.mark_field_built(0, 10000, 256, "field_0_centroids.bin".to_string(), None);
        assert!(meta.is_field_built(0));

        let field = meta.get_field_meta(0).unwrap();
        let stored_path = field.centroids_file.as_deref();
        assert_eq!(stored_path, Some("field_0_centroids.bin"));
    }

    /// The build decision follows the threshold until the field is built.
    #[test]
    fn test_should_build_field() {
        let threshold = 1000;
        let mut meta = empty_meta();
        meta.init_field(0, VectorIndexType::IvfRaBitQ);

        // Below threshold
        meta.total_vectors = 500;
        assert!(!meta.should_build_field(0, threshold));

        // Above threshold
        meta.total_vectors = 1500;
        assert!(meta.should_build_field(0, threshold));

        // Already built - should not build again
        meta.mark_field_built(0, 1500, 256, "centroids.bin".to_string(), None);
        assert!(!meta.should_build_field(0, threshold));
    }

    /// Metadata survives a JSON round trip.
    #[test]
    fn test_serialization() {
        let mut meta = empty_meta();
        meta.add_segment("seg1".to_string());
        meta.init_field(0, VectorIndexType::IvfRaBitQ);
        meta.total_vectors = 5000;

        let json = serde_json::to_string_pretty(&meta).unwrap();
        let loaded: IndexMetadata = serde_json::from_str(&json).unwrap();

        assert_eq!(loaded.total_vectors, meta.total_vectors);
        assert_eq!(loaded.segments, meta.segments);
        assert!(loaded.vector_fields.contains_key(&0));
    }
}
299}