1use serde::{Deserialize, Serialize};
14use std::collections::HashMap;
15use std::path::Path;
16
17use crate::dsl::VectorIndexType;
18use crate::error::{Error, Result};
19use crate::schema::Schema;
20
/// File name, relative to the index directory, under which [`IndexMetadata`]
/// is read by `IndexMetadata::load` and written by `IndexMetadata::save`.
pub const INDEX_META_FILENAME: &str = "metadata.json";
23
24#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)]
26pub enum VectorIndexState {
27 #[default]
29 Flat,
30 Built {
32 vector_count: usize,
34 num_clusters: usize,
36 },
37}
38
39#[derive(Debug, Clone, Serialize, Deserialize)]
42pub struct SegmentMetaInfo {
43 pub num_docs: u32,
45}
46
47#[derive(Debug, Clone, Serialize, Deserialize)]
49pub struct FieldVectorMeta {
50 pub field_id: u32,
52 pub index_type: VectorIndexType,
54 pub state: VectorIndexState,
56 #[serde(skip_serializing_if = "Option::is_none")]
58 pub centroids_file: Option<String>,
59 #[serde(skip_serializing_if = "Option::is_none")]
61 pub codebook_file: Option<String>,
62}
63
64#[derive(Debug, Clone, Serialize, Deserialize)]
66pub struct IndexMetadata {
67 pub version: u32,
69 pub schema: Schema,
71 #[serde(default)]
74 pub segment_metas: HashMap<String, SegmentMetaInfo>,
75 #[serde(default)]
77 pub vector_fields: HashMap<u32, FieldVectorMeta>,
78 #[serde(default)]
80 pub total_vectors: usize,
81}
82
83impl IndexMetadata {
84 pub fn new(schema: Schema) -> Self {
86 Self {
87 version: 1,
88 schema,
89 segment_metas: HashMap::new(),
90 vector_fields: HashMap::new(),
91 total_vectors: 0,
92 }
93 }
94
95 pub fn segment_ids(&self) -> Vec<String> {
97 let mut ids: Vec<String> = self.segment_metas.keys().cloned().collect();
98 ids.sort();
99 ids
100 }
101
102 pub fn add_segment(&mut self, segment_id: String, num_docs: u32) {
104 self.segment_metas
105 .insert(segment_id, SegmentMetaInfo { num_docs });
106 }
107
108 pub fn remove_segment(&mut self, segment_id: &str) {
110 self.segment_metas.remove(segment_id);
111 }
112
113 pub fn has_segment(&self, segment_id: &str) -> bool {
115 self.segment_metas.contains_key(segment_id)
116 }
117
118 pub fn segment_doc_count(&self, segment_id: &str) -> Option<u32> {
120 self.segment_metas.get(segment_id).map(|m| m.num_docs)
121 }
122
123 pub fn is_field_built(&self, field_id: u32) -> bool {
125 self.vector_fields
126 .get(&field_id)
127 .map(|f| matches!(f.state, VectorIndexState::Built { .. }))
128 .unwrap_or(false)
129 }
130
131 pub fn get_field_meta(&self, field_id: u32) -> Option<&FieldVectorMeta> {
133 self.vector_fields.get(&field_id)
134 }
135
136 pub fn init_field(&mut self, field_id: u32, index_type: VectorIndexType) {
138 self.vector_fields
139 .entry(field_id)
140 .or_insert(FieldVectorMeta {
141 field_id,
142 index_type,
143 state: VectorIndexState::Flat,
144 centroids_file: None,
145 codebook_file: None,
146 });
147 }
148
149 pub fn mark_field_built(
151 &mut self,
152 field_id: u32,
153 vector_count: usize,
154 num_clusters: usize,
155 centroids_file: String,
156 codebook_file: Option<String>,
157 ) {
158 if let Some(field) = self.vector_fields.get_mut(&field_id) {
159 field.state = VectorIndexState::Built {
160 vector_count,
161 num_clusters,
162 };
163 field.centroids_file = Some(centroids_file);
164 field.codebook_file = codebook_file;
165 }
166 }
167
168 pub fn should_build_field(&self, field_id: u32, threshold: usize) -> bool {
170 if self.is_field_built(field_id) {
172 return false;
173 }
174 self.total_vectors >= threshold
176 }
177
178 pub async fn load<D: crate::directories::Directory>(dir: &D) -> Result<Self> {
180 let path = Path::new(INDEX_META_FILENAME);
181 let slice = dir.open_read(path).await?;
182 let bytes = slice.read_bytes().await?;
183 serde_json::from_slice(bytes.as_slice()).map_err(|e| Error::Serialization(e.to_string()))
184 }
185
186 pub async fn save<D: crate::directories::DirectoryWriter>(&self, dir: &D) -> Result<()> {
188 let path = Path::new(INDEX_META_FILENAME);
189 let bytes =
190 serde_json::to_vec_pretty(self).map_err(|e| Error::Serialization(e.to_string()))?;
191 dir.write(path, &bytes).await.map_err(Error::Io)
192 }
193
194 pub async fn load_trained_structures<D: crate::directories::Directory>(
198 &self,
199 dir: &D,
200 ) -> (
201 rustc_hash::FxHashMap<u32, std::sync::Arc<crate::structures::CoarseCentroids>>,
202 rustc_hash::FxHashMap<u32, std::sync::Arc<crate::structures::PQCodebook>>,
203 ) {
204 use std::sync::Arc;
205
206 let mut centroids = rustc_hash::FxHashMap::default();
207 let mut codebooks = rustc_hash::FxHashMap::default();
208
209 for (field_id, field_meta) in &self.vector_fields {
210 if !matches!(field_meta.state, VectorIndexState::Built { .. }) {
211 continue;
212 }
213
214 if let Some(ref file) = field_meta.centroids_file
216 && let Ok(slice) = dir.open_read(Path::new(file)).await
217 && let Ok(bytes) = slice.read_bytes().await
218 && let Ok(c) =
219 serde_json::from_slice::<crate::structures::CoarseCentroids>(bytes.as_slice())
220 {
221 centroids.insert(*field_id, Arc::new(c));
222 }
223
224 if let Some(ref file) = field_meta.codebook_file
226 && let Ok(slice) = dir.open_read(Path::new(file)).await
227 && let Ok(bytes) = slice.read_bytes().await
228 && let Ok(c) =
229 serde_json::from_slice::<crate::structures::PQCodebook>(bytes.as_slice())
230 {
231 codebooks.insert(*field_id, Arc::new(c));
232 }
233 }
234
235 (centroids, codebooks)
236 }
237}
238
#[cfg(test)]
mod tests {
    use super::*;

    /// Schema shared by all tests; its contents do not matter here.
    fn test_schema() -> Schema {
        Schema::default()
    }

    /// Fresh metadata built on top of [`test_schema`].
    fn fresh_metadata() -> IndexMetadata {
        IndexMetadata::new(test_schema())
    }

    #[test]
    fn test_metadata_init() {
        let mut metadata = fresh_metadata();

        // A new index is empty and knows no vector fields.
        assert_eq!(metadata.total_vectors, 0);
        assert!(metadata.segment_metas.is_empty());
        assert!(!metadata.is_field_built(0));

        // Registering a field does not mark it as built.
        metadata.init_field(0, VectorIndexType::IvfRaBitQ);
        assert!(!metadata.is_field_built(0));
        assert!(metadata.vector_fields.contains_key(&0));
    }

    #[test]
    fn test_metadata_segments() {
        let mut metadata = fresh_metadata();

        metadata.add_segment("abc123".to_string(), 50);
        metadata.add_segment("def456".to_string(), 100);
        assert_eq!(metadata.segment_metas.len(), 2);
        assert_eq!(metadata.segment_doc_count("abc123"), Some(50));
        assert_eq!(metadata.segment_doc_count("def456"), Some(100));

        // Adding an existing id replaces the entry instead of duplicating it.
        metadata.add_segment("abc123".to_string(), 75);
        assert_eq!(metadata.segment_metas.len(), 2);
        assert_eq!(metadata.segment_doc_count("abc123"), Some(75));

        metadata.remove_segment("abc123");
        assert_eq!(metadata.segment_metas.len(), 1);
        assert!(metadata.has_segment("def456"));
        assert!(!metadata.has_segment("abc123"));
    }

    #[test]
    fn test_mark_field_built() {
        let mut metadata = fresh_metadata();
        metadata.init_field(0, VectorIndexType::IvfRaBitQ);
        metadata.total_vectors = 10000;

        assert!(!metadata.is_field_built(0));

        metadata.mark_field_built(0, 10000, 256, "field_0_centroids.bin".to_string(), None);

        assert!(metadata.is_field_built(0));
        let entry = metadata.get_field_meta(0).unwrap();
        assert_eq!(
            entry.centroids_file.as_deref(),
            Some("field_0_centroids.bin")
        );
    }

    #[test]
    fn test_should_build_field() {
        let mut metadata = fresh_metadata();
        metadata.init_field(0, VectorIndexType::IvfRaBitQ);

        // Below the threshold: no build.
        metadata.total_vectors = 500;
        assert!(!metadata.should_build_field(0, 1000));

        // Above the threshold: build.
        metadata.total_vectors = 1500;
        assert!(metadata.should_build_field(0, 1000));

        // Already built: never build again.
        metadata.mark_field_built(0, 1500, 256, "centroids.bin".to_string(), None);
        assert!(!metadata.should_build_field(0, 1000));
    }

    #[test]
    fn test_serialization() {
        let mut original = fresh_metadata();
        original.add_segment("seg1".to_string(), 100);
        original.init_field(0, VectorIndexType::IvfRaBitQ);
        original.total_vectors = 5000;

        // Round-trip through pretty-printed JSON.
        let encoded = serde_json::to_string_pretty(&original).unwrap();
        let decoded: IndexMetadata = serde_json::from_str(&encoded).unwrap();

        assert_eq!(decoded.segment_ids().len(), original.segment_ids().len());
        assert_eq!(decoded.segment_doc_count("seg1"), Some(100));
        assert_eq!(decoded.total_vectors, original.total_vectors);
        assert!(decoded.vector_fields.contains_key(&0));
    }
}