// Source: ruvector_collections/collection.rs

1//! Collection types and operations
2
3use ruvector_core::types::{DistanceMetric, HnswConfig, QuantizationConfig};
4use ruvector_core::vector_db::VectorDB;
5use serde::{Deserialize, Serialize};
6
7use crate::error::{CollectionError, Result};
8
9/// Configuration for creating a collection
10#[derive(Debug, Clone, Serialize, Deserialize)]
11pub struct CollectionConfig {
12    /// Vector dimensions
13    pub dimensions: usize,
14
15    /// Distance metric for similarity calculation
16    pub distance_metric: DistanceMetric,
17
18    /// HNSW index configuration
19    pub hnsw_config: Option<HnswConfig>,
20
21    /// Quantization configuration
22    pub quantization: Option<QuantizationConfig>,
23
24    /// Whether to store payload data on disk
25    pub on_disk_payload: bool,
26}
27
28impl CollectionConfig {
29    /// Validate the configuration
30    pub fn validate(&self) -> Result<()> {
31        if self.dimensions == 0 {
32            return Err(CollectionError::InvalidConfiguration {
33                message: "Dimensions must be greater than 0".to_string(),
34            });
35        }
36
37        if self.dimensions > 100_000 {
38            return Err(CollectionError::InvalidConfiguration {
39                message: "Dimensions exceeds maximum of 100,000".to_string(),
40            });
41        }
42
43        // Validate HNSW config if present
44        if let Some(ref hnsw_config) = self.hnsw_config {
45            if hnsw_config.m == 0 {
46                return Err(CollectionError::InvalidConfiguration {
47                    message: "HNSW M parameter must be greater than 0".to_string(),
48                });
49            }
50
51            if hnsw_config.ef_construction < hnsw_config.m {
52                return Err(CollectionError::InvalidConfiguration {
53                    message: "HNSW ef_construction must be >= M".to_string(),
54                });
55            }
56
57            if hnsw_config.ef_search == 0 {
58                return Err(CollectionError::InvalidConfiguration {
59                    message: "HNSW ef_search must be greater than 0".to_string(),
60                });
61            }
62        }
63
64        Ok(())
65    }
66
67    /// Create a default configuration for the given dimensions
68    pub fn with_dimensions(dimensions: usize) -> Self {
69        Self {
70            dimensions,
71            distance_metric: DistanceMetric::Cosine,
72            hnsw_config: Some(HnswConfig::default()),
73            quantization: Some(QuantizationConfig::Scalar),
74            on_disk_payload: true,
75        }
76    }
77}
78
79/// A collection of vectors with its own configuration
80pub struct Collection {
81    /// Collection name
82    pub name: String,
83
84    /// Collection configuration
85    pub config: CollectionConfig,
86
87    /// Underlying vector database
88    pub db: VectorDB,
89
90    /// When the collection was created (Unix timestamp in seconds)
91    pub created_at: i64,
92
93    /// When the collection was last updated (Unix timestamp in seconds)
94    pub updated_at: i64,
95}
96
97impl std::fmt::Debug for Collection {
98    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
99        f.debug_struct("Collection")
100            .field("name", &self.name)
101            .field("config", &self.config)
102            .field("created_at", &self.created_at)
103            .field("updated_at", &self.updated_at)
104            .field("db", &"<VectorDB>")
105            .finish()
106    }
107}
108
109impl Collection {
110    /// Create a new collection
111    pub fn new(name: String, config: CollectionConfig, storage_path: String) -> Result<Self> {
112        // Validate configuration
113        config.validate()?;
114
115        // Create VectorDB with the configuration
116        let db_options = ruvector_core::types::DbOptions {
117            dimensions: config.dimensions,
118            distance_metric: config.distance_metric,
119            storage_path,
120            hnsw_config: config.hnsw_config.clone(),
121            quantization: config.quantization.clone(),
122        };
123
124        let db = VectorDB::new(db_options)?;
125
126        let now = std::time::SystemTime::now()
127            .duration_since(std::time::UNIX_EPOCH)
128            .unwrap()
129            .as_secs() as i64;
130
131        Ok(Self {
132            name,
133            config,
134            db,
135            created_at: now,
136            updated_at: now,
137        })
138    }
139
140    /// Get collection statistics
141    pub fn stats(&self) -> Result<CollectionStats> {
142        let vectors_count = self.db.len()?;
143
144        Ok(CollectionStats {
145            vectors_count,
146            segments_count: 1,  // Single segment for now
147            disk_size_bytes: 0, // TODO: Implement disk size calculation
148            ram_size_bytes: 0,  // TODO: Implement RAM size calculation
149        })
150    }
151
152    /// Update the last modified timestamp
153    pub fn touch(&mut self) {
154        self.updated_at = std::time::SystemTime::now()
155            .duration_since(std::time::UNIX_EPOCH)
156            .unwrap()
157            .as_secs() as i64;
158    }
159}
160
161/// Statistics about a collection
162#[derive(Debug, Clone, Serialize, Deserialize)]
163pub struct CollectionStats {
164    /// Number of vectors in the collection
165    pub vectors_count: usize,
166
167    /// Number of segments (partitions) in the collection
168    pub segments_count: usize,
169
170    /// Total disk space used (bytes)
171    pub disk_size_bytes: u64,
172
173    /// Total RAM used (bytes)
174    pub ram_size_bytes: u64,
175}
176
177impl CollectionStats {
178    /// Check if the collection is empty
179    pub fn is_empty(&self) -> bool {
180        self.vectors_count == 0
181    }
182
183    /// Get human-readable disk size
184    pub fn disk_size_human(&self) -> String {
185        format_bytes(self.disk_size_bytes)
186    }
187
188    /// Get human-readable RAM size
189    pub fn ram_size_human(&self) -> String {
190        format_bytes(self.ram_size_bytes)
191    }
192}
193
/// Format a byte count as a human-readable size string.
///
/// Sizes use binary steps (1 KB = 1024 B) and are printed with two decimal
/// places, except for zero, which is rendered as `"0 B"`. TB is the largest
/// unit, so bigger values are shown as a larger number of terabytes.
///
/// A value that would round up to `1024.00` of a unit (for example
/// 1,048,575 bytes) is shown as `1.00` of the next larger unit instead.
fn format_bytes(bytes: u64) -> String {
    const UNITS: &[&str] = &["B", "KB", "MB", "GB", "TB"];
    const STEP: f64 = 1024.0;

    if bytes == 0 {
        return "0 B".to_string();
    }

    let last_unit = UNITS.len() - 1;
    let mut size = bytes as f64;
    let mut unit_idx = 0;

    // Scale down until the value fits the current unit or units run out.
    while size >= STEP && unit_idx < last_unit {
        size /= STEP;
        unit_idx += 1;
    }

    let rendered = format!("{:.2}", size);

    // Rounding to two decimals can lift a value just below the next unit
    // (e.g. 1023.999 KB) to "1024.00". Below the last unit the loop guarantees
    // size < 1024, so that text can only come from rounding: promote it.
    if unit_idx < last_unit && rendered == "1024.00" {
        return format!("1.00 {}", UNITS[unit_idx + 1]);
    }

    format!("{} {}", rendered, UNITS[unit_idx])
}
212
#[cfg(test)]
mod tests {
    use super::*;
    use ruvector_core::types::HnswConfig;

    /// Scratch directory under the system temp dir for tests that need
    /// on-disk storage.
    ///
    /// The directory is recreated empty by `new` and removed again when the
    /// guard is dropped, so it is cleaned up even when an assertion panics.
    /// Declare the guard before any `Collection` that uses it: locals are
    /// dropped in reverse order, so the collection is released before its
    /// directory is removed.
    struct ScratchDir {
        path: std::path::PathBuf,
    }

    impl ScratchDir {
        /// Create (or reset) the directory `<temp dir>/<name>`.
        fn new(name: &str) -> Self {
            let path = std::env::temp_dir().join(name);
            // Leftovers from an earlier aborted run are not an error.
            let _ = std::fs::remove_dir_all(&path);
            std::fs::create_dir_all(&path).expect("create scratch directory");
            Self { path }
        }

        /// Path of the database file inside the scratch directory.
        fn db_path(&self) -> String {
            self.path.join("vectors.db").to_string_lossy().to_string()
        }
    }

    impl Drop for ScratchDir {
        fn drop(&mut self) {
            // Best-effort cleanup; a failure here must not mask a test result.
            let _ = std::fs::remove_dir_all(&self.path);
        }
    }

    // ===== CollectionConfig validation tests =====

    #[test]
    fn test_collection_config_validation() {
        // Valid config
        let config = CollectionConfig::with_dimensions(384);
        assert!(config.validate().is_ok());

        // Invalid: zero dimensions
        let config = CollectionConfig {
            dimensions: 0,
            distance_metric: DistanceMetric::Cosine,
            hnsw_config: None,
            quantization: None,
            on_disk_payload: true,
        };
        assert!(config.validate().is_err());

        // Invalid: dimensions too large
        let config = CollectionConfig {
            dimensions: 200_000,
            distance_metric: DistanceMetric::Cosine,
            hnsw_config: None,
            quantization: None,
            on_disk_payload: true,
        };
        assert!(config.validate().is_err());
    }

    #[test]
    fn test_config_validates_at_boundary_dimensions() {
        // Exactly 1 dimension -- minimum valid
        let config = CollectionConfig {
            dimensions: 1,
            distance_metric: DistanceMetric::Cosine,
            hnsw_config: None,
            quantization: None,
            on_disk_payload: false,
        };
        assert!(config.validate().is_ok());

        // Exactly 100_000 -- maximum valid
        let config = CollectionConfig {
            dimensions: 100_000,
            distance_metric: DistanceMetric::Cosine,
            hnsw_config: None,
            quantization: None,
            on_disk_payload: false,
        };
        assert!(config.validate().is_ok());

        // 100_001 -- just over the limit
        let config = CollectionConfig {
            dimensions: 100_001,
            distance_metric: DistanceMetric::Cosine,
            hnsw_config: None,
            quantization: None,
            on_disk_payload: false,
        };
        assert!(config.validate().is_err());
    }

    #[test]
    fn test_config_validates_hnsw_m_zero() {
        let config = CollectionConfig {
            dimensions: 128,
            distance_metric: DistanceMetric::Euclidean,
            hnsw_config: Some(HnswConfig {
                m: 0,
                ef_construction: 200,
                ef_search: 100,
                max_elements: 1000,
            }),
            quantization: None,
            on_disk_payload: false,
        };
        let err = config.validate().unwrap_err();
        assert!(err.to_string().contains("M parameter"));
    }

    #[test]
    fn test_config_validates_hnsw_ef_construction_less_than_m() {
        let config = CollectionConfig {
            dimensions: 128,
            distance_metric: DistanceMetric::Cosine,
            hnsw_config: Some(HnswConfig {
                m: 32,
                ef_construction: 16, // less than m
                ef_search: 100,
                max_elements: 1000,
            }),
            quantization: None,
            on_disk_payload: false,
        };
        let err = config.validate().unwrap_err();
        assert!(err.to_string().contains("ef_construction"));
    }

    #[test]
    fn test_config_validates_hnsw_ef_search_zero() {
        let config = CollectionConfig {
            dimensions: 128,
            distance_metric: DistanceMetric::Cosine,
            hnsw_config: Some(HnswConfig {
                m: 16,
                ef_construction: 200,
                ef_search: 0,
                max_elements: 1000,
            }),
            quantization: None,
            on_disk_payload: false,
        };
        let err = config.validate().unwrap_err();
        assert!(err.to_string().contains("ef_search"));
    }

    #[test]
    fn test_config_valid_hnsw_passes() {
        let config = CollectionConfig {
            dimensions: 64,
            distance_metric: DistanceMetric::DotProduct,
            hnsw_config: Some(HnswConfig {
                m: 16,
                ef_construction: 128,
                ef_search: 50,
                max_elements: 5000,
            }),
            quantization: None,
            on_disk_payload: true,
        };
        assert!(config.validate().is_ok());
    }

    // ===== CollectionConfig::with_dimensions tests =====

    #[test]
    fn test_with_dimensions_sets_fields() {
        let config = CollectionConfig::with_dimensions(256);
        assert_eq!(config.dimensions, 256);
        assert!(matches!(config.distance_metric, DistanceMetric::Cosine));
        assert!(config.hnsw_config.is_some());
        assert!(config.quantization.is_some());
        assert!(config.on_disk_payload);
    }

    // ===== CollectionConfig serde tests =====

    #[test]
    fn test_config_serialization_roundtrip() {
        let config = CollectionConfig::with_dimensions(384);
        let json = serde_json::to_string(&config).expect("serialize");
        let deserialized: CollectionConfig = serde_json::from_str(&json).expect("deserialize");
        assert_eq!(deserialized.dimensions, 384);
    }

    // ===== Collection creation tests =====

    #[test]
    fn test_collection_new_with_valid_config() {
        let scratch = ScratchDir::new("ruvector_test_coll_new_valid");

        let config = CollectionConfig::with_dimensions(64);
        let coll = Collection::new("test_coll".to_string(), config, scratch.db_path());
        assert!(coll.is_ok());

        let coll = coll.unwrap();
        assert_eq!(coll.name, "test_coll");
        assert_eq!(coll.config.dimensions, 64);
        assert!(coll.created_at > 0);
        assert_eq!(coll.created_at, coll.updated_at);
    }

    #[test]
    fn test_collection_new_rejects_zero_dimensions() {
        let scratch = ScratchDir::new("ruvector_test_coll_new_zero");

        let config = CollectionConfig {
            dimensions: 0,
            distance_metric: DistanceMetric::Cosine,
            hnsw_config: None,
            quantization: None,
            on_disk_payload: false,
        };
        let result = Collection::new("bad".to_string(), config, scratch.db_path());
        assert!(result.is_err());
    }

    // ===== Collection stats tests =====

    #[test]
    fn test_collection_stats_on_empty() {
        let scratch = ScratchDir::new("ruvector_test_coll_stats_empty");

        let config = CollectionConfig::with_dimensions(32);
        let coll = Collection::new("stats_test".to_string(), config, scratch.db_path()).unwrap();

        let stats = coll.stats().unwrap();
        assert_eq!(stats.vectors_count, 0);
        assert!(stats.is_empty());
    }

    // ===== Collection touch tests =====

    #[test]
    fn test_collection_touch_updates_timestamp() {
        let scratch = ScratchDir::new("ruvector_test_coll_touch");

        let config = CollectionConfig::with_dimensions(32);
        let mut coll =
            Collection::new("touch_test".to_string(), config, scratch.db_path()).unwrap();

        let before = coll.updated_at;
        // Timestamps have one-second resolution and no pause is taken here,
        // so the value may be unchanged; it must only never move backwards.
        coll.touch();
        assert!(coll.updated_at >= before);
    }

    // ===== Collection Debug impl test =====

    #[test]
    fn test_collection_debug_format() {
        let scratch = ScratchDir::new("ruvector_test_coll_debug");

        let config = CollectionConfig::with_dimensions(16);
        let coll = Collection::new("debug_test".to_string(), config, scratch.db_path()).unwrap();

        let debug_str = format!("{:?}", coll);
        assert!(debug_str.contains("debug_test"));
        assert!(debug_str.contains("<VectorDB>"));
    }

    // ===== CollectionStats tests =====

    #[test]
    fn test_collection_stats_is_empty() {
        let stats = CollectionStats {
            vectors_count: 0,
            segments_count: 1,
            disk_size_bytes: 0,
            ram_size_bytes: 0,
        };
        assert!(stats.is_empty());

        let stats = CollectionStats {
            vectors_count: 5,
            segments_count: 1,
            disk_size_bytes: 1024,
            ram_size_bytes: 512,
        };
        assert!(!stats.is_empty());
    }

    #[test]
    fn test_collection_stats_human_readable_sizes() {
        let stats = CollectionStats {
            vectors_count: 100,
            segments_count: 1,
            disk_size_bytes: 1048576, // 1 MB
            ram_size_bytes: 2048,     // 2 KB
        };
        assert_eq!(stats.disk_size_human(), "1.00 MB");
        assert_eq!(stats.ram_size_human(), "2.00 KB");
    }

    #[test]
    fn test_collection_stats_zero_bytes_human() {
        let stats = CollectionStats {
            vectors_count: 0,
            segments_count: 0,
            disk_size_bytes: 0,
            ram_size_bytes: 0,
        };
        assert_eq!(stats.disk_size_human(), "0 B");
        assert_eq!(stats.ram_size_human(), "0 B");
    }

    #[test]
    fn test_collection_stats_serde_roundtrip() {
        let stats = CollectionStats {
            vectors_count: 42,
            segments_count: 3,
            disk_size_bytes: 999,
            ram_size_bytes: 888,
        };
        let json = serde_json::to_string(&stats).unwrap();
        let deserialized: CollectionStats = serde_json::from_str(&json).unwrap();
        assert_eq!(deserialized.vectors_count, 42);
        assert_eq!(deserialized.segments_count, 3);
        assert_eq!(deserialized.disk_size_bytes, 999);
        assert_eq!(deserialized.ram_size_bytes, 888);
    }

    // ===== format_bytes tests =====

    #[test]
    fn test_format_bytes() {
        assert_eq!(format_bytes(0), "0 B");
        assert_eq!(format_bytes(512), "512.00 B");
        assert_eq!(format_bytes(1024), "1.00 KB");
        assert_eq!(format_bytes(1536), "1.50 KB");
        assert_eq!(format_bytes(1048576), "1.00 MB");
        assert_eq!(format_bytes(1073741824), "1.00 GB");
    }

    #[test]
    fn test_format_bytes_terabyte() {
        assert_eq!(format_bytes(1099511627776), "1.00 TB");
    }

    #[test]
    fn test_format_bytes_small_values() {
        assert_eq!(format_bytes(1), "1.00 B");
        assert_eq!(format_bytes(1023), "1023.00 B");
    }

    // ===== All distance metrics with valid config =====

    #[test]
    fn test_config_all_distance_metrics_validate() {
        for metric in [
            DistanceMetric::Cosine,
            DistanceMetric::Euclidean,
            DistanceMetric::DotProduct,
            DistanceMetric::Manhattan,
        ] {
            let config = CollectionConfig {
                dimensions: 128,
                distance_metric: metric,
                hnsw_config: None,
                quantization: None,
                on_disk_payload: false,
            };
            assert!(config.validate().is_ok(), "Failed for metric {:?}", metric);
        }
    }
}