ruvector_collections/
collection.rs

1//! Collection types and operations
2
3use ruvector_core::types::{DistanceMetric, HnswConfig, QuantizationConfig};
4use ruvector_core::vector_db::VectorDB;
5use serde::{Deserialize, Serialize};
6
7use crate::error::{CollectionError, Result};
8
/// Configuration for creating a collection.
///
/// A config is checked with [`CollectionConfig::validate`]; `Collection::new`
/// calls that check before it builds the underlying database.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CollectionConfig {
    /// Vector dimensions.
    ///
    /// [`CollectionConfig::validate`] rejects `0` and any value above `100_000`.
    pub dimensions: usize,

    /// Distance metric for similarity calculation.
    pub distance_metric: DistanceMetric,

    /// HNSW index configuration.
    ///
    /// When this is `Some`, [`CollectionConfig::validate`] checks its `m`,
    /// `ef_construction` and `ef_search` values; when it is `None` those
    /// checks are skipped.
    pub hnsw_config: Option<HnswConfig>,

    /// Quantization configuration.
    ///
    /// Not inspected by [`CollectionConfig::validate`].
    pub quantization: Option<QuantizationConfig>,

    /// Whether to store payload data on disk.
    ///
    /// NOTE(review): `Collection::new` in this file does not copy this flag
    /// into the database options — confirm where it is consumed.
    pub on_disk_payload: bool,
}
27
28impl CollectionConfig {
29    /// Validate the configuration
30    pub fn validate(&self) -> Result<()> {
31        if self.dimensions == 0 {
32            return Err(CollectionError::InvalidConfiguration {
33                message: "Dimensions must be greater than 0".to_string(),
34            });
35        }
36
37        if self.dimensions > 100_000 {
38            return Err(CollectionError::InvalidConfiguration {
39                message: "Dimensions exceeds maximum of 100,000".to_string(),
40            });
41        }
42
43        // Validate HNSW config if present
44        if let Some(ref hnsw_config) = self.hnsw_config {
45            if hnsw_config.m == 0 {
46                return Err(CollectionError::InvalidConfiguration {
47                    message: "HNSW M parameter must be greater than 0".to_string(),
48                });
49            }
50
51            if hnsw_config.ef_construction < hnsw_config.m {
52                return Err(CollectionError::InvalidConfiguration {
53                    message: "HNSW ef_construction must be >= M".to_string(),
54                });
55            }
56
57            if hnsw_config.ef_search == 0 {
58                return Err(CollectionError::InvalidConfiguration {
59                    message: "HNSW ef_search must be greater than 0".to_string(),
60                });
61            }
62        }
63
64        Ok(())
65    }
66
67    /// Create a default configuration for the given dimensions
68    pub fn with_dimensions(dimensions: usize) -> Self {
69        Self {
70            dimensions,
71            distance_metric: DistanceMetric::Cosine,
72            hnsw_config: Some(HnswConfig::default()),
73            quantization: Some(QuantizationConfig::Scalar),
74            on_disk_payload: true,
75        }
76    }
77}
78
/// A collection of vectors with its own configuration.
///
/// Instances are built by `Collection::new`, which validates the
/// configuration and constructs the underlying [`VectorDB`].
pub struct Collection {
    /// Collection name.
    pub name: String,

    /// Collection configuration.
    pub config: CollectionConfig,

    /// Underlying vector database.
    pub db: VectorDB,

    /// When the collection was created (Unix timestamp in seconds).
    pub created_at: i64,

    /// When the collection was last updated (Unix timestamp in seconds).
    ///
    /// `Collection::new` sets this to the same value as `created_at`;
    /// `Collection::touch` overwrites it with the current time.
    pub updated_at: i64,
}
96
97impl std::fmt::Debug for Collection {
98    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
99        f.debug_struct("Collection")
100            .field("name", &self.name)
101            .field("config", &self.config)
102            .field("created_at", &self.created_at)
103            .field("updated_at", &self.updated_at)
104            .field("db", &"<VectorDB>")
105            .finish()
106    }
107}
108
109impl Collection {
110    /// Create a new collection
111    pub fn new(name: String, config: CollectionConfig, storage_path: String) -> Result<Self> {
112        // Validate configuration
113        config.validate()?;
114
115        // Create VectorDB with the configuration
116        let db_options = ruvector_core::types::DbOptions {
117            dimensions: config.dimensions,
118            distance_metric: config.distance_metric,
119            storage_path,
120            hnsw_config: config.hnsw_config.clone(),
121            quantization: config.quantization.clone(),
122        };
123
124        let db = VectorDB::new(db_options)?;
125
126        let now = std::time::SystemTime::now()
127            .duration_since(std::time::UNIX_EPOCH)
128            .unwrap()
129            .as_secs() as i64;
130
131        Ok(Self {
132            name,
133            config,
134            db,
135            created_at: now,
136            updated_at: now,
137        })
138    }
139
140    /// Get collection statistics
141    pub fn stats(&self) -> Result<CollectionStats> {
142        let vectors_count = self.db.len()?;
143
144        Ok(CollectionStats {
145            vectors_count,
146            segments_count: 1,  // Single segment for now
147            disk_size_bytes: 0, // TODO: Implement disk size calculation
148            ram_size_bytes: 0,  // TODO: Implement RAM size calculation
149        })
150    }
151
152    /// Update the last modified timestamp
153    pub fn touch(&mut self) {
154        self.updated_at = std::time::SystemTime::now()
155            .duration_since(std::time::UNIX_EPOCH)
156            .unwrap()
157            .as_secs() as i64;
158    }
159}
160
/// Statistics about a collection.
///
/// Produced by `Collection::stats`. As written in this file, that method only
/// measures `vectors_count`; it fills `segments_count` with `1` and both size
/// fields with `0`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CollectionStats {
    /// Number of vectors in the collection.
    pub vectors_count: usize,

    /// Number of segments (partitions) in the collection.
    pub segments_count: usize,

    /// Total disk space used (bytes).
    pub disk_size_bytes: u64,

    /// Total RAM used (bytes).
    pub ram_size_bytes: u64,
}
176
177impl CollectionStats {
178    /// Check if the collection is empty
179    pub fn is_empty(&self) -> bool {
180        self.vectors_count == 0
181    }
182
183    /// Get human-readable disk size
184    pub fn disk_size_human(&self) -> String {
185        format_bytes(self.disk_size_bytes)
186    }
187
188    /// Get human-readable RAM size
189    pub fn ram_size_human(&self) -> String {
190        format_bytes(self.ram_size_bytes)
191    }
192}
193
/// Format a byte count as a human-readable size.
///
/// Uses binary steps of 1024 and the units `B`, `KB`, `MB`, `GB`, `TB`.
/// Zero is rendered as `"0 B"`; every other value is rendered with two
/// decimals, e.g. `"512.00 B"` or `"1.50 KB"`. Values of 1024 TB and above
/// stay in `TB`.
///
/// The unit is chosen so that the *printed* number stays below 1024: a value
/// such as 1_048_575 bytes is shown as `"1.00 MB"` rather than `"1024.00 KB"`.
fn format_bytes(bytes: u64) -> String {
    const UNITS: &[&str] = &["B", "KB", "MB", "GB", "TB"];
    const STEP: f64 = 1024.0;
    // Anything at or above this prints as "1024.00" with two decimals, so it
    // must move to the next unit instead.
    const ROLLOVER: f64 = 1023.995;

    if bytes == 0 {
        return "0 B".to_string();
    }

    let mut size = bytes as f64;
    let mut unit_idx = 0;

    // Stop at the last unit even if the value is still large.
    while size >= ROLLOVER && unit_idx < UNITS.len() - 1 {
        size /= STEP;
        unit_idx += 1;
    }

    format!("{:.2} {}", size, UNITS[unit_idx])
}
212
#[cfg(test)]
mod tests {
    use super::*;

    /// Config with no HNSW or quantization settings, so that only the
    /// `dimensions` checks of `validate` are exercised.
    fn bare_config(dimensions: usize) -> CollectionConfig {
        CollectionConfig {
            dimensions,
            distance_metric: DistanceMetric::Cosine,
            hnsw_config: None,
            quantization: None,
            on_disk_payload: true,
        }
    }

    /// Starts from `with_dimensions(384)` (asserted valid in
    /// `test_collection_config_validation`) and applies `tweak` to its HNSW settings.
    fn config_with_hnsw(tweak: impl FnOnce(&mut HnswConfig)) -> CollectionConfig {
        let mut config = CollectionConfig::with_dimensions(384);
        tweak(
            config
                .hnsw_config
                .as_mut()
                .expect("with_dimensions always sets an HNSW config"),
        );
        config
    }

    #[test]
    fn test_collection_config_validation() {
        // Valid config
        let config = CollectionConfig::with_dimensions(384);
        assert!(config.validate().is_ok());

        // Invalid: zero dimensions
        assert!(bare_config(0).validate().is_err());

        // Invalid: dimensions too large
        assert!(bare_config(200_000).validate().is_err());
    }

    #[test]
    fn test_dimension_bounds() {
        // Both ends of the accepted range are inclusive.
        assert!(bare_config(1).validate().is_ok());
        assert!(bare_config(100_000).validate().is_ok());

        // First value past the maximum.
        assert!(bare_config(100_001).validate().is_err());
    }

    #[test]
    fn test_hnsw_config_validation() {
        // Invalid: M of zero
        assert!(config_with_hnsw(|hnsw| hnsw.m = 0).validate().is_err());

        // Invalid: ef_construction below M
        assert!(config_with_hnsw(|hnsw| hnsw.ef_construction = hnsw.m - 1)
            .validate()
            .is_err());

        // Valid: ef_construction equal to M is allowed
        assert!(config_with_hnsw(|hnsw| hnsw.ef_construction = hnsw.m)
            .validate()
            .is_ok());

        // Invalid: ef_search of zero
        assert!(config_with_hnsw(|hnsw| hnsw.ef_search = 0)
            .validate()
            .is_err());
    }

    #[test]
    fn test_format_bytes() {
        assert_eq!(format_bytes(0), "0 B");
        assert_eq!(format_bytes(512), "512.00 B");
        assert_eq!(format_bytes(1023), "1023.00 B");
        assert_eq!(format_bytes(1024), "1.00 KB");
        assert_eq!(format_bytes(1536), "1.50 KB");
        assert_eq!(format_bytes(1048576), "1.00 MB");
        assert_eq!(format_bytes(1073741824), "1.00 GB");
        assert_eq!(format_bytes(1099511627776), "1.00 TB");
        // The unit list ends at TB, so larger values stay in TB.
        assert_eq!(format_bytes(u64::MAX), "16777216.00 TB");
    }
}