1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
//! Collection creation constructors.
//!
//! Extracted from `lifecycle.rs` to reduce NLOC below the 500 threshold.
//! Contains all `create_*` public constructors and their shared helpers.
use crate::collection::types::{Collection, CollectionConfig, CURRENT_SCHEMA_VERSION};
use crate::distance::DistanceMetric;
use crate::error::Result;
use crate::quantization::StorageMode;
use crate::validation::validate_dimension;
use std::path::PathBuf;
impl Collection {
/// Creates a collection at `path` using the default storage mode.
///
/// Convenience wrapper that forwards to [`Collection::create_with_options`]
/// with `StorageMode::default()`.
///
/// # Errors
///
/// Propagates any error from [`Collection::create_with_options`], for example
/// when the directory cannot be created or the config cannot be saved.
#[allow(dead_code)] // Reason: Called in velesql tests and test_fixtures
pub fn create(path: PathBuf, dimension: usize, metric: DistanceMetric) -> Result<Self> {
    let storage_mode = StorageMode::default();
    Self::create_with_options(path, dimension, metric, storage_mode)
}
/// Returns the collection name implied by `path`.
///
/// The name is the final path component. When the path has no final
/// component (e.g. `/` or a path ending in `..`) or that component is not
/// valid UTF-8, the literal `"unknown"` is returned instead.
fn name_from_path(path: &std::path::Path) -> String {
    match path.file_name().and_then(std::ffi::OsStr::to_str) {
        Some(last_component) => last_component.to_owned(),
        None => String::from("unknown"),
    }
}
/// Common build-and-persist path used by every `create_*` constructor.
///
/// Steps, in order:
/// 1. validate `config.dimension`, unless the collection is metadata-only
///    or is a graph collection without an embedding dimension;
/// 2. create the directory at `path` (including missing parents);
/// 3. build the collection through `init_collection_parts` and `assemble`;
/// 4. persist the configuration with `save_config`.
///
/// # Errors
///
/// Propagates the first failure from dimension validation, directory
/// creation, `init_collection_parts`, or `save_config`.
pub(super) fn create_from_config(
    path: PathBuf,
    config: CollectionConfig,
    hnsw_params: Option<crate::index::hnsw::HnswParams>,
) -> Result<Self> {
    // dimension=0 is valid for metadata-only and graph-without-embedding,
    // so only the remaining collection kinds have their dimension checked.
    let must_validate = !config.metadata_only
        && (config.graph_schema.is_none() || config.embedding_dimension.is_some());
    if must_validate {
        validate_dimension(config.dimension)?;
    }

    std::fs::create_dir_all(&path)?;

    let parts = Self::init_collection_parts(path, config, hnsw_params)?;
    let collection = Self::assemble(parts);
    collection.save_config()?;
    Ok(collection)
}
/// Creates a vector collection with an explicit storage mode.
///
/// The collection name is derived from the last component of `path`. No
/// custom HNSW parameters are supplied, and `pq_rescore_oversampling` is
/// set to `Some(4)`.
///
/// # Arguments
///
/// * `path` - Path to the collection directory
/// * `dimension` - Vector dimension
/// * `metric` - Distance metric
/// * `storage_mode` - Vector storage mode (Full, SQ8, Binary)
///
/// # Errors
///
/// Propagates any error from `create_from_config`: dimension validation
/// failure, the directory cannot be created, or the config cannot be saved.
pub fn create_with_options(
    path: PathBuf,
    dimension: usize,
    metric: DistanceMetric,
    storage_mode: StorageMode,
) -> Result<Self> {
    let name = Self::name_from_path(&path);
    let config = CollectionConfig {
        // Caller-supplied settings.
        name,
        dimension,
        metric,
        storage_mode,
        // Fixed values for a fresh plain vector collection.
        schema_version: CURRENT_SCHEMA_VERSION,
        point_count: 0,
        metadata_only: false,
        graph_schema: None,
        embedding_dimension: None,
        pq_rescore_oversampling: Some(4),
        hnsw_params: None,
        #[cfg(feature = "persistence")]
        deferred_indexing: None,
        async_index_builder: None,
    };
    Self::create_from_config(path, config, None)
}
/// Creates a vector collection with caller-supplied HNSW parameters.
///
/// Gives control over the HNSW graph topology (M, `ef_construction`) while
/// keeping the standard storage pipeline.
///
/// The PQ rescore oversampling factor is pinned to the engine default
/// `Some(4)`. Callers that need a different factor should call
/// [`Collection::create_with_full_config`] directly.
///
/// # Errors
///
/// Propagates any error from [`Collection::create_with_full_config`], for
/// example when the directory cannot be created or the config cannot be saved.
#[allow(dead_code)] // Reason: kept as a convenience shortcut for lifecycle tests; the
// canonical constructor is `create_with_full_config`, which is used by all production
// call-sites since Wave 3 Commit 5. Deleting this wrapper would force every test to
// restate `Some(4)` for `pq_rescore_oversampling`, spreading the engine default across
// the test suite.
pub fn create_with_hnsw_params(
    path: PathBuf,
    dimension: usize,
    metric: DistanceMetric,
    storage_mode: StorageMode,
    hnsw_params: crate::index::hnsw::HnswParams,
) -> Result<Self> {
    // Engine default for `pq_rescore_oversampling`.
    let default_oversampling = Some(4);
    Self::create_with_full_config(
        path,
        dimension,
        metric,
        storage_mode,
        hnsw_params,
        default_oversampling,
    )
}
/// Creates a vector collection from custom HNSW parameters plus an explicit
/// PQ rescore oversampling setting.
///
/// This is the most expressive vector constructor: callers pass a fully
/// populated [`crate::index::hnsw::HnswParams`] (including `alpha`,
/// `max_elements`, and any future fields) together with their own
/// `pq_rescore_oversampling` value. [`Collection::create_with_hnsw_params`]
/// delegates here with the factor pinned to `Some(4)`.
///
/// The supplied `hnsw_params` are both recorded in the collection config
/// and forwarded to `create_from_config`.
///
/// Passing `pq_rescore_oversampling = None` keeps the on-disk config in
/// "no explicit override" mode, which allows later migrations to recompute
/// the factor from the dataset shape without having to distinguish a
/// persisted explicit value from a legacy default.
///
/// # Errors
///
/// Propagates any error from `create_from_config`: dimension validation
/// failure, the directory cannot be created, or the config cannot be saved.
pub fn create_with_full_config(
    path: PathBuf,
    dimension: usize,
    metric: DistanceMetric,
    storage_mode: StorageMode,
    hnsw_params: crate::index::hnsw::HnswParams,
    pq_rescore_oversampling: Option<u32>,
) -> Result<Self> {
    let name = Self::name_from_path(&path);
    let config = CollectionConfig {
        // Caller-supplied settings.
        name,
        dimension,
        metric,
        storage_mode,
        pq_rescore_oversampling,
        hnsw_params: Some(hnsw_params),
        // Fixed values for a fresh plain vector collection.
        schema_version: CURRENT_SCHEMA_VERSION,
        point_count: 0,
        metadata_only: false,
        graph_schema: None,
        embedding_dimension: None,
        #[cfg(feature = "persistence")]
        deferred_indexing: None,
        async_index_builder: None,
    };
    Self::create_from_config(path, config, Some(hnsw_params))
}
/// Creates a vector collection that carries an `AsyncIndexBuilder` config.
///
/// The supplied `async_builder_config` is stored as `Some(..)` in the
/// collection config's `async_index_builder` field. When that field is
/// `Some`, `upsert_bulk` uses the optimized V2 path:
/// `DirectVectorWriter` + `AsyncIndexBuilder` for higher throughput.
///
/// The collection uses `StorageMode::Full`, no custom HNSW parameters, and
/// `pq_rescore_oversampling = Some(4)`.
///
/// # Errors
///
/// Propagates any error from `create_from_config`: dimension validation
/// failure, the directory cannot be created, or the config cannot be saved.
pub fn create_with_async_builder(
    path: PathBuf,
    dimension: usize,
    metric: DistanceMetric,
    async_builder_config: crate::collection::streaming::AsyncIndexBuilderConfig,
) -> Result<Self> {
    let name = Self::name_from_path(&path);
    let config = CollectionConfig {
        // Caller-supplied settings.
        name,
        dimension,
        metric,
        async_index_builder: Some(async_builder_config),
        // Fixed values for this constructor.
        schema_version: CURRENT_SCHEMA_VERSION,
        point_count: 0,
        storage_mode: StorageMode::Full,
        metadata_only: false,
        graph_schema: None,
        embedding_dimension: None,
        pq_rescore_oversampling: Some(4),
        hnsw_params: None,
        #[cfg(feature = "persistence")]
        deferred_indexing: None,
    };
    Self::create_from_config(path, config, None)
}
/// Creates a metadata-only collection (no vectors, no HNSW index).
///
/// Metadata-only collections are optimized for storing reference data,
/// catalogs, and other non-vector data. They support CRUD operations
/// and `VelesQL` queries on payload, but NOT vector search.
///
/// The config is written with `dimension = 0` and `metadata_only = true`,
/// so `create_from_config` skips dimension validation. Unlike the vector
/// constructors, the collection name comes from `name`, not from `path`.
///
/// # Errors
///
/// Propagates any error from `create_from_config`, for example when the
/// directory cannot be created or the config cannot be saved.
pub fn create_metadata_only(path: PathBuf, name: &str) -> Result<Self> {
    let config = CollectionConfig {
        name: name.to_owned(),
        metadata_only: true,
        // No vectors are stored: dimension stays 0 and the remaining vector
        // settings are filled with fixed values.
        dimension: 0,
        metric: DistanceMetric::Cosine,
        storage_mode: StorageMode::Full,
        schema_version: CURRENT_SCHEMA_VERSION,
        point_count: 0,
        graph_schema: None,
        embedding_dimension: None,
        pq_rescore_oversampling: Some(4),
        hnsw_params: None,
        #[cfg(feature = "persistence")]
        deferred_indexing: None,
        async_index_builder: None,
    };
    Self::create_from_config(path, config, None)
}
/// Creates a graph collection, optionally with node embeddings.
///
/// `schema` and `embedding_dim` are stored in the collection config as
/// `graph_schema` and `embedding_dimension`, and are persisted in
/// `config.json`. The config's `dimension` is `embedding_dim`, or `0` when
/// no embedding dimension is given; in that case `create_from_config`
/// skips dimension validation.
///
/// # Errors
///
/// Propagates any error from `create_from_config`: dimension validation
/// failure (only when `embedding_dim` is `Some`), the directory cannot be
/// created, or the config cannot be saved.
pub fn create_graph_collection(
    path: PathBuf,
    name: &str,
    schema: crate::collection::graph::GraphSchema,
    embedding_dim: Option<usize>,
    metric: DistanceMetric,
) -> Result<Self> {
    // Graphs without embeddings are recorded with a dimension of 0.
    let dimension = embedding_dim.unwrap_or(0);
    let config = CollectionConfig {
        // Caller-supplied settings.
        name: name.to_owned(),
        dimension,
        metric,
        graph_schema: Some(schema),
        embedding_dimension: embedding_dim,
        // Fixed values for a fresh graph collection.
        schema_version: CURRENT_SCHEMA_VERSION,
        point_count: 0,
        storage_mode: StorageMode::Full,
        metadata_only: false,
        pq_rescore_oversampling: Some(4),
        hnsw_params: None,
        #[cfg(feature = "persistence")]
        deferred_indexing: None,
        async_index_builder: None,
    };
    Self::create_from_config(path, config, None)
}
}