chroma_types/
collection_schema.rs

1use chroma_error::{ChromaError, ErrorCodes};
2use serde::{Deserialize, Serialize};
3use std::collections::HashMap;
4use thiserror::Error;
5
6use crate::collection_configuration::{
7    EmbeddingFunctionConfiguration, InternalCollectionConfiguration, VectorIndexConfiguration,
8};
9use crate::hnsw_configuration::Space;
10use crate::metadata::{MetadataComparison, MetadataValueType, Where};
11use crate::operator::QueryVector;
12use crate::{
13    default_batch_size, default_construction_ef, default_construction_ef_spann,
14    default_initial_lambda, default_m, default_m_spann, default_merge_threshold,
15    default_nreplica_count, default_num_centers_to_merge_to, default_num_samples_kmeans,
16    default_num_threads, default_reassign_neighbor_count, default_resize_factor, default_search_ef,
17    default_search_ef_spann, default_search_nprobe, default_search_rng_epsilon,
18    default_search_rng_factor, default_space, default_split_threshold, default_sync_threshold,
19    default_write_nprobe, default_write_rng_epsilon, default_write_rng_factor,
20    InternalSpannConfiguration, KnnIndex,
21};
22
23impl ChromaError for SchemaError {
24    fn code(&self) -> ErrorCodes {
25        ErrorCodes::Internal
26    }
27}
28
/// Errors describing a malformed schema or a failed schema merge.
#[derive(Debug, Error)]
pub enum SchemaError {
    /// No index configuration exists for the given metadata key and value type.
    #[error("Schema is malformed: missing index configuration for metadata key '{key}' with type '{value_type}'")]
    MissingIndexConfiguration { key: String, value_type: String },
    /// Two schemas (or two index configurations) could not be combined;
    /// `reason` carries the human-readable explanation.
    #[error("Schema reconciliation failed: {reason}")]
    InvalidSchema { reason: String },
}
36
/// Errors reported when a metadata filter cannot be applied against a schema.
#[derive(Debug, Error)]
pub enum FilterValidationError {
    /// The filter references a key/value-type pair whose index is disabled.
    #[error(
        "Cannot filter using metadata key '{key}' with type '{value_type:?}' because indexing is disabled"
    )]
    IndexingDisabled {
        key: String,
        value_type: MetadataValueType,
    },
    /// A wrapped [`SchemaError`]; its message is shown unchanged (`transparent`)
    /// and `?` converts into this variant automatically via `#[from]`.
    #[error(transparent)]
    Schema(#[from] SchemaError),
}
49
50impl ChromaError for FilterValidationError {
51    fn code(&self) -> ErrorCodes {
52        match self {
53            FilterValidationError::IndexingDisabled { .. } => ErrorCodes::InvalidArgument,
54            FilterValidationError::Schema(_) => ErrorCodes::Internal,
55        }
56    }
57}
58
// ============================================================================
// SCHEMA CONSTANTS
// ============================================================================
// These constants must match the Python constants in chromadb/api/types.py

// Value type name constants.
// These are the same strings used as the serialized field names of `ValueTypes`.
pub const STRING_VALUE_NAME: &str = "string";
pub const INT_VALUE_NAME: &str = "int";
pub const BOOL_VALUE_NAME: &str = "bool";
pub const FLOAT_VALUE_NAME: &str = "float";
pub const FLOAT_LIST_VALUE_NAME: &str = "float_list";
pub const SPARSE_VECTOR_VALUE_NAME: &str = "sparse_vector";

// Index type name constants.
// These are the same strings used as the serialized field names of the
// per-value-type structs (`StringValueType`, `IntValueType`, ...).
pub const FTS_INDEX_NAME: &str = "fts_index";
pub const VECTOR_INDEX_NAME: &str = "vector_index";
pub const SPARSE_VECTOR_INDEX_NAME: &str = "sparse_vector_index";
pub const STRING_INVERTED_INDEX_NAME: &str = "string_inverted_index";
pub const INT_INVERTED_INDEX_NAME: &str = "int_inverted_index";
pub const FLOAT_INVERTED_INDEX_NAME: &str = "float_inverted_index";
pub const BOOL_INVERTED_INDEX_NAME: &str = "bool_inverted_index";

// Special metadata keys - must match Python constants in chromadb/api/types.py
// Both keys receive dedicated overrides in `InternalSchema::new_default`.
pub const DOCUMENT_KEY: &str = "#document";
pub const EMBEDDING_KEY: &str = "#embedding";
84
85// ============================================================================
86// SCHEMA STRUCTURES
87// ============================================================================
88
/// Internal schema representation for collection index configurations.
///
/// This represents the server-side schema structure used for index management:
/// a set of per-value-type defaults plus per-key overrides.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[cfg_attr(feature = "utoipa", derive(utoipa::ToSchema))]
pub struct InternalSchema {
    /// Default index configurations for each value type
    pub defaults: ValueTypes,
    /// Key-specific index overrides, keyed by metadata key name.
    ///
    /// Serialized as `keys`; the name `key_overrides` is also accepted when
    /// deserializing.
    /// TODO(Sanket): Needed for backwards compatibility. Should remove after deploy.
    /// NOTE(review): the TODO presumably refers to the `key_overrides` alias — confirm.
    #[serde(rename = "keys", alias = "key_overrides")]
    pub keys: HashMap<String, ValueTypes>,
}
102
103pub fn is_embedding_function_default(
104    embedding_function: &Option<EmbeddingFunctionConfiguration>,
105) -> bool {
106    match embedding_function {
107        None => true,
108        Some(embedding_function) => embedding_function.is_default(),
109    }
110}
111
112/// Check if space is default (None means default, or if present, should be default space)
113pub fn is_space_default(space: &Option<Space>) -> bool {
114    match space {
115        None => true,                     // None means default
116        Some(s) => *s == default_space(), // If present, check if it's the default space
117    }
118}
119
120/// Check if HNSW config is default
121pub fn is_hnsw_config_default(hnsw_config: &HnswIndexConfig) -> bool {
122    hnsw_config.ef_construction == Some(default_construction_ef())
123        && hnsw_config.ef_search == Some(default_search_ef())
124        && hnsw_config.max_neighbors == Some(default_m())
125        && hnsw_config.num_threads == Some(default_num_threads())
126        && hnsw_config.batch_size == Some(default_batch_size())
127        && hnsw_config.sync_threshold == Some(default_sync_threshold())
128        && hnsw_config.resize_factor == Some(default_resize_factor())
129}
130
131// ============================================================================
132// NEW STRONGLY-TYPED SCHEMA STRUCTURES
133// ============================================================================
134
/// Strongly-typed value type configurations
/// Contains optional configurations for each supported value type
///
/// Every field is optional and is omitted from serialized output when `None`.
/// Each field is serialized under its plain name and additionally accepts a
/// `#`-prefixed alias when deserializing. `Default` yields all-`None`.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize, Default)]
#[cfg_attr(feature = "utoipa", derive(utoipa::ToSchema))]
pub struct ValueTypes {
    /// Index configuration for string values.
    #[serde(
        rename = "string",
        alias = "#string",
        skip_serializing_if = "Option::is_none"
    )] // STRING_VALUE_NAME
    pub string: Option<StringValueType>,

    /// Index configuration for float-list values (dense vectors).
    #[serde(
        rename = "float_list",
        alias = "#float_list",
        skip_serializing_if = "Option::is_none"
    )]
    // FLOAT_LIST_VALUE_NAME
    pub float_list: Option<FloatListValueType>,

    /// Index configuration for sparse-vector values.
    #[serde(
        rename = "sparse_vector",
        alias = "#sparse_vector",
        skip_serializing_if = "Option::is_none"
    )]
    // SPARSE_VECTOR_VALUE_NAME
    pub sparse_vector: Option<SparseVectorValueType>,

    /// Index configuration for integer values.
    #[serde(
        rename = "int",
        alias = "#int",
        skip_serializing_if = "Option::is_none"
    )] // INT_VALUE_NAME
    pub int: Option<IntValueType>,

    /// Index configuration for float values.
    #[serde(
        rename = "float",
        alias = "#float",
        skip_serializing_if = "Option::is_none"
    )] // FLOAT_VALUE_NAME
    pub float: Option<FloatValueType>,

    /// Index configuration for boolean values.
    /// Note the Rust field is `boolean` while the serialized name is `bool`.
    #[serde(
        rename = "bool",
        alias = "#bool",
        skip_serializing_if = "Option::is_none"
    )] // BOOL_VALUE_NAME
    pub boolean: Option<BoolValueType>,
}
184
/// String value type index configurations
///
/// Each index is optional, omitted from serialized output when `None`, and
/// also accepted under a `$`-prefixed alias when deserializing.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[cfg_attr(feature = "utoipa", derive(utoipa::ToSchema))]
pub struct StringValueType {
    /// Full-text search index settings.
    #[serde(
        rename = "fts_index",
        alias = "$fts_index",
        skip_serializing_if = "Option::is_none"
    )] // FTS_INDEX_NAME
    pub fts_index: Option<FtsIndexType>,

    /// String inverted index settings.
    #[serde(
        rename = "string_inverted_index", // STRING_INVERTED_INDEX_NAME
        alias = "$string_inverted_index",
        skip_serializing_if = "Option::is_none"
    )]
    pub string_inverted_index: Option<StringInvertedIndexType>,
}
203
/// Float list value type index configurations (for vectors)
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[cfg_attr(feature = "utoipa", derive(utoipa::ToSchema))]
pub struct FloatListValueType {
    /// Vector index settings; omitted from serialized output when `None` and
    /// also accepted as `$vector_index` when deserializing.
    #[serde(
        rename = "vector_index",
        alias = "$vector_index",
        skip_serializing_if = "Option::is_none"
    )] // VECTOR_INDEX_NAME
    pub vector_index: Option<VectorIndexType>,
}
215
/// Sparse vector value type index configurations
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[cfg_attr(feature = "utoipa", derive(utoipa::ToSchema))]
pub struct SparseVectorValueType {
    /// Sparse vector index settings; omitted from serialized output when
    /// `None` and also accepted as `$sparse_vector_index` when deserializing.
    #[serde(
        rename = "sparse_vector_index", // SPARSE_VECTOR_INDEX_NAME
        alias = "$sparse_vector_index",
        skip_serializing_if = "Option::is_none"
    )]
    pub sparse_vector_index: Option<SparseVectorIndexType>,
}
227
/// Integer value type index configurations
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[cfg_attr(feature = "utoipa", derive(utoipa::ToSchema))]
pub struct IntValueType {
    /// Integer inverted index settings; omitted from serialized output when
    /// `None` and also accepted as `$int_inverted_index` when deserializing.
    #[serde(
        rename = "int_inverted_index",
        alias = "$int_inverted_index",
        skip_serializing_if = "Option::is_none"
    )]
    // INT_INVERTED_INDEX_NAME
    pub int_inverted_index: Option<IntInvertedIndexType>,
}
240
/// Float value type index configurations
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[cfg_attr(feature = "utoipa", derive(utoipa::ToSchema))]
pub struct FloatValueType {
    /// Float inverted index settings; omitted from serialized output when
    /// `None` and also accepted as `$float_inverted_index` when deserializing.
    #[serde(
        rename = "float_inverted_index", // FLOAT_INVERTED_INDEX_NAME
        alias = "$float_inverted_index",
        skip_serializing_if = "Option::is_none"
    )]
    pub float_inverted_index: Option<FloatInvertedIndexType>,
}
252
/// Boolean value type index configurations
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[cfg_attr(feature = "utoipa", derive(utoipa::ToSchema))]
pub struct BoolValueType {
    /// Boolean inverted index settings; omitted from serialized output when
    /// `None` and also accepted as `$bool_inverted_index` when deserializing.
    #[serde(
        rename = "bool_inverted_index", // BOOL_INVERTED_INDEX_NAME
        alias = "$bool_inverted_index",
        skip_serializing_if = "Option::is_none"
    )]
    pub bool_inverted_index: Option<BoolInvertedIndexType>,
}
264
265// Individual index type structs with enabled status and config
/// Full-text search index entry: an on/off switch plus its configuration.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[cfg_attr(feature = "utoipa", derive(utoipa::ToSchema))]
pub struct FtsIndexType {
    /// Whether the index is enabled.
    pub enabled: bool,
    /// Index-specific configuration.
    pub config: FtsIndexConfig,
}
272
/// Vector index entry: an on/off switch plus its configuration.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[cfg_attr(feature = "utoipa", derive(utoipa::ToSchema))]
pub struct VectorIndexType {
    /// Whether the index is enabled.
    pub enabled: bool,
    /// Index-specific configuration (space, embedding function, source key,
    /// and HNSW/SPANN parameters).
    pub config: VectorIndexConfig,
}
279
/// Sparse vector index entry: an on/off switch plus its configuration.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[cfg_attr(feature = "utoipa", derive(utoipa::ToSchema))]
pub struct SparseVectorIndexType {
    /// Whether the index is enabled.
    pub enabled: bool,
    /// Index-specific configuration.
    pub config: SparseVectorIndexConfig,
}
286
/// String inverted index entry: an on/off switch plus its configuration.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[cfg_attr(feature = "utoipa", derive(utoipa::ToSchema))]
pub struct StringInvertedIndexType {
    /// Whether the index is enabled.
    pub enabled: bool,
    /// Index-specific configuration.
    pub config: StringInvertedIndexConfig,
}
293
/// Integer inverted index entry: an on/off switch plus its configuration.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[cfg_attr(feature = "utoipa", derive(utoipa::ToSchema))]
pub struct IntInvertedIndexType {
    /// Whether the index is enabled.
    pub enabled: bool,
    /// Index-specific configuration.
    pub config: IntInvertedIndexConfig,
}
300
/// Float inverted index entry: an on/off switch plus its configuration.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[cfg_attr(feature = "utoipa", derive(utoipa::ToSchema))]
pub struct FloatInvertedIndexType {
    /// Whether the index is enabled.
    pub enabled: bool,
    /// Index-specific configuration.
    pub config: FloatInvertedIndexConfig,
}
307
/// Boolean inverted index entry: an on/off switch plus its configuration.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[cfg_attr(feature = "utoipa", derive(utoipa::ToSchema))]
pub struct BoolInvertedIndexType {
    /// Whether the index is enabled.
    pub enabled: bool,
    /// Index-specific configuration.
    pub config: BoolInvertedIndexConfig,
}
314
315impl InternalSchema {
316    /// Create a new InternalSchema with strongly-typed default configurations
317    pub fn new_default(default_knn_index: KnnIndex) -> Self {
318        // Vector index disabled on all keys except #embedding.
319        let vector_config = VectorIndexType {
320            enabled: false,
321            config: VectorIndexConfig {
322                space: Some(default_space()),
323                embedding_function: Some(EmbeddingFunctionConfiguration::Legacy),
324                source_key: None,
325                hnsw: match default_knn_index {
326                    KnnIndex::Hnsw => Some(HnswIndexConfig {
327                        ef_construction: Some(default_construction_ef()),
328                        max_neighbors: Some(default_m()),
329                        ef_search: Some(default_search_ef()),
330                        num_threads: Some(default_num_threads()),
331                        batch_size: Some(default_batch_size()),
332                        sync_threshold: Some(default_sync_threshold()),
333                        resize_factor: Some(default_resize_factor()),
334                    }),
335                    KnnIndex::Spann => None,
336                },
337                spann: match default_knn_index {
338                    KnnIndex::Hnsw => None,
339                    KnnIndex::Spann => Some(SpannIndexConfig {
340                        search_nprobe: Some(default_search_nprobe()),
341                        search_rng_factor: Some(default_search_rng_factor()),
342                        search_rng_epsilon: Some(default_search_rng_epsilon()),
343                        nreplica_count: Some(default_nreplica_count()),
344                        write_rng_factor: Some(default_write_rng_factor()),
345                        write_rng_epsilon: Some(default_write_rng_epsilon()),
346                        split_threshold: Some(default_split_threshold()),
347                        num_samples_kmeans: Some(default_num_samples_kmeans()),
348                        initial_lambda: Some(default_initial_lambda()),
349                        reassign_neighbor_count: Some(default_reassign_neighbor_count()),
350                        merge_threshold: Some(default_merge_threshold()),
351                        num_centers_to_merge_to: Some(default_num_centers_to_merge_to()),
352                        write_nprobe: Some(default_write_nprobe()),
353                        ef_construction: Some(default_construction_ef_spann()),
354                        ef_search: Some(default_search_ef_spann()),
355                        max_neighbors: Some(default_m_spann()),
356                    }),
357                },
358            },
359        };
360
361        // Initialize defaults struct directly instead of using Default::default() + field assignments
362        let defaults = ValueTypes {
363            string: Some(StringValueType {
364                string_inverted_index: Some(StringInvertedIndexType {
365                    enabled: true,
366                    config: StringInvertedIndexConfig {},
367                }),
368                fts_index: Some(FtsIndexType {
369                    enabled: false,
370                    config: FtsIndexConfig {},
371                }),
372            }),
373            float: Some(FloatValueType {
374                float_inverted_index: Some(FloatInvertedIndexType {
375                    enabled: true,
376                    config: FloatInvertedIndexConfig {},
377                }),
378            }),
379            int: Some(IntValueType {
380                int_inverted_index: Some(IntInvertedIndexType {
381                    enabled: true,
382                    config: IntInvertedIndexConfig {},
383                }),
384            }),
385            boolean: Some(BoolValueType {
386                bool_inverted_index: Some(BoolInvertedIndexType {
387                    enabled: true,
388                    config: BoolInvertedIndexConfig {},
389                }),
390            }),
391            float_list: Some(FloatListValueType {
392                vector_index: Some(vector_config),
393            }),
394            sparse_vector: Some(SparseVectorValueType {
395                sparse_vector_index: Some(SparseVectorIndexType {
396                    enabled: false,
397                    config: SparseVectorIndexConfig {
398                        embedding_function: Some(EmbeddingFunctionConfiguration::Legacy),
399                        source_key: None,
400                        bm25: Some(false),
401                    },
402                }),
403            }),
404        };
405
406        // Set up key overrides
407        let mut keys = HashMap::new();
408
409        // Enable vector index for #embedding.
410        let embedding_defaults = ValueTypes {
411            float_list: Some(FloatListValueType {
412                vector_index: Some(VectorIndexType {
413                    enabled: true,
414                    config: VectorIndexConfig {
415                        space: Some(default_space()),
416                        embedding_function: Some(EmbeddingFunctionConfiguration::Legacy),
417                        source_key: Some(DOCUMENT_KEY.to_string()),
418                        hnsw: match default_knn_index {
419                            KnnIndex::Hnsw => Some(HnswIndexConfig {
420                                ef_construction: Some(default_construction_ef()),
421                                max_neighbors: Some(default_m()),
422                                ef_search: Some(default_search_ef()),
423                                num_threads: Some(default_num_threads()),
424                                batch_size: Some(default_batch_size()),
425                                sync_threshold: Some(default_sync_threshold()),
426                                resize_factor: Some(default_resize_factor()),
427                            }),
428                            KnnIndex::Spann => None,
429                        },
430                        spann: match default_knn_index {
431                            KnnIndex::Hnsw => None,
432                            KnnIndex::Spann => Some(SpannIndexConfig {
433                                search_nprobe: Some(default_search_nprobe()),
434                                search_rng_factor: Some(default_search_rng_factor()),
435                                search_rng_epsilon: Some(default_search_rng_epsilon()),
436                                nreplica_count: Some(default_nreplica_count()),
437                                write_rng_factor: Some(default_write_rng_factor()),
438                                write_rng_epsilon: Some(default_write_rng_epsilon()),
439                                split_threshold: Some(default_split_threshold()),
440                                num_samples_kmeans: Some(default_num_samples_kmeans()),
441                                initial_lambda: Some(default_initial_lambda()),
442                                reassign_neighbor_count: Some(default_reassign_neighbor_count()),
443                                merge_threshold: Some(default_merge_threshold()),
444                                num_centers_to_merge_to: Some(default_num_centers_to_merge_to()),
445                                write_nprobe: Some(default_write_nprobe()),
446                                ef_construction: Some(default_construction_ef_spann()),
447                                ef_search: Some(default_search_ef_spann()),
448                                max_neighbors: Some(default_m_spann()),
449                            }),
450                        },
451                    },
452                }),
453            }),
454            ..Default::default()
455        };
456        keys.insert(EMBEDDING_KEY.to_string(), embedding_defaults);
457
458        // Document defaults - initialize directly instead of Default::default() + field assignment
459        let document_defaults = ValueTypes {
460            string: Some(StringValueType {
461                fts_index: Some(FtsIndexType {
462                    enabled: true,
463                    config: FtsIndexConfig {},
464                }),
465                string_inverted_index: Some(StringInvertedIndexType {
466                    enabled: false,
467                    config: StringInvertedIndexConfig {},
468                }),
469            }),
470            ..Default::default()
471        };
472        keys.insert(DOCUMENT_KEY.to_string(), document_defaults);
473
474        InternalSchema { defaults, keys }
475    }
476
477    pub fn get_internal_spann_config(&self) -> Option<InternalSpannConfiguration> {
478        let to_internal = |vector_index: &VectorIndexType| {
479            let space = vector_index.config.space.clone();
480            vector_index
481                .config
482                .spann
483                .clone()
484                .map(|config| config.into_internal_configuration(space))
485        };
486
487        self.keys
488            .get(EMBEDDING_KEY)
489            .and_then(|value_types| value_types.float_list.as_ref())
490            .and_then(|float_list| float_list.vector_index.as_ref())
491            .and_then(to_internal)
492            .or_else(|| {
493                self.defaults
494                    .float_list
495                    .as_ref()
496                    .and_then(|float_list| float_list.vector_index.as_ref())
497                    .and_then(to_internal)
498            })
499    }
500
501    /// Reconcile user-provided schema with system defaults
502    ///
503    /// This method merges user configurations with system defaults, ensuring that:
504    /// - User overrides take precedence over defaults
505    /// - Missing user configurations fall back to system defaults
506    /// - Field-level merging for complex configurations (Vector, HNSW, SPANN, etc.)
507    pub fn reconcile_with_defaults(user_schema: Option<InternalSchema>) -> Result<Self, String> {
508        let default_schema = InternalSchema::new_default(KnnIndex::Spann);
509
510        match user_schema {
511            Some(user) => {
512                // Merge defaults with user overrides
513                let merged_defaults =
514                    Self::merge_value_types(&default_schema.defaults, &user.defaults)?;
515
516                // Merge key overrides
517                let mut merged_keys = default_schema.keys.clone();
518                for (key, user_value_types) in user.keys {
519                    if let Some(default_value_types) = merged_keys.get(&key) {
520                        // Merge with existing default key override
521                        let merged_value_types =
522                            Self::merge_value_types(default_value_types, &user_value_types)?;
523                        merged_keys.insert(key, merged_value_types);
524                    } else {
525                        // New key override from user
526                        merged_keys.insert(key, user_value_types);
527                    }
528                }
529
530                Ok(InternalSchema {
531                    defaults: merged_defaults,
532                    keys: merged_keys,
533                })
534            }
535            None => Ok(default_schema),
536        }
537    }
538
539    /// Merge two schemas together, combining key overrides when possible.
540    pub fn merge(&self, other: &InternalSchema) -> Result<InternalSchema, SchemaError> {
541        if self.defaults != other.defaults {
542            return Err(SchemaError::InvalidSchema {
543                reason: "Cannot merge schemas with differing defaults".to_string(),
544            });
545        }
546
547        let mut keys = self.keys.clone();
548
549        for (key, other_value_types) in &other.keys {
550            if let Some(existing) = keys.get(key).cloned() {
551                let merged = Self::merge_override_value_types(key, &existing, other_value_types)?;
552                keys.insert(key.clone(), merged);
553            } else {
554                keys.insert(key.clone(), other_value_types.clone());
555            }
556        }
557
558        Ok(InternalSchema {
559            defaults: self.defaults.clone(),
560            keys,
561        })
562    }
563
564    fn merge_override_value_types(
565        key: &str,
566        left: &ValueTypes,
567        right: &ValueTypes,
568    ) -> Result<ValueTypes, SchemaError> {
569        Ok(ValueTypes {
570            string: Self::merge_string_override(key, left.string.as_ref(), right.string.as_ref())?,
571            float: Self::merge_float_override(key, left.float.as_ref(), right.float.as_ref())?,
572            int: Self::merge_int_override(key, left.int.as_ref(), right.int.as_ref())?,
573            boolean: Self::merge_bool_override(key, left.boolean.as_ref(), right.boolean.as_ref())?,
574            float_list: Self::merge_float_list_override(
575                key,
576                left.float_list.as_ref(),
577                right.float_list.as_ref(),
578            )?,
579            sparse_vector: Self::merge_sparse_vector_override(
580                key,
581                left.sparse_vector.as_ref(),
582                right.sparse_vector.as_ref(),
583            )?,
584        })
585    }
586
587    fn merge_string_override(
588        key: &str,
589        left: Option<&StringValueType>,
590        right: Option<&StringValueType>,
591    ) -> Result<Option<StringValueType>, SchemaError> {
592        match (left, right) {
593            (Some(l), Some(r)) => Ok(Some(StringValueType {
594                string_inverted_index: Self::merge_index_or_error(
595                    l.string_inverted_index.as_ref(),
596                    r.string_inverted_index.as_ref(),
597                    &format!("key '{key}' string.string_inverted_index"),
598                )?,
599                fts_index: Self::merge_index_or_error(
600                    l.fts_index.as_ref(),
601                    r.fts_index.as_ref(),
602                    &format!("key '{key}' string.fts_index"),
603                )?,
604            })),
605            (Some(l), None) => Ok(Some(l.clone())),
606            (None, Some(r)) => Ok(Some(r.clone())),
607            (None, None) => Ok(None),
608        }
609    }
610
611    fn merge_float_override(
612        key: &str,
613        left: Option<&FloatValueType>,
614        right: Option<&FloatValueType>,
615    ) -> Result<Option<FloatValueType>, SchemaError> {
616        match (left, right) {
617            (Some(l), Some(r)) => Ok(Some(FloatValueType {
618                float_inverted_index: Self::merge_index_or_error(
619                    l.float_inverted_index.as_ref(),
620                    r.float_inverted_index.as_ref(),
621                    &format!("key '{key}' float.float_inverted_index"),
622                )?,
623            })),
624            (Some(l), None) => Ok(Some(l.clone())),
625            (None, Some(r)) => Ok(Some(r.clone())),
626            (None, None) => Ok(None),
627        }
628    }
629
630    fn merge_int_override(
631        key: &str,
632        left: Option<&IntValueType>,
633        right: Option<&IntValueType>,
634    ) -> Result<Option<IntValueType>, SchemaError> {
635        match (left, right) {
636            (Some(l), Some(r)) => Ok(Some(IntValueType {
637                int_inverted_index: Self::merge_index_or_error(
638                    l.int_inverted_index.as_ref(),
639                    r.int_inverted_index.as_ref(),
640                    &format!("key '{key}' int.int_inverted_index"),
641                )?,
642            })),
643            (Some(l), None) => Ok(Some(l.clone())),
644            (None, Some(r)) => Ok(Some(r.clone())),
645            (None, None) => Ok(None),
646        }
647    }
648
649    fn merge_bool_override(
650        key: &str,
651        left: Option<&BoolValueType>,
652        right: Option<&BoolValueType>,
653    ) -> Result<Option<BoolValueType>, SchemaError> {
654        match (left, right) {
655            (Some(l), Some(r)) => Ok(Some(BoolValueType {
656                bool_inverted_index: Self::merge_index_or_error(
657                    l.bool_inverted_index.as_ref(),
658                    r.bool_inverted_index.as_ref(),
659                    &format!("key '{key}' bool.bool_inverted_index"),
660                )?,
661            })),
662            (Some(l), None) => Ok(Some(l.clone())),
663            (None, Some(r)) => Ok(Some(r.clone())),
664            (None, None) => Ok(None),
665        }
666    }
667
668    fn merge_float_list_override(
669        key: &str,
670        left: Option<&FloatListValueType>,
671        right: Option<&FloatListValueType>,
672    ) -> Result<Option<FloatListValueType>, SchemaError> {
673        match (left, right) {
674            (Some(l), Some(r)) => Ok(Some(FloatListValueType {
675                vector_index: Self::merge_index_or_error(
676                    l.vector_index.as_ref(),
677                    r.vector_index.as_ref(),
678                    &format!("key '{key}' float_list.vector_index"),
679                )?,
680            })),
681            (Some(l), None) => Ok(Some(l.clone())),
682            (None, Some(r)) => Ok(Some(r.clone())),
683            (None, None) => Ok(None),
684        }
685    }
686
687    fn merge_sparse_vector_override(
688        key: &str,
689        left: Option<&SparseVectorValueType>,
690        right: Option<&SparseVectorValueType>,
691    ) -> Result<Option<SparseVectorValueType>, SchemaError> {
692        match (left, right) {
693            (Some(l), Some(r)) => Ok(Some(SparseVectorValueType {
694                sparse_vector_index: Self::merge_index_or_error(
695                    l.sparse_vector_index.as_ref(),
696                    r.sparse_vector_index.as_ref(),
697                    &format!("key '{key}' sparse_vector.sparse_vector_index"),
698                )?,
699            })),
700            (Some(l), None) => Ok(Some(l.clone())),
701            (None, Some(r)) => Ok(Some(r.clone())),
702            (None, None) => Ok(None),
703        }
704    }
705
706    fn merge_index_or_error<T: Clone + PartialEq>(
707        left: Option<&T>,
708        right: Option<&T>,
709        context: &str,
710    ) -> Result<Option<T>, SchemaError> {
711        match (left, right) {
712            (Some(l), Some(r)) => {
713                if l == r {
714                    Ok(Some(l.clone()))
715                } else {
716                    Err(SchemaError::InvalidSchema {
717                        reason: format!("Conflicting configuration for {context}"),
718                    })
719                }
720            }
721            (Some(l), None) => Ok(Some(l.clone())),
722            (None, Some(r)) => Ok(Some(r.clone())),
723            (None, None) => Ok(None),
724        }
725    }
726
727    /// Merge two ValueTypes with field-level merging
728    /// User values take precedence over default values
729    fn merge_value_types(default: &ValueTypes, user: &ValueTypes) -> Result<ValueTypes, String> {
730        Ok(ValueTypes {
731            string: Self::merge_string_type(default.string.as_ref(), user.string.as_ref())?,
732            float: Self::merge_float_type(default.float.as_ref(), user.float.as_ref())?,
733            int: Self::merge_int_type(default.int.as_ref(), user.int.as_ref())?,
734            boolean: Self::merge_bool_type(default.boolean.as_ref(), user.boolean.as_ref())?,
735            float_list: Self::merge_float_list_type(
736                default.float_list.as_ref(),
737                user.float_list.as_ref(),
738            )?,
739            sparse_vector: Self::merge_sparse_vector_type(
740                default.sparse_vector.as_ref(),
741                user.sparse_vector.as_ref(),
742            )?,
743        })
744    }
745
746    /// Merge StringValueType configurations
747    fn merge_string_type(
748        default: Option<&StringValueType>,
749        user: Option<&StringValueType>,
750    ) -> Result<Option<StringValueType>, String> {
751        match (default, user) {
752            (Some(default), Some(user)) => Ok(Some(StringValueType {
753                string_inverted_index: Self::merge_string_inverted_index_type(
754                    default.string_inverted_index.as_ref(),
755                    user.string_inverted_index.as_ref(),
756                )?,
757                fts_index: Self::merge_fts_index_type(
758                    default.fts_index.as_ref(),
759                    user.fts_index.as_ref(),
760                )?,
761            })),
762            (Some(default), None) => Ok(Some(default.clone())),
763            (None, Some(user)) => Ok(Some(user.clone())),
764            (None, None) => Ok(None),
765        }
766    }
767
768    /// Merge FloatValueType configurations
769    fn merge_float_type(
770        default: Option<&FloatValueType>,
771        user: Option<&FloatValueType>,
772    ) -> Result<Option<FloatValueType>, String> {
773        match (default, user) {
774            (Some(default), Some(user)) => Ok(Some(FloatValueType {
775                float_inverted_index: Self::merge_float_inverted_index_type(
776                    default.float_inverted_index.as_ref(),
777                    user.float_inverted_index.as_ref(),
778                )?,
779            })),
780            (Some(default), None) => Ok(Some(default.clone())),
781            (None, Some(user)) => Ok(Some(user.clone())),
782            (None, None) => Ok(None),
783        }
784    }
785
786    /// Merge IntValueType configurations
787    fn merge_int_type(
788        default: Option<&IntValueType>,
789        user: Option<&IntValueType>,
790    ) -> Result<Option<IntValueType>, String> {
791        match (default, user) {
792            (Some(default), Some(user)) => Ok(Some(IntValueType {
793                int_inverted_index: Self::merge_int_inverted_index_type(
794                    default.int_inverted_index.as_ref(),
795                    user.int_inverted_index.as_ref(),
796                )?,
797            })),
798            (Some(default), None) => Ok(Some(default.clone())),
799            (None, Some(user)) => Ok(Some(user.clone())),
800            (None, None) => Ok(None),
801        }
802    }
803
804    /// Merge BoolValueType configurations
805    fn merge_bool_type(
806        default: Option<&BoolValueType>,
807        user: Option<&BoolValueType>,
808    ) -> Result<Option<BoolValueType>, String> {
809        match (default, user) {
810            (Some(default), Some(user)) => Ok(Some(BoolValueType {
811                bool_inverted_index: Self::merge_bool_inverted_index_type(
812                    default.bool_inverted_index.as_ref(),
813                    user.bool_inverted_index.as_ref(),
814                )?,
815            })),
816            (Some(default), None) => Ok(Some(default.clone())),
817            (None, Some(user)) => Ok(Some(user.clone())),
818            (None, None) => Ok(None),
819        }
820    }
821
822    /// Merge FloatListValueType configurations
823    fn merge_float_list_type(
824        default: Option<&FloatListValueType>,
825        user: Option<&FloatListValueType>,
826    ) -> Result<Option<FloatListValueType>, String> {
827        match (default, user) {
828            (Some(default), Some(user)) => Ok(Some(FloatListValueType {
829                vector_index: Self::merge_vector_index_type(
830                    default.vector_index.as_ref(),
831                    user.vector_index.as_ref(),
832                )?,
833            })),
834            (Some(default), None) => Ok(Some(default.clone())),
835            (None, Some(user)) => Ok(Some(user.clone())),
836            (None, None) => Ok(None),
837        }
838    }
839
840    /// Merge SparseVectorValueType configurations
841    fn merge_sparse_vector_type(
842        default: Option<&SparseVectorValueType>,
843        user: Option<&SparseVectorValueType>,
844    ) -> Result<Option<SparseVectorValueType>, String> {
845        match (default, user) {
846            (Some(default), Some(user)) => Ok(Some(SparseVectorValueType {
847                sparse_vector_index: Self::merge_sparse_vector_index_type(
848                    default.sparse_vector_index.as_ref(),
849                    user.sparse_vector_index.as_ref(),
850                )?,
851            })),
852            (Some(default), None) => Ok(Some(default.clone())),
853            (None, Some(user)) => Ok(Some(user.clone())),
854            (None, None) => Ok(None),
855        }
856    }
857
858    /// Merge individual index type configurations
859    fn merge_string_inverted_index_type(
860        default: Option<&StringInvertedIndexType>,
861        user: Option<&StringInvertedIndexType>,
862    ) -> Result<Option<StringInvertedIndexType>, String> {
863        match (default, user) {
864            (Some(_default), Some(user)) => {
865                Ok(Some(StringInvertedIndexType {
866                    enabled: user.enabled,       // User enabled state takes precedence
867                    config: user.config.clone(), // User config takes precedence
868                }))
869            }
870            (Some(default), None) => Ok(Some(default.clone())),
871            (None, Some(user)) => Ok(Some(user.clone())),
872            (None, None) => Ok(None),
873        }
874    }
875
876    fn merge_fts_index_type(
877        default: Option<&FtsIndexType>,
878        user: Option<&FtsIndexType>,
879    ) -> Result<Option<FtsIndexType>, String> {
880        match (default, user) {
881            (Some(_default), Some(user)) => Ok(Some(FtsIndexType {
882                enabled: user.enabled,
883                config: user.config.clone(),
884            })),
885            (Some(default), None) => Ok(Some(default.clone())),
886            (None, Some(user)) => Ok(Some(user.clone())),
887            (None, None) => Ok(None),
888        }
889    }
890
891    fn merge_float_inverted_index_type(
892        default: Option<&FloatInvertedIndexType>,
893        user: Option<&FloatInvertedIndexType>,
894    ) -> Result<Option<FloatInvertedIndexType>, String> {
895        match (default, user) {
896            (Some(_default), Some(user)) => Ok(Some(FloatInvertedIndexType {
897                enabled: user.enabled,
898                config: user.config.clone(),
899            })),
900            (Some(default), None) => Ok(Some(default.clone())),
901            (None, Some(user)) => Ok(Some(user.clone())),
902            (None, None) => Ok(None),
903        }
904    }
905
906    fn merge_int_inverted_index_type(
907        default: Option<&IntInvertedIndexType>,
908        user: Option<&IntInvertedIndexType>,
909    ) -> Result<Option<IntInvertedIndexType>, String> {
910        match (default, user) {
911            (Some(_default), Some(user)) => Ok(Some(IntInvertedIndexType {
912                enabled: user.enabled,
913                config: user.config.clone(),
914            })),
915            (Some(default), None) => Ok(Some(default.clone())),
916            (None, Some(user)) => Ok(Some(user.clone())),
917            (None, None) => Ok(None),
918        }
919    }
920
921    fn merge_bool_inverted_index_type(
922        default: Option<&BoolInvertedIndexType>,
923        user: Option<&BoolInvertedIndexType>,
924    ) -> Result<Option<BoolInvertedIndexType>, String> {
925        match (default, user) {
926            (Some(_default), Some(user)) => Ok(Some(BoolInvertedIndexType {
927                enabled: user.enabled,
928                config: user.config.clone(),
929            })),
930            (Some(default), None) => Ok(Some(default.clone())),
931            (None, Some(user)) => Ok(Some(user.clone())),
932            (None, None) => Ok(None),
933        }
934    }
935
936    fn merge_vector_index_type(
937        default: Option<&VectorIndexType>,
938        user: Option<&VectorIndexType>,
939    ) -> Result<Option<VectorIndexType>, String> {
940        match (default, user) {
941            (Some(default), Some(user)) => {
942                Ok(Some(VectorIndexType {
943                    enabled: user.enabled, // User enabled state takes precedence
944                    config: Self::merge_vector_index_config(&default.config, &user.config)?,
945                }))
946            }
947            (Some(default), None) => Ok(Some(default.clone())),
948            (None, Some(user)) => Ok(Some(user.clone())),
949            (None, None) => Ok(None),
950        }
951    }
952
953    fn merge_sparse_vector_index_type(
954        default: Option<&SparseVectorIndexType>,
955        user: Option<&SparseVectorIndexType>,
956    ) -> Result<Option<SparseVectorIndexType>, String> {
957        match (default, user) {
958            (Some(default), Some(user)) => Ok(Some(SparseVectorIndexType {
959                enabled: user.enabled,
960                config: Self::merge_sparse_vector_index_config(&default.config, &user.config)?,
961            })),
962            (Some(default), None) => Ok(Some(default.clone())),
963            (None, Some(user)) => Ok(Some(user.clone())),
964            (None, None) => Ok(None),
965        }
966    }
967
968    /// Merge VectorIndexConfig with field-level merging
969    fn merge_vector_index_config(
970        default: &VectorIndexConfig,
971        user: &VectorIndexConfig,
972    ) -> Result<VectorIndexConfig, String> {
973        Ok(VectorIndexConfig {
974            space: user.space.clone().or(default.space.clone()),
975            embedding_function: user
976                .embedding_function
977                .clone()
978                .or(default.embedding_function.clone()),
979            source_key: user.source_key.clone().or(default.source_key.clone()),
980            hnsw: Self::merge_hnsw_configs(default.hnsw.as_ref(), user.hnsw.as_ref()),
981            spann: Self::merge_spann_configs(default.spann.as_ref(), user.spann.as_ref()),
982        })
983    }
984
985    /// Merge SparseVectorIndexConfig with field-level merging
986    fn merge_sparse_vector_index_config(
987        default: &SparseVectorIndexConfig,
988        user: &SparseVectorIndexConfig,
989    ) -> Result<SparseVectorIndexConfig, String> {
990        Ok(SparseVectorIndexConfig {
991            embedding_function: user
992                .embedding_function
993                .clone()
994                .or(default.embedding_function.clone()),
995            source_key: user.source_key.clone().or(default.source_key.clone()),
996            bm25: user.bm25.or(default.bm25),
997        })
998    }
999
1000    /// Merge HNSW configurations with field-level merging
1001    fn merge_hnsw_configs(
1002        default_hnsw: Option<&HnswIndexConfig>,
1003        user_hnsw: Option<&HnswIndexConfig>,
1004    ) -> Option<HnswIndexConfig> {
1005        match (default_hnsw, user_hnsw) {
1006            (Some(default), Some(user)) => Some(HnswIndexConfig {
1007                ef_construction: user.ef_construction.or(default.ef_construction),
1008                max_neighbors: user.max_neighbors.or(default.max_neighbors),
1009                ef_search: user.ef_search.or(default.ef_search),
1010                num_threads: user.num_threads.or(default.num_threads),
1011                batch_size: user.batch_size.or(default.batch_size),
1012                sync_threshold: user.sync_threshold.or(default.sync_threshold),
1013                resize_factor: user.resize_factor.or(default.resize_factor),
1014            }),
1015            (Some(default), None) => Some(default.clone()),
1016            (None, Some(user)) => Some(user.clone()),
1017            (None, None) => None,
1018        }
1019    }
1020
1021    /// Merge SPANN configurations with field-level merging
1022    fn merge_spann_configs(
1023        default_spann: Option<&SpannIndexConfig>,
1024        user_spann: Option<&SpannIndexConfig>,
1025    ) -> Option<SpannIndexConfig> {
1026        match (default_spann, user_spann) {
1027            (Some(default), Some(user)) => Some(SpannIndexConfig {
1028                search_nprobe: user.search_nprobe.or(default.search_nprobe),
1029                search_rng_factor: user.search_rng_factor.or(default.search_rng_factor),
1030                search_rng_epsilon: user.search_rng_epsilon.or(default.search_rng_epsilon),
1031                nreplica_count: user.nreplica_count.or(default.nreplica_count),
1032                write_rng_factor: user.write_rng_factor.or(default.write_rng_factor),
1033                write_rng_epsilon: user.write_rng_epsilon.or(default.write_rng_epsilon),
1034                split_threshold: user.split_threshold.or(default.split_threshold),
1035                num_samples_kmeans: user.num_samples_kmeans.or(default.num_samples_kmeans),
1036                initial_lambda: user.initial_lambda.or(default.initial_lambda),
1037                reassign_neighbor_count: user
1038                    .reassign_neighbor_count
1039                    .or(default.reassign_neighbor_count),
1040                merge_threshold: user.merge_threshold.or(default.merge_threshold),
1041                num_centers_to_merge_to: user
1042                    .num_centers_to_merge_to
1043                    .or(default.num_centers_to_merge_to),
1044                write_nprobe: user.write_nprobe.or(default.write_nprobe),
1045                ef_construction: user.ef_construction.or(default.ef_construction),
1046                ef_search: user.ef_search.or(default.ef_search),
1047                max_neighbors: user.max_neighbors.or(default.max_neighbors),
1048            }),
1049            (Some(default), None) => Some(default.clone()),
1050            (None, Some(user)) => Some(user.clone()),
1051            (None, None) => None,
1052        }
1053    }
1054
1055    /// Reconcile InternalSchema with InternalCollectionConfiguration
1056    ///
1057    /// Simple reconciliation logic:
1058    /// 1. If collection config is default → return schema (schema is source of truth)
1059    /// 2. If collection config is non-default and schema is non-default → error (both set)
1060    /// 3. If collection config is non-default and schema is default → override schema with collection config
1061    pub fn reconcile_with_collection_config(
1062        schema: InternalSchema,
1063        collection_config: InternalCollectionConfiguration,
1064    ) -> Result<InternalSchema, String> {
1065        // 1. Check if collection config is default
1066        if collection_config.is_default() {
1067            // Collection config is default → schema is source of truth
1068            return Ok(schema);
1069        }
1070
1071        // 2. Collection config is non-default, check if schema is also non-default
1072        if !Self::is_schema_default(&schema) {
1073            // Both are non-default → error
1074            return Err(
1075                "Cannot set both collection config and schema at the same time".to_string(),
1076            );
1077        }
1078
1079        // 3. Collection config is non-default, schema is default → override schema with collection config
1080        Self::convert_collection_config_to_schema(collection_config)
1081    }
1082
1083    pub fn reconcile_schema_and_config(
1084        schema: Option<InternalSchema>,
1085        configuration: Option<InternalCollectionConfiguration>,
1086    ) -> Result<InternalSchema, String> {
1087        let reconciled_schema = Self::reconcile_with_defaults(schema)?;
1088        if let Some(config) = configuration {
1089            Self::reconcile_with_collection_config(reconciled_schema, config)
1090        } else {
1091            Ok(reconciled_schema)
1092        }
1093    }
1094
1095    /// Check if schema is default by comparing it word-by-word with new_default
1096    fn is_schema_default(schema: &InternalSchema) -> bool {
1097        // Compare with both possible default schemas (HNSW and SPANN)
1098        let default_hnsw = InternalSchema::new_default(KnnIndex::Hnsw);
1099        let default_spann = InternalSchema::new_default(KnnIndex::Spann);
1100
1101        schema == &default_hnsw || schema == &default_spann
1102    }
1103
1104    /// Convert InternalCollectionConfiguration to InternalSchema
1105    fn convert_collection_config_to_schema(
1106        collection_config: InternalCollectionConfiguration,
1107    ) -> Result<InternalSchema, String> {
1108        // Start with a default schema structure
1109        let mut schema = InternalSchema::new_default(KnnIndex::Spann); // Default to HNSW, will be overridden
1110
1111        // Convert vector index configuration
1112        let vector_config = match collection_config.vector_index {
1113            VectorIndexConfiguration::Hnsw(hnsw_config) => VectorIndexConfig {
1114                space: Some(hnsw_config.space),
1115                embedding_function: collection_config.embedding_function,
1116                source_key: Some(DOCUMENT_KEY.to_string()), // Default source key
1117                hnsw: Some(HnswIndexConfig {
1118                    ef_construction: Some(hnsw_config.ef_construction),
1119                    max_neighbors: Some(hnsw_config.max_neighbors),
1120                    ef_search: Some(hnsw_config.ef_search),
1121                    num_threads: Some(hnsw_config.num_threads),
1122                    batch_size: Some(hnsw_config.batch_size),
1123                    sync_threshold: Some(hnsw_config.sync_threshold),
1124                    resize_factor: Some(hnsw_config.resize_factor),
1125                }),
1126                spann: None,
1127            },
1128            VectorIndexConfiguration::Spann(spann_config) => VectorIndexConfig {
1129                space: Some(spann_config.space),
1130                embedding_function: collection_config.embedding_function,
1131                source_key: Some(DOCUMENT_KEY.to_string()), // Default source key
1132                hnsw: None,
1133                spann: Some(SpannIndexConfig {
1134                    search_nprobe: Some(spann_config.search_nprobe),
1135                    search_rng_factor: Some(spann_config.search_rng_factor),
1136                    search_rng_epsilon: Some(spann_config.search_rng_epsilon),
1137                    nreplica_count: Some(spann_config.nreplica_count),
1138                    write_rng_factor: Some(spann_config.write_rng_factor),
1139                    write_rng_epsilon: Some(spann_config.write_rng_epsilon),
1140                    split_threshold: Some(spann_config.split_threshold),
1141                    num_samples_kmeans: Some(spann_config.num_samples_kmeans),
1142                    initial_lambda: Some(spann_config.initial_lambda),
1143                    reassign_neighbor_count: Some(spann_config.reassign_neighbor_count),
1144                    merge_threshold: Some(spann_config.merge_threshold),
1145                    num_centers_to_merge_to: Some(spann_config.num_centers_to_merge_to),
1146                    write_nprobe: Some(spann_config.write_nprobe),
1147                    ef_construction: Some(spann_config.ef_construction),
1148                    ef_search: Some(spann_config.ef_search),
1149                    max_neighbors: Some(spann_config.max_neighbors),
1150                }),
1151            },
1152        };
1153
1154        // Update defaults (keep enabled=false, just update the config)
1155        // This serves as the template for any new float_list fields
1156        if let Some(float_list) = &mut schema.defaults.float_list {
1157            if let Some(vector_index) = &mut float_list.vector_index {
1158                vector_index.config = vector_config.clone();
1159            }
1160        }
1161
1162        // Update the vector_index in the existing #embedding key override
1163        // Keep enabled=true (already set by new_default) and update the config
1164        if let Some(embedding_types) = schema.keys.get_mut(EMBEDDING_KEY) {
1165            if let Some(float_list) = &mut embedding_types.float_list {
1166                if let Some(vector_index) = &mut float_list.vector_index {
1167                    vector_index.config = vector_config;
1168                }
1169            }
1170        }
1171
1172        Ok(schema)
1173    }
1174
1175    /// Check if a specific metadata key-value should be indexed based on schema configuration
1176    pub fn is_metadata_type_index_enabled(
1177        &self,
1178        key: &str,
1179        value_type: MetadataValueType,
1180    ) -> Result<bool, SchemaError> {
1181        let v_type = self.keys.get(key).unwrap_or(&self.defaults);
1182
1183        match value_type {
1184            MetadataValueType::Bool => match &v_type.boolean {
1185                Some(bool_type) => match &bool_type.bool_inverted_index {
1186                    Some(bool_inverted_index) => Ok(bool_inverted_index.enabled),
1187                    None => Err(SchemaError::MissingIndexConfiguration {
1188                        key: key.to_string(),
1189                        value_type: "bool".to_string(),
1190                    }),
1191                },
1192                None => match &self.defaults.boolean {
1193                    Some(bool_type) => match &bool_type.bool_inverted_index {
1194                        Some(bool_inverted_index) => Ok(bool_inverted_index.enabled),
1195                        None => Err(SchemaError::MissingIndexConfiguration {
1196                            key: key.to_string(),
1197                            value_type: "bool".to_string(),
1198                        }),
1199                    },
1200                    None => Err(SchemaError::MissingIndexConfiguration {
1201                        key: key.to_string(),
1202                        value_type: "bool".to_string(),
1203                    }),
1204                },
1205            },
1206            MetadataValueType::Int => match &v_type.int {
1207                Some(int_type) => match &int_type.int_inverted_index {
1208                    Some(int_inverted_index) => Ok(int_inverted_index.enabled),
1209                    None => Err(SchemaError::MissingIndexConfiguration {
1210                        key: key.to_string(),
1211                        value_type: "int".to_string(),
1212                    }),
1213                },
1214                None => match &self.defaults.int {
1215                    Some(int_type) => match &int_type.int_inverted_index {
1216                        Some(int_inverted_index) => Ok(int_inverted_index.enabled),
1217                        None => Err(SchemaError::MissingIndexConfiguration {
1218                            key: key.to_string(),
1219                            value_type: "int".to_string(),
1220                        }),
1221                    },
1222                    None => Err(SchemaError::MissingIndexConfiguration {
1223                        key: key.to_string(),
1224                        value_type: "int".to_string(),
1225                    }),
1226                },
1227            },
1228            MetadataValueType::Float => match &v_type.float {
1229                Some(float_type) => match &float_type.float_inverted_index {
1230                    Some(float_inverted_index) => Ok(float_inverted_index.enabled),
1231                    None => Err(SchemaError::MissingIndexConfiguration {
1232                        key: key.to_string(),
1233                        value_type: "float".to_string(),
1234                    }),
1235                },
1236                None => match &self.defaults.float {
1237                    Some(float_type) => match &float_type.float_inverted_index {
1238                        Some(float_inverted_index) => Ok(float_inverted_index.enabled),
1239                        None => Err(SchemaError::MissingIndexConfiguration {
1240                            key: key.to_string(),
1241                            value_type: "float".to_string(),
1242                        }),
1243                    },
1244                    None => Err(SchemaError::MissingIndexConfiguration {
1245                        key: key.to_string(),
1246                        value_type: "float".to_string(),
1247                    }),
1248                },
1249            },
1250            MetadataValueType::Str => match &v_type.string {
1251                Some(string_type) => match &string_type.string_inverted_index {
1252                    Some(string_inverted_index) => Ok(string_inverted_index.enabled),
1253                    None => Err(SchemaError::MissingIndexConfiguration {
1254                        key: key.to_string(),
1255                        value_type: "string".to_string(),
1256                    }),
1257                },
1258                None => match &self.defaults.string {
1259                    Some(string_type) => match &string_type.string_inverted_index {
1260                        Some(string_inverted_index) => Ok(string_inverted_index.enabled),
1261                        None => Err(SchemaError::MissingIndexConfiguration {
1262                            key: key.to_string(),
1263                            value_type: "string".to_string(),
1264                        }),
1265                    },
1266                    None => Err(SchemaError::MissingIndexConfiguration {
1267                        key: key.to_string(),
1268                        value_type: "string".to_string(),
1269                    }),
1270                },
1271            },
1272            MetadataValueType::SparseVector => match &v_type.sparse_vector {
1273                Some(sparse_vector_type) => match &sparse_vector_type.sparse_vector_index {
1274                    Some(sparse_vector_index) => Ok(sparse_vector_index.enabled),
1275                    None => Err(SchemaError::MissingIndexConfiguration {
1276                        key: key.to_string(),
1277                        value_type: "sparse_vector".to_string(),
1278                    }),
1279                },
1280                None => match &self.defaults.sparse_vector {
1281                    Some(sparse_vector_type) => match &sparse_vector_type.sparse_vector_index {
1282                        Some(sparse_vector_index) => Ok(sparse_vector_index.enabled),
1283                        None => Err(SchemaError::MissingIndexConfiguration {
1284                            key: key.to_string(),
1285                            value_type: "sparse_vector".to_string(),
1286                        }),
1287                    },
1288                    None => Err(SchemaError::MissingIndexConfiguration {
1289                        key: key.to_string(),
1290                        value_type: "sparse_vector".to_string(),
1291                    }),
1292                },
1293            },
1294        }
1295    }
1296
1297    pub fn is_metadata_where_indexing_enabled(
1298        &self,
1299        where_clause: &Where,
1300    ) -> Result<(), FilterValidationError> {
1301        match where_clause {
1302            Where::Composite(composite) => {
1303                for child in &composite.children {
1304                    self.is_metadata_where_indexing_enabled(child)?;
1305                }
1306                Ok(())
1307            }
1308            Where::Document(_) => Ok(()),
1309            Where::Metadata(expression) => {
1310                let value_type = match &expression.comparison {
1311                    MetadataComparison::Primitive(_, value) => value.value_type(),
1312                    MetadataComparison::Set(_, set_value) => set_value.value_type(),
1313                };
1314                let is_enabled = self
1315                    .is_metadata_type_index_enabled(expression.key.as_str(), value_type)
1316                    .map_err(FilterValidationError::Schema)?;
1317                if !is_enabled {
1318                    return Err(FilterValidationError::IndexingDisabled {
1319                        key: expression.key.clone(),
1320                        value_type,
1321                    });
1322                }
1323                Ok(())
1324            }
1325        }
1326    }
1327
1328    pub fn is_knn_key_indexing_enabled(
1329        &self,
1330        key: &str,
1331        query: &QueryVector,
1332    ) -> Result<(), FilterValidationError> {
1333        match query {
1334            QueryVector::Sparse(_) => {
1335                let is_enabled = self
1336                    .is_metadata_type_index_enabled(key, MetadataValueType::SparseVector)
1337                    .map_err(FilterValidationError::Schema)?;
1338                if !is_enabled {
1339                    return Err(FilterValidationError::IndexingDisabled {
1340                        key: key.to_string(),
1341                        value_type: MetadataValueType::SparseVector,
1342                    });
1343                }
1344                Ok(())
1345            }
1346            QueryVector::Dense(_) => {
1347                // TODO: once we allow turning off dense vector indexing, we need to check if the key is enabled
1348                // Dense vectors are always indexed
1349                Ok(())
1350            }
1351        }
1352    }
1353
1354    pub fn ensure_key_from_metadata(&mut self, key: &str, value_type: MetadataValueType) -> bool {
1355        let value_types = self.keys.entry(key.to_string()).or_default();
1356        match value_type {
1357            MetadataValueType::Bool => {
1358                if value_types.boolean.is_none() {
1359                    value_types.boolean = self.defaults.boolean.clone();
1360                    return true;
1361                }
1362            }
1363            MetadataValueType::Int => {
1364                if value_types.int.is_none() {
1365                    value_types.int = self.defaults.int.clone();
1366                    return true;
1367                }
1368            }
1369            MetadataValueType::Float => {
1370                if value_types.float.is_none() {
1371                    value_types.float = self.defaults.float.clone();
1372                    return true;
1373                }
1374            }
1375            MetadataValueType::Str => {
1376                if value_types.string.is_none() {
1377                    value_types.string = self.defaults.string.clone();
1378                    return true;
1379                }
1380            }
1381            MetadataValueType::SparseVector => {
1382                if value_types.sparse_vector.is_none() {
1383                    value_types.sparse_vector = self.defaults.sparse_vector.clone();
1384                    return true;
1385                }
1386            }
1387        }
1388        false
1389    }
1390}
1391
1392// ============================================================================
1393// INDEX CONFIGURATION STRUCTURES
1394// ============================================================================
1395
/// Configuration of a vector index: similarity space, embedding function,
/// source key, and the optional HNSW / SPANN algorithm parameters.
///
/// Every field is optional. Fields left as `None` are skipped when
/// serializing, and unknown fields are rejected when deserializing.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[cfg_attr(feature = "utoipa", derive(utoipa::ToSchema))]
#[serde(deny_unknown_fields)]
pub struct VectorIndexConfig {
    /// Vector space for similarity calculation (cosine, l2, ip)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub space: Option<Space>,
    /// Embedding function configuration
    #[serde(skip_serializing_if = "Option::is_none")]
    pub embedding_function: Option<EmbeddingFunctionConfiguration>,
    /// Key to source the vector from
    #[serde(skip_serializing_if = "Option::is_none")]
    pub source_key: Option<String>,
    /// HNSW algorithm configuration
    #[serde(skip_serializing_if = "Option::is_none")]
    pub hnsw: Option<HnswIndexConfig>,
    /// SPANN algorithm configuration
    #[serde(skip_serializing_if = "Option::is_none")]
    pub spann: Option<SpannIndexConfig>,
}
1416
/// Configuration for HNSW vector index algorithm parameters
///
/// All parameters are optional. A `None` parameter is left out of the
/// serialized output, and `deny_unknown_fields` rejects any key that is not
/// declared here during deserialization.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[cfg_attr(feature = "utoipa", derive(utoipa::ToSchema))]
#[serde(deny_unknown_fields)]
pub struct HnswIndexConfig {
    /// Optional `ef_construction` setting for the HNSW index.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub ef_construction: Option<usize>,
    /// Optional `max_neighbors` setting for the HNSW index.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub max_neighbors: Option<usize>,
    /// Optional `ef_search` setting for the HNSW index.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub ef_search: Option<usize>,
    /// Optional `num_threads` setting for the HNSW index.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub num_threads: Option<usize>,
    /// Optional `batch_size` setting for the HNSW index.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub batch_size: Option<usize>,
    /// Optional `sync_threshold` setting for the HNSW index.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub sync_threshold: Option<usize>,
    /// Optional `resize_factor` setting for the HNSW index.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub resize_factor: Option<f64>,
}
1437
/// Configuration for SPANN vector index algorithm parameters
///
/// All parameters are optional. A `None` parameter is left out of the
/// serialized output, and `deny_unknown_fields` rejects any key that is not
/// declared here during deserialization. `into_internal_configuration`
/// replaces each `None` with the crate default named on the field below.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[cfg_attr(feature = "utoipa", derive(utoipa::ToSchema))]
#[serde(deny_unknown_fields)]
pub struct SpannIndexConfig {
    /// Falls back to `default_search_nprobe()` when unset.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub search_nprobe: Option<u32>,
    /// Falls back to `default_search_rng_factor()` when unset.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub search_rng_factor: Option<f32>,
    /// Falls back to `default_search_rng_epsilon()` when unset.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub search_rng_epsilon: Option<f32>,
    /// Falls back to `default_nreplica_count()` when unset.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub nreplica_count: Option<u32>,
    /// Falls back to `default_write_rng_factor()` when unset.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub write_rng_factor: Option<f32>,
    /// Falls back to `default_write_rng_epsilon()` when unset.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub write_rng_epsilon: Option<f32>,
    /// Falls back to `default_split_threshold()` when unset.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub split_threshold: Option<u32>,
    /// Falls back to `default_num_samples_kmeans()` when unset.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub num_samples_kmeans: Option<usize>,
    /// Falls back to `default_initial_lambda()` when unset.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub initial_lambda: Option<f32>,
    /// Falls back to `default_reassign_neighbor_count()` when unset.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub reassign_neighbor_count: Option<u32>,
    /// Falls back to `default_merge_threshold()` when unset.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub merge_threshold: Option<u32>,
    /// Falls back to `default_num_centers_to_merge_to()` when unset.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub num_centers_to_merge_to: Option<u32>,
    /// Falls back to `default_write_nprobe()` when unset.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub write_nprobe: Option<u32>,
    /// Falls back to `default_construction_ef_spann()` when unset.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub ef_construction: Option<usize>,
    /// Falls back to `default_search_ef_spann()` when unset.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub ef_search: Option<usize>,
    /// Falls back to `default_m_spann()` when unset.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub max_neighbors: Option<usize>,
}
1476
1477impl SpannIndexConfig {
1478    pub fn into_internal_configuration(
1479        self,
1480        vector_space: Option<Space>,
1481    ) -> InternalSpannConfiguration {
1482        InternalSpannConfiguration {
1483            search_nprobe: self.search_nprobe.unwrap_or(default_search_nprobe()),
1484            search_rng_factor: self
1485                .search_rng_factor
1486                .unwrap_or(default_search_rng_factor()),
1487            search_rng_epsilon: self
1488                .search_rng_epsilon
1489                .unwrap_or(default_search_rng_epsilon()),
1490            nreplica_count: self.nreplica_count.unwrap_or(default_nreplica_count()),
1491            write_rng_factor: self.write_rng_factor.unwrap_or(default_write_rng_factor()),
1492            write_rng_epsilon: self
1493                .write_rng_epsilon
1494                .unwrap_or(default_write_rng_epsilon()),
1495            split_threshold: self.split_threshold.unwrap_or(default_split_threshold()),
1496            num_samples_kmeans: self
1497                .num_samples_kmeans
1498                .unwrap_or(default_num_samples_kmeans()),
1499            initial_lambda: self.initial_lambda.unwrap_or(default_initial_lambda()),
1500            reassign_neighbor_count: self
1501                .reassign_neighbor_count
1502                .unwrap_or(default_reassign_neighbor_count()),
1503            merge_threshold: self.merge_threshold.unwrap_or(default_merge_threshold()),
1504            num_centers_to_merge_to: self
1505                .num_centers_to_merge_to
1506                .unwrap_or(default_num_centers_to_merge_to()),
1507            write_nprobe: self.write_nprobe.unwrap_or(default_write_nprobe()),
1508            ef_construction: self
1509                .ef_construction
1510                .unwrap_or(default_construction_ef_spann()),
1511            ef_search: self.ef_search.unwrap_or(default_search_ef_spann()),
1512            max_neighbors: self.max_neighbors.unwrap_or(default_m_spann()),
1513            space: vector_space.unwrap_or(default_space()),
1514        }
1515    }
1516}
1517
/// Configuration payload for a sparse vector index (the `config` of a
/// `SparseVectorIndexType`).
///
/// Every field is optional: a field left as `None` is skipped when the value is
/// serialized, and `deny_unknown_fields` makes deserialization fail on any key
/// that is not declared here.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[cfg_attr(feature = "utoipa", derive(utoipa::ToSchema))]
#[serde(deny_unknown_fields)]
pub struct SparseVectorIndexConfig {
    /// Embedding function configuration
    #[serde(skip_serializing_if = "Option::is_none")]
    pub embedding_function: Option<EmbeddingFunctionConfiguration>,
    /// Key to source the sparse vector from
    #[serde(skip_serializing_if = "Option::is_none")]
    pub source_key: Option<String>,
    /// Whether this embedding is BM25
    #[serde(skip_serializing_if = "Option::is_none")]
    pub bm25: Option<bool>,
}
1532
/// Configuration for an FTS index.
///
/// Currently a field-less placeholder. Because of `deny_unknown_fields`, any
/// key present in the input is rejected during deserialization.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[cfg_attr(feature = "utoipa", derive(utoipa::ToSchema))]
#[serde(deny_unknown_fields)]
pub struct FtsIndexConfig {
    // FTS index typically has no additional parameters
}
1539
/// Configuration for a string inverted index.
///
/// Currently a field-less placeholder. Because of `deny_unknown_fields`, any
/// key present in the input is rejected during deserialization.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[cfg_attr(feature = "utoipa", derive(utoipa::ToSchema))]
#[serde(deny_unknown_fields)]
pub struct StringInvertedIndexConfig {
    // String inverted index typically has no additional parameters
}
1546
/// Configuration for an integer inverted index.
///
/// Currently a field-less placeholder. Because of `deny_unknown_fields`, any
/// key present in the input is rejected during deserialization.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[cfg_attr(feature = "utoipa", derive(utoipa::ToSchema))]
#[serde(deny_unknown_fields)]
pub struct IntInvertedIndexConfig {
    // Integer inverted index typically has no additional parameters
}
1553
/// Configuration for a float inverted index.
///
/// Currently a field-less placeholder. Because of `deny_unknown_fields`, any
/// key present in the input is rejected during deserialization.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[cfg_attr(feature = "utoipa", derive(utoipa::ToSchema))]
#[serde(deny_unknown_fields)]
pub struct FloatInvertedIndexConfig {
    // Float inverted index typically has no additional parameters
}
1560
/// Configuration for a boolean inverted index.
///
/// Currently a field-less placeholder. Because of `deny_unknown_fields`, any
/// key present in the input is rejected during deserialization.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[cfg_attr(feature = "utoipa", derive(utoipa::ToSchema))]
#[serde(deny_unknown_fields)]
pub struct BoolInvertedIndexConfig {
    // Boolean inverted index typically has no additional parameters
}
1567
1568#[cfg(test)]
1569mod tests {
1570    use super::*;
1571    use crate::hnsw_configuration::Space;
1572    use crate::metadata::SparseVector;
1573    use crate::{InternalHnswConfiguration, InternalSpannConfiguration};
1574
1575    #[test]
1576    fn test_reconcile_with_defaults_none_user_schema() {
1577        // Test that when no user schema is provided, we get the default schema
1578        let result = InternalSchema::reconcile_with_defaults(None).unwrap();
1579        let expected = InternalSchema::new_default(KnnIndex::Spann);
1580        assert_eq!(result, expected);
1581    }
1582
1583    #[test]
1584    fn test_reconcile_with_defaults_empty_user_schema() {
1585        // Test merging with an empty user schema
1586        let user_schema = InternalSchema {
1587            defaults: ValueTypes::default(),
1588            keys: HashMap::new(),
1589        };
1590
1591        let result = InternalSchema::reconcile_with_defaults(Some(user_schema)).unwrap();
1592        let expected = InternalSchema::new_default(KnnIndex::Spann);
1593        assert_eq!(result, expected);
1594    }
1595
1596    #[test]
1597    fn test_reconcile_with_defaults_user_overrides_string_enabled() {
1598        // Test that user can override string inverted index enabled state
1599        let mut user_schema = InternalSchema {
1600            defaults: ValueTypes::default(),
1601            keys: HashMap::new(),
1602        };
1603
1604        user_schema.defaults.string = Some(StringValueType {
1605            string_inverted_index: Some(StringInvertedIndexType {
1606                enabled: false, // Override default (true) to false
1607                config: StringInvertedIndexConfig {},
1608            }),
1609            fts_index: None,
1610        });
1611
1612        let result = InternalSchema::reconcile_with_defaults(Some(user_schema)).unwrap();
1613
1614        // Check that the user override took precedence
1615        assert!(
1616            !result
1617                .defaults
1618                .string
1619                .as_ref()
1620                .unwrap()
1621                .string_inverted_index
1622                .as_ref()
1623                .unwrap()
1624                .enabled
1625        );
1626        // Check that other defaults are still present
1627        assert!(result.defaults.float.is_some());
1628        assert!(result.defaults.int.is_some());
1629    }
1630
1631    #[test]
1632    fn test_reconcile_with_defaults_user_overrides_vector_config() {
1633        // Test field-level merging for vector configurations
1634        let mut user_schema = InternalSchema {
1635            defaults: ValueTypes::default(),
1636            keys: HashMap::new(),
1637        };
1638
1639        user_schema.defaults.float_list = Some(FloatListValueType {
1640            vector_index: Some(VectorIndexType {
1641                enabled: true, // Enable vector index (default is false)
1642                config: VectorIndexConfig {
1643                    space: Some(Space::L2),                     // Override default space
1644                    embedding_function: None,                   // Will use default
1645                    source_key: Some("custom_key".to_string()), // Override default
1646                    hnsw: Some(HnswIndexConfig {
1647                        ef_construction: Some(500), // Override default
1648                        max_neighbors: None,        // Will use default
1649                        ef_search: None,            // Will use default
1650                        num_threads: None,
1651                        batch_size: None,
1652                        sync_threshold: None,
1653                        resize_factor: None,
1654                    }),
1655                    spann: None,
1656                },
1657            }),
1658        });
1659
1660        // Use HNSW defaults for this test so we have HNSW config to merge with
1661        let result = {
1662            let default_schema = InternalSchema::new_default(KnnIndex::Hnsw);
1663            let merged_defaults =
1664                InternalSchema::merge_value_types(&default_schema.defaults, &user_schema.defaults)
1665                    .unwrap();
1666            let mut merged_keys = default_schema.keys.clone();
1667            for (key, user_value_types) in user_schema.keys {
1668                if let Some(default_value_types) = merged_keys.get(&key) {
1669                    let merged_value_types =
1670                        InternalSchema::merge_value_types(default_value_types, &user_value_types)
1671                            .unwrap();
1672                    merged_keys.insert(key, merged_value_types);
1673                } else {
1674                    merged_keys.insert(key, user_value_types);
1675                }
1676            }
1677            InternalSchema {
1678                defaults: merged_defaults,
1679                keys: merged_keys,
1680            }
1681        };
1682
1683        let vector_config = &result
1684            .defaults
1685            .float_list
1686            .as_ref()
1687            .unwrap()
1688            .vector_index
1689            .as_ref()
1690            .unwrap()
1691            .config;
1692
1693        // Check user overrides took precedence
1694        assert_eq!(vector_config.space, Some(Space::L2));
1695        assert_eq!(vector_config.source_key, Some("custom_key".to_string()));
1696        assert_eq!(
1697            vector_config.hnsw.as_ref().unwrap().ef_construction,
1698            Some(500)
1699        );
1700
1701        // Check defaults were preserved for unspecified fields
1702        assert_eq!(
1703            vector_config.embedding_function,
1704            Some(EmbeddingFunctionConfiguration::Legacy)
1705        );
1706        // Since user provided HNSW config, the default max_neighbors should be merged in
1707        assert_eq!(
1708            vector_config.hnsw.as_ref().unwrap().max_neighbors,
1709            Some(default_m())
1710        );
1711    }
1712
1713    #[test]
1714    fn test_reconcile_with_defaults_keys() {
1715        // Test that key overrides are properly merged
1716        let mut user_schema = InternalSchema {
1717            defaults: ValueTypes::default(),
1718            keys: HashMap::new(),
1719        };
1720
1721        // Add a custom key override
1722        let custom_key_types = ValueTypes {
1723            string: Some(StringValueType {
1724                fts_index: Some(FtsIndexType {
1725                    enabled: true,
1726                    config: FtsIndexConfig {},
1727                }),
1728                string_inverted_index: Some(StringInvertedIndexType {
1729                    enabled: false,
1730                    config: StringInvertedIndexConfig {},
1731                }),
1732            }),
1733            ..Default::default()
1734        };
1735        user_schema
1736            .keys
1737            .insert("custom_key".to_string(), custom_key_types);
1738
1739        let result = InternalSchema::reconcile_with_defaults(Some(user_schema)).unwrap();
1740
1741        // Check that default key overrides are preserved
1742        assert!(result.keys.contains_key(EMBEDDING_KEY));
1743        assert!(result.keys.contains_key(DOCUMENT_KEY));
1744
1745        // Check that user key override was added
1746        assert!(result.keys.contains_key("custom_key"));
1747        let custom_override = result.keys.get("custom_key").unwrap();
1748        assert!(
1749            custom_override
1750                .string
1751                .as_ref()
1752                .unwrap()
1753                .fts_index
1754                .as_ref()
1755                .unwrap()
1756                .enabled
1757        );
1758    }
1759
1760    #[test]
1761    fn test_reconcile_with_defaults_override_existing_key() {
1762        // Test overriding an existing key override (like #embedding)
1763        let mut user_schema = InternalSchema {
1764            defaults: ValueTypes::default(),
1765            keys: HashMap::new(),
1766        };
1767
1768        // Override the #embedding key with custom settings
1769        let embedding_override = ValueTypes {
1770            float_list: Some(FloatListValueType {
1771                vector_index: Some(VectorIndexType {
1772                    enabled: false, // Override default enabled=true to false
1773                    config: VectorIndexConfig {
1774                        space: Some(Space::Ip), // Override default space
1775                        embedding_function: Some(EmbeddingFunctionConfiguration::Legacy),
1776                        source_key: Some("custom_embedding_key".to_string()),
1777                        hnsw: None,
1778                        spann: None,
1779                    },
1780                }),
1781            }),
1782            ..Default::default()
1783        };
1784        user_schema
1785            .keys
1786            .insert(EMBEDDING_KEY.to_string(), embedding_override);
1787
1788        let result = InternalSchema::reconcile_with_defaults(Some(user_schema)).unwrap();
1789
1790        let embedding_config = result.keys.get(EMBEDDING_KEY).unwrap();
1791        let vector_config = &embedding_config
1792            .float_list
1793            .as_ref()
1794            .unwrap()
1795            .vector_index
1796            .as_ref()
1797            .unwrap();
1798
1799        // Check user overrides took precedence
1800        assert!(!vector_config.enabled);
1801        assert_eq!(vector_config.config.space, Some(Space::Ip));
1802        assert_eq!(
1803            vector_config.config.source_key,
1804            Some("custom_embedding_key".to_string())
1805        );
1806    }
1807
1808    #[test]
1809    fn test_ensure_key_from_metadata_no_changes_for_existing_key() {
1810        let mut schema = InternalSchema::new_default(KnnIndex::Hnsw);
1811        let before = schema.clone();
1812        let modified = schema.ensure_key_from_metadata(DOCUMENT_KEY, MetadataValueType::Str);
1813        assert!(!modified);
1814        assert_eq!(schema, before);
1815    }
1816
1817    #[test]
1818    fn test_ensure_key_from_metadata_populates_new_key_with_default_value_type() {
1819        let mut schema = InternalSchema::new_default(KnnIndex::Hnsw);
1820        assert!(!schema.keys.contains_key("custom_field"));
1821
1822        let modified = schema.ensure_key_from_metadata("custom_field", MetadataValueType::Bool);
1823
1824        assert!(modified);
1825        let entry = schema
1826            .keys
1827            .get("custom_field")
1828            .expect("expected new key override to be inserted");
1829        assert_eq!(entry.boolean, schema.defaults.boolean);
1830        assert!(entry.string.is_none());
1831        assert!(entry.int.is_none());
1832        assert!(entry.float.is_none());
1833        assert!(entry.float_list.is_none());
1834        assert!(entry.sparse_vector.is_none());
1835    }
1836
1837    #[test]
1838    fn test_ensure_key_from_metadata_adds_missing_value_type_to_existing_key() {
1839        let mut schema = InternalSchema::new_default(KnnIndex::Hnsw);
1840        let initial_len = schema.keys.len();
1841        schema.keys.insert(
1842            "custom_field".to_string(),
1843            ValueTypes {
1844                string: schema.defaults.string.clone(),
1845                ..Default::default()
1846            },
1847        );
1848
1849        let modified = schema.ensure_key_from_metadata("custom_field", MetadataValueType::Bool);
1850
1851        assert!(modified);
1852        assert_eq!(schema.keys.len(), initial_len + 1);
1853        let entry = schema
1854            .keys
1855            .get("custom_field")
1856            .expect("expected key override to exist after ensure call");
1857        assert!(entry.string.is_some());
1858        assert_eq!(entry.boolean, schema.defaults.boolean);
1859    }
1860
1861    #[test]
1862    fn test_is_knn_key_indexing_enabled_sparse_disabled_errors() {
1863        let schema = InternalSchema::new_default(KnnIndex::Spann);
1864        let result = schema.is_knn_key_indexing_enabled(
1865            "custom_sparse",
1866            &QueryVector::Sparse(SparseVector::new(vec![0_u32], vec![1.0_f32])),
1867        );
1868
1869        let err = result.expect_err("expected indexing disabled error");
1870        match err {
1871            FilterValidationError::IndexingDisabled { key, value_type } => {
1872                assert_eq!(key, "custom_sparse");
1873                assert_eq!(value_type, crate::metadata::MetadataValueType::SparseVector);
1874            }
1875            other => panic!("unexpected error variant: {other:?}"),
1876        }
1877    }
1878
1879    #[test]
1880    fn test_is_knn_key_indexing_enabled_sparse_enabled_succeeds() {
1881        let mut schema = InternalSchema::new_default(KnnIndex::Spann);
1882        schema.keys.insert(
1883            "sparse_enabled".to_string(),
1884            ValueTypes {
1885                sparse_vector: Some(SparseVectorValueType {
1886                    sparse_vector_index: Some(SparseVectorIndexType {
1887                        enabled: true,
1888                        config: SparseVectorIndexConfig {
1889                            embedding_function: Some(EmbeddingFunctionConfiguration::Legacy),
1890                            source_key: None,
1891                            bm25: None,
1892                        },
1893                    }),
1894                }),
1895                ..Default::default()
1896            },
1897        );
1898
1899        let result = schema.is_knn_key_indexing_enabled(
1900            "sparse_enabled",
1901            &QueryVector::Sparse(SparseVector::new(vec![0_u32], vec![1.0_f32])),
1902        );
1903
1904        assert!(result.is_ok());
1905    }
1906
1907    #[test]
1908    fn test_is_knn_key_indexing_enabled_dense_succeeds() {
1909        let schema = InternalSchema::new_default(KnnIndex::Spann);
1910        let result = schema.is_knn_key_indexing_enabled(
1911            EMBEDDING_KEY,
1912            &QueryVector::Dense(vec![0.1_f32, 0.2_f32]),
1913        );
1914
1915        assert!(result.is_ok());
1916    }
1917
1918    #[test]
1919    fn test_merge_hnsw_configs_field_level() {
1920        // Test field-level merging for HNSW configurations
1921        let default_hnsw = HnswIndexConfig {
1922            ef_construction: Some(200),
1923            max_neighbors: Some(16),
1924            ef_search: Some(10),
1925            num_threads: Some(4),
1926            batch_size: Some(100),
1927            sync_threshold: Some(1000),
1928            resize_factor: Some(1.2),
1929        };
1930
1931        let user_hnsw = HnswIndexConfig {
1932            ef_construction: Some(300), // Override
1933            max_neighbors: None,        // Will use default
1934            ef_search: Some(20),        // Override
1935            num_threads: None,          // Will use default
1936            batch_size: None,           // Will use default
1937            sync_threshold: Some(2000), // Override
1938            resize_factor: None,        // Will use default
1939        };
1940
1941        let result =
1942            InternalSchema::merge_hnsw_configs(Some(&default_hnsw), Some(&user_hnsw)).unwrap();
1943
1944        // Check user overrides
1945        assert_eq!(result.ef_construction, Some(300));
1946        assert_eq!(result.ef_search, Some(20));
1947        assert_eq!(result.sync_threshold, Some(2000));
1948
1949        // Check defaults preserved
1950        assert_eq!(result.max_neighbors, Some(16));
1951        assert_eq!(result.num_threads, Some(4));
1952        assert_eq!(result.batch_size, Some(100));
1953        assert_eq!(result.resize_factor, Some(1.2));
1954    }
1955
1956    #[test]
1957    fn test_merge_spann_configs_field_level() {
1958        // Test field-level merging for SPANN configurations
1959        let default_spann = SpannIndexConfig {
1960            search_nprobe: Some(10),
1961            search_rng_factor: Some(2.0),
1962            search_rng_epsilon: Some(0.1),
1963            nreplica_count: Some(3),
1964            write_rng_factor: Some(1.5),
1965            write_rng_epsilon: Some(0.05),
1966            split_threshold: Some(1000),
1967            num_samples_kmeans: Some(100),
1968            initial_lambda: Some(0.5),
1969            reassign_neighbor_count: Some(50),
1970            merge_threshold: Some(500),
1971            num_centers_to_merge_to: Some(10),
1972            write_nprobe: Some(5),
1973            ef_construction: Some(200),
1974            ef_search: Some(10),
1975            max_neighbors: Some(16),
1976        };
1977
1978        let user_spann = SpannIndexConfig {
1979            search_nprobe: Some(20),       // Override
1980            search_rng_factor: None,       // Will use default
1981            search_rng_epsilon: Some(0.2), // Override
1982            nreplica_count: None,          // Will use default
1983            write_rng_factor: None,
1984            write_rng_epsilon: None,
1985            split_threshold: Some(2000), // Override
1986            num_samples_kmeans: None,
1987            initial_lambda: None,
1988            reassign_neighbor_count: None,
1989            merge_threshold: None,
1990            num_centers_to_merge_to: None,
1991            write_nprobe: None,
1992            ef_construction: None,
1993            ef_search: None,
1994            max_neighbors: None,
1995        };
1996
1997        let result =
1998            InternalSchema::merge_spann_configs(Some(&default_spann), Some(&user_spann)).unwrap();
1999
2000        // Check user overrides
2001        assert_eq!(result.search_nprobe, Some(20));
2002        assert_eq!(result.search_rng_epsilon, Some(0.2));
2003        assert_eq!(result.split_threshold, Some(2000));
2004
2005        // Check defaults preserved
2006        assert_eq!(result.search_rng_factor, Some(2.0));
2007        assert_eq!(result.nreplica_count, Some(3));
2008        assert_eq!(result.initial_lambda, Some(0.5));
2009    }
2010
2011    #[test]
2012    fn test_spann_index_config_into_internal_configuration() {
2013        let config = SpannIndexConfig {
2014            search_nprobe: Some(33),
2015            search_rng_factor: Some(1.2),
2016            search_rng_epsilon: None,
2017            nreplica_count: None,
2018            write_rng_factor: Some(1.5),
2019            write_rng_epsilon: None,
2020            split_threshold: Some(75),
2021            num_samples_kmeans: None,
2022            initial_lambda: Some(0.9),
2023            reassign_neighbor_count: Some(40),
2024            merge_threshold: None,
2025            num_centers_to_merge_to: Some(4),
2026            write_nprobe: Some(60),
2027            ef_construction: Some(180),
2028            ef_search: Some(170),
2029            max_neighbors: Some(32),
2030        };
2031
2032        let with_space = config
2033            .clone()
2034            .into_internal_configuration(Some(Space::Cosine));
2035        assert_eq!(with_space.space, Space::Cosine);
2036        assert_eq!(with_space.search_nprobe, 33);
2037        assert_eq!(with_space.search_rng_factor, 1.2);
2038        assert_eq!(with_space.search_rng_epsilon, default_search_rng_epsilon());
2039        assert_eq!(with_space.write_rng_factor, 1.5);
2040        assert_eq!(with_space.write_nprobe, 60);
2041        assert_eq!(with_space.ef_construction, 180);
2042        assert_eq!(with_space.ef_search, 170);
2043        assert_eq!(with_space.max_neighbors, 32);
2044        assert_eq!(with_space.merge_threshold, default_merge_threshold());
2045
2046        let default_space_config = config.into_internal_configuration(None);
2047        assert_eq!(default_space_config.space, default_space());
2048    }
2049
2050    #[test]
2051    fn test_merge_string_type_combinations() {
2052        // Test all combinations of default and user StringValueType
2053
2054        // Both Some - should merge
2055        let default = StringValueType {
2056            string_inverted_index: Some(StringInvertedIndexType {
2057                enabled: true,
2058                config: StringInvertedIndexConfig {},
2059            }),
2060            fts_index: Some(FtsIndexType {
2061                enabled: false,
2062                config: FtsIndexConfig {},
2063            }),
2064        };
2065
2066        let user = StringValueType {
2067            string_inverted_index: Some(StringInvertedIndexType {
2068                enabled: false, // Override
2069                config: StringInvertedIndexConfig {},
2070            }),
2071            fts_index: None, // Will use default
2072        };
2073
2074        let result = InternalSchema::merge_string_type(Some(&default), Some(&user))
2075            .unwrap()
2076            .unwrap();
2077        assert!(!result.string_inverted_index.as_ref().unwrap().enabled); // User override
2078        assert!(!result.fts_index.as_ref().unwrap().enabled); // Default preserved
2079
2080        // Default Some, User None - should return default
2081        let result = InternalSchema::merge_string_type(Some(&default), None)
2082            .unwrap()
2083            .unwrap();
2084        assert!(result.string_inverted_index.as_ref().unwrap().enabled);
2085
2086        // Default None, User Some - should return user
2087        let result = InternalSchema::merge_string_type(None, Some(&user))
2088            .unwrap()
2089            .unwrap();
2090        assert!(!result.string_inverted_index.as_ref().unwrap().enabled);
2091
2092        // Both None - should return None
2093        let result = InternalSchema::merge_string_type(None, None).unwrap();
2094        assert!(result.is_none());
2095    }
2096
2097    #[test]
2098    fn test_merge_vector_index_config_comprehensive() {
2099        // Test comprehensive vector index config merging
2100        let default_config = VectorIndexConfig {
2101            space: Some(Space::Cosine),
2102            embedding_function: Some(EmbeddingFunctionConfiguration::Legacy),
2103            source_key: Some("default_key".to_string()),
2104            hnsw: Some(HnswIndexConfig {
2105                ef_construction: Some(200),
2106                max_neighbors: Some(16),
2107                ef_search: Some(10),
2108                num_threads: Some(4),
2109                batch_size: Some(100),
2110                sync_threshold: Some(1000),
2111                resize_factor: Some(1.2),
2112            }),
2113            spann: None,
2114        };
2115
2116        let user_config = VectorIndexConfig {
2117            space: Some(Space::L2),                   // Override
2118            embedding_function: None,                 // Will use default
2119            source_key: Some("user_key".to_string()), // Override
2120            hnsw: Some(HnswIndexConfig {
2121                ef_construction: Some(300), // Override
2122                max_neighbors: None,        // Will use default
2123                ef_search: None,            // Will use default
2124                num_threads: None,
2125                batch_size: None,
2126                sync_threshold: None,
2127                resize_factor: None,
2128            }),
2129            spann: Some(SpannIndexConfig {
2130                search_nprobe: Some(15),
2131                search_rng_factor: None,
2132                search_rng_epsilon: None,
2133                nreplica_count: None,
2134                write_rng_factor: None,
2135                write_rng_epsilon: None,
2136                split_threshold: None,
2137                num_samples_kmeans: None,
2138                initial_lambda: None,
2139                reassign_neighbor_count: None,
2140                merge_threshold: None,
2141                num_centers_to_merge_to: None,
2142                write_nprobe: None,
2143                ef_construction: None,
2144                ef_search: None,
2145                max_neighbors: None,
2146            }), // Add SPANN config
2147        };
2148
2149        let result =
2150            InternalSchema::merge_vector_index_config(&default_config, &user_config).unwrap();
2151
2152        // Check field-level merging
2153        assert_eq!(result.space, Some(Space::L2)); // User override
2154        assert_eq!(
2155            result.embedding_function,
2156            Some(EmbeddingFunctionConfiguration::Legacy)
2157        ); // Default preserved
2158        assert_eq!(result.source_key, Some("user_key".to_string())); // User override
2159
2160        // Check HNSW merging
2161        assert_eq!(result.hnsw.as_ref().unwrap().ef_construction, Some(300)); // User override
2162        assert_eq!(result.hnsw.as_ref().unwrap().max_neighbors, Some(16)); // Default preserved
2163
2164        // Check SPANN was added from user
2165        assert!(result.spann.is_some());
2166        assert_eq!(result.spann.as_ref().unwrap().search_nprobe, Some(15));
2167    }
2168
2169    #[test]
2170    fn test_merge_sparse_vector_index_config() {
2171        // Test sparse vector index config merging
2172        let default_config = SparseVectorIndexConfig {
2173            embedding_function: Some(EmbeddingFunctionConfiguration::Legacy),
2174            source_key: Some("default_sparse_key".to_string()),
2175            bm25: None,
2176        };
2177
2178        let user_config = SparseVectorIndexConfig {
2179            embedding_function: None,                        // Will use default
2180            source_key: Some("user_sparse_key".to_string()), // Override
2181            bm25: None,
2182        };
2183
2184        let result =
2185            InternalSchema::merge_sparse_vector_index_config(&default_config, &user_config)
2186                .unwrap();
2187
2188        // Check user override
2189        assert_eq!(result.source_key, Some("user_sparse_key".to_string()));
2190        // Check default preserved
2191        assert_eq!(
2192            result.embedding_function,
2193            Some(EmbeddingFunctionConfiguration::Legacy)
2194        );
2195    }
2196
2197    #[test]
2198    fn test_complex_nested_merging_scenario() {
2199        // Test a complex scenario with multiple levels of merging
2200        let mut user_schema = InternalSchema {
2201            defaults: ValueTypes::default(),
2202            keys: HashMap::new(),
2203        };
2204
2205        // Set up complex user defaults
2206        user_schema.defaults.string = Some(StringValueType {
2207            string_inverted_index: Some(StringInvertedIndexType {
2208                enabled: false,
2209                config: StringInvertedIndexConfig {},
2210            }),
2211            fts_index: Some(FtsIndexType {
2212                enabled: true,
2213                config: FtsIndexConfig {},
2214            }),
2215        });
2216
2217        user_schema.defaults.float_list = Some(FloatListValueType {
2218            vector_index: Some(VectorIndexType {
2219                enabled: true,
2220                config: VectorIndexConfig {
2221                    space: Some(Space::Ip),
2222                    embedding_function: None, // Will use default
2223                    source_key: Some("custom_vector_key".to_string()),
2224                    hnsw: Some(HnswIndexConfig {
2225                        ef_construction: Some(400),
2226                        max_neighbors: Some(32),
2227                        ef_search: None, // Will use default
2228                        num_threads: None,
2229                        batch_size: None,
2230                        sync_threshold: None,
2231                        resize_factor: None,
2232                    }),
2233                    spann: None,
2234                },
2235            }),
2236        });
2237
2238        // Set up key overrides
2239        let custom_key_override = ValueTypes {
2240            string: Some(StringValueType {
2241                fts_index: Some(FtsIndexType {
2242                    enabled: true,
2243                    config: FtsIndexConfig {},
2244                }),
2245                string_inverted_index: None,
2246            }),
2247            ..Default::default()
2248        };
2249        user_schema
2250            .keys
2251            .insert("custom_field".to_string(), custom_key_override);
2252
2253        // Use HNSW defaults for this test so we have HNSW config to merge with
2254        let result = {
2255            let default_schema = InternalSchema::new_default(KnnIndex::Hnsw);
2256            let merged_defaults =
2257                InternalSchema::merge_value_types(&default_schema.defaults, &user_schema.defaults)
2258                    .unwrap();
2259            let mut merged_keys = default_schema.keys.clone();
2260            for (key, user_value_types) in user_schema.keys {
2261                if let Some(default_value_types) = merged_keys.get(&key) {
2262                    let merged_value_types =
2263                        InternalSchema::merge_value_types(default_value_types, &user_value_types)
2264                            .unwrap();
2265                    merged_keys.insert(key, merged_value_types);
2266                } else {
2267                    merged_keys.insert(key, user_value_types);
2268                }
2269            }
2270            InternalSchema {
2271                defaults: merged_defaults,
2272                keys: merged_keys,
2273            }
2274        };
2275
2276        // Verify complex merging worked correctly
2277
2278        // Check defaults merging
2279        assert!(
2280            !result
2281                .defaults
2282                .string
2283                .as_ref()
2284                .unwrap()
2285                .string_inverted_index
2286                .as_ref()
2287                .unwrap()
2288                .enabled
2289        );
2290        assert!(
2291            result
2292                .defaults
2293                .string
2294                .as_ref()
2295                .unwrap()
2296                .fts_index
2297                .as_ref()
2298                .unwrap()
2299                .enabled
2300        );
2301
2302        let vector_config = &result
2303            .defaults
2304            .float_list
2305            .as_ref()
2306            .unwrap()
2307            .vector_index
2308            .as_ref()
2309            .unwrap()
2310            .config;
2311        assert_eq!(vector_config.space, Some(Space::Ip));
2312        assert_eq!(
2313            vector_config.embedding_function,
2314            Some(EmbeddingFunctionConfiguration::Legacy)
2315        ); // Default preserved
2316        assert_eq!(
2317            vector_config.source_key,
2318            Some("custom_vector_key".to_string())
2319        );
2320        assert_eq!(
2321            vector_config.hnsw.as_ref().unwrap().ef_construction,
2322            Some(400)
2323        );
2324        assert_eq!(vector_config.hnsw.as_ref().unwrap().max_neighbors, Some(32));
2325        assert_eq!(
2326            vector_config.hnsw.as_ref().unwrap().ef_search,
2327            Some(default_search_ef())
2328        ); // Default preserved
2329
2330        // Check key overrides
2331        assert!(result.keys.contains_key(EMBEDDING_KEY)); // Default preserved
2332        assert!(result.keys.contains_key(DOCUMENT_KEY)); // Default preserved
2333        assert!(result.keys.contains_key("custom_field")); // User added
2334
2335        let custom_override = result.keys.get("custom_field").unwrap();
2336        assert!(
2337            custom_override
2338                .string
2339                .as_ref()
2340                .unwrap()
2341                .fts_index
2342                .as_ref()
2343                .unwrap()
2344                .enabled
2345        );
2346        assert!(custom_override
2347            .string
2348            .as_ref()
2349            .unwrap()
2350            .string_inverted_index
2351            .is_none());
2352    }
2353
2354    #[test]
2355    fn test_reconcile_with_collection_config_default_config() {
2356        // Test that when collection config is default, schema is returned as-is
2357        let schema = InternalSchema::new_default(KnnIndex::Hnsw);
2358        let collection_config = InternalCollectionConfiguration::default_hnsw();
2359
2360        let result =
2361            InternalSchema::reconcile_with_collection_config(schema.clone(), collection_config)
2362                .unwrap();
2363        assert_eq!(result, schema);
2364    }
2365
2366    #[test]
2367    fn test_reconcile_with_collection_config_both_non_default() {
2368        // Test that when both schema and collection config are non-default, it returns an error
2369        let mut schema = InternalSchema::new_default(KnnIndex::Hnsw);
2370        schema.defaults.string = Some(StringValueType {
2371            fts_index: Some(FtsIndexType {
2372                enabled: true,
2373                config: FtsIndexConfig {},
2374            }),
2375            string_inverted_index: None,
2376        });
2377
2378        let mut collection_config = InternalCollectionConfiguration::default_hnsw();
2379        // Make collection config non-default by changing a parameter
2380        if let VectorIndexConfiguration::Hnsw(ref mut hnsw_config) = collection_config.vector_index
2381        {
2382            hnsw_config.ef_construction = 500; // Non-default value
2383        }
2384
2385        let result = InternalSchema::reconcile_with_collection_config(schema, collection_config);
2386        assert!(result.is_err());
2387        assert_eq!(
2388            result.unwrap_err(),
2389            "Cannot set both collection config and schema at the same time"
2390        );
2391    }
2392
2393    #[test]
2394    fn test_reconcile_with_collection_config_hnsw_override() {
2395        // Test that non-default HNSW collection config overrides default schema
2396        let schema = InternalSchema::new_default(KnnIndex::Hnsw); // Use actual default schema
2397
2398        let collection_config = InternalCollectionConfiguration {
2399            vector_index: VectorIndexConfiguration::Hnsw(InternalHnswConfiguration {
2400                ef_construction: 300,
2401                max_neighbors: 32,
2402                ef_search: 50,
2403                num_threads: 8,
2404                batch_size: 200,
2405                sync_threshold: 2000,
2406                resize_factor: 1.5,
2407                space: Space::L2,
2408            }),
2409            embedding_function: Some(EmbeddingFunctionConfiguration::Legacy),
2410        };
2411
2412        let result =
2413            InternalSchema::reconcile_with_collection_config(schema, collection_config).unwrap();
2414
2415        // Check that #embedding key override was created with the collection config settings
2416        let embedding_override = result.keys.get(EMBEDDING_KEY).unwrap();
2417        let vector_index = embedding_override
2418            .float_list
2419            .as_ref()
2420            .unwrap()
2421            .vector_index
2422            .as_ref()
2423            .unwrap();
2424
2425        assert!(vector_index.enabled);
2426        assert_eq!(vector_index.config.space, Some(Space::L2));
2427        assert_eq!(
2428            vector_index.config.embedding_function,
2429            Some(EmbeddingFunctionConfiguration::Legacy)
2430        );
2431        assert_eq!(
2432            vector_index.config.source_key,
2433            Some(DOCUMENT_KEY.to_string())
2434        );
2435
2436        let hnsw_config = vector_index.config.hnsw.as_ref().unwrap();
2437        assert_eq!(hnsw_config.ef_construction, Some(300));
2438        assert_eq!(hnsw_config.max_neighbors, Some(32));
2439        assert_eq!(hnsw_config.ef_search, Some(50));
2440        assert_eq!(hnsw_config.num_threads, Some(8));
2441        assert_eq!(hnsw_config.batch_size, Some(200));
2442        assert_eq!(hnsw_config.sync_threshold, Some(2000));
2443        assert_eq!(hnsw_config.resize_factor, Some(1.5));
2444
2445        assert!(vector_index.config.spann.is_none());
2446    }
2447
2448    #[test]
2449    fn test_reconcile_with_collection_config_spann_override() {
2450        // Test that non-default SPANN collection config overrides default schema
2451        let schema = InternalSchema::new_default(KnnIndex::Spann); // Use actual default schema
2452
2453        let collection_config = InternalCollectionConfiguration {
2454            vector_index: VectorIndexConfiguration::Spann(InternalSpannConfiguration {
2455                search_nprobe: 20,
2456                search_rng_factor: 3.0,
2457                search_rng_epsilon: 0.2,
2458                nreplica_count: 5,
2459                write_rng_factor: 2.0,
2460                write_rng_epsilon: 0.1,
2461                split_threshold: 2000,
2462                num_samples_kmeans: 200,
2463                initial_lambda: 0.8,
2464                reassign_neighbor_count: 100,
2465                merge_threshold: 800,
2466                num_centers_to_merge_to: 20,
2467                write_nprobe: 10,
2468                ef_construction: 400,
2469                ef_search: 60,
2470                max_neighbors: 24,
2471                space: Space::Cosine,
2472            }),
2473            embedding_function: None,
2474        };
2475
2476        let result =
2477            InternalSchema::reconcile_with_collection_config(schema, collection_config).unwrap();
2478
2479        // Check that #embedding key override was created with the collection config settings
2480        let embedding_override = result.keys.get(EMBEDDING_KEY).unwrap();
2481        let vector_index = embedding_override
2482            .float_list
2483            .as_ref()
2484            .unwrap()
2485            .vector_index
2486            .as_ref()
2487            .unwrap();
2488
2489        assert!(vector_index.enabled);
2490        assert_eq!(vector_index.config.space, Some(Space::Cosine));
2491        assert_eq!(vector_index.config.embedding_function, None);
2492        assert_eq!(
2493            vector_index.config.source_key,
2494            Some(DOCUMENT_KEY.to_string())
2495        );
2496
2497        assert!(vector_index.config.hnsw.is_none());
2498
2499        let spann_config = vector_index.config.spann.as_ref().unwrap();
2500        assert_eq!(spann_config.search_nprobe, Some(20));
2501        assert_eq!(spann_config.search_rng_factor, Some(3.0));
2502        assert_eq!(spann_config.search_rng_epsilon, Some(0.2));
2503        assert_eq!(spann_config.nreplica_count, Some(5));
2504        assert_eq!(spann_config.write_rng_factor, Some(2.0));
2505        assert_eq!(spann_config.write_rng_epsilon, Some(0.1));
2506        assert_eq!(spann_config.split_threshold, Some(2000));
2507        assert_eq!(spann_config.num_samples_kmeans, Some(200));
2508        assert_eq!(spann_config.initial_lambda, Some(0.8));
2509        assert_eq!(spann_config.reassign_neighbor_count, Some(100));
2510        assert_eq!(spann_config.merge_threshold, Some(800));
2511        assert_eq!(spann_config.num_centers_to_merge_to, Some(20));
2512        assert_eq!(spann_config.write_nprobe, Some(10));
2513        assert_eq!(spann_config.ef_construction, Some(400));
2514        assert_eq!(spann_config.ef_search, Some(60));
2515        assert_eq!(spann_config.max_neighbors, Some(24));
2516    }
2517
2518    #[test]
2519    fn test_reconcile_with_collection_config_updates_both_defaults_and_embedding() {
2520        // Test that collection config updates BOTH defaults.float_list.vector_index
2521        // AND keys["embedding"].float_list.vector_index
2522        let schema = InternalSchema::new_default(KnnIndex::Hnsw);
2523
2524        let collection_config = InternalCollectionConfiguration {
2525            vector_index: VectorIndexConfiguration::Hnsw(InternalHnswConfiguration {
2526                ef_construction: 300,
2527                max_neighbors: 32,
2528                ef_search: 50,
2529                num_threads: 8,
2530                batch_size: 200,
2531                sync_threshold: 2000,
2532                resize_factor: 1.5,
2533                space: Space::L2,
2534            }),
2535            embedding_function: Some(EmbeddingFunctionConfiguration::Legacy),
2536        };
2537
2538        let result =
2539            InternalSchema::reconcile_with_collection_config(schema, collection_config).unwrap();
2540
2541        // Check that defaults.float_list.vector_index was updated
2542        let defaults_vector_index = result
2543            .defaults
2544            .float_list
2545            .as_ref()
2546            .unwrap()
2547            .vector_index
2548            .as_ref()
2549            .unwrap();
2550
2551        // Should be disabled in defaults (template for new keys)
2552        assert!(!defaults_vector_index.enabled);
2553        // But config should be updated
2554        assert_eq!(defaults_vector_index.config.space, Some(Space::L2));
2555        assert_eq!(
2556            defaults_vector_index.config.embedding_function,
2557            Some(EmbeddingFunctionConfiguration::Legacy)
2558        );
2559        assert_eq!(
2560            defaults_vector_index.config.source_key,
2561            Some(DOCUMENT_KEY.to_string())
2562        );
2563        let defaults_hnsw = defaults_vector_index.config.hnsw.as_ref().unwrap();
2564        assert_eq!(defaults_hnsw.ef_construction, Some(300));
2565        assert_eq!(defaults_hnsw.max_neighbors, Some(32));
2566
2567        // Check that #embedding key override was also updated
2568        let embedding_override = result.keys.get(EMBEDDING_KEY).unwrap();
2569        let embedding_vector_index = embedding_override
2570            .float_list
2571            .as_ref()
2572            .unwrap()
2573            .vector_index
2574            .as_ref()
2575            .unwrap();
2576
2577        // Should be enabled on #embedding
2578        assert!(embedding_vector_index.enabled);
2579        // Config should match defaults
2580        assert_eq!(embedding_vector_index.config.space, Some(Space::L2));
2581        assert_eq!(
2582            embedding_vector_index.config.embedding_function,
2583            Some(EmbeddingFunctionConfiguration::Legacy)
2584        );
2585        assert_eq!(
2586            embedding_vector_index.config.source_key,
2587            Some(DOCUMENT_KEY.to_string())
2588        );
2589        let embedding_hnsw = embedding_vector_index.config.hnsw.as_ref().unwrap();
2590        assert_eq!(embedding_hnsw.ef_construction, Some(300));
2591        assert_eq!(embedding_hnsw.max_neighbors, Some(32));
2592    }
2593
2594    #[test]
2595    fn test_is_schema_default() {
2596        // Test that actual default schemas are correctly identified
2597        let default_hnsw_schema = InternalSchema::new_default(KnnIndex::Hnsw);
2598        assert!(InternalSchema::is_schema_default(&default_hnsw_schema));
2599
2600        let default_spann_schema = InternalSchema::new_default(KnnIndex::Spann);
2601        assert!(InternalSchema::is_schema_default(&default_spann_schema));
2602
2603        // Test that an empty schema is NOT considered default (since it doesn't match new_default structure)
2604        let empty_schema = InternalSchema {
2605            defaults: ValueTypes::default(),
2606            keys: HashMap::new(),
2607        };
2608        assert!(!InternalSchema::is_schema_default(&empty_schema));
2609
2610        // Test that a modified default schema is not considered default
2611        let mut modified_schema = InternalSchema::new_default(KnnIndex::Hnsw);
2612        // Make a clear modification - change the string inverted index enabled state
2613        if let Some(ref mut string_type) = modified_schema.defaults.string {
2614            if let Some(ref mut string_inverted) = string_type.string_inverted_index {
2615                string_inverted.enabled = false; // Default is true, so this should make it non-default
2616            }
2617        }
2618        assert!(!InternalSchema::is_schema_default(&modified_schema));
2619
2620        // Test that schema with additional key overrides is not default
2621        let mut schema_with_extra_overrides = InternalSchema::new_default(KnnIndex::Hnsw);
2622        schema_with_extra_overrides
2623            .keys
2624            .insert("custom_key".to_string(), ValueTypes::default());
2625        assert!(!InternalSchema::is_schema_default(
2626            &schema_with_extra_overrides
2627        ));
2628    }
2629
2630    #[test]
2631    fn test_add_merges_keys_by_value_type() {
2632        let mut schema_a = InternalSchema::new_default(KnnIndex::Hnsw);
2633        let mut schema_b = InternalSchema::new_default(KnnIndex::Hnsw);
2634
2635        let string_override = ValueTypes {
2636            string: Some(StringValueType {
2637                string_inverted_index: Some(StringInvertedIndexType {
2638                    enabled: true,
2639                    config: StringInvertedIndexConfig {},
2640                }),
2641                fts_index: None,
2642            }),
2643            ..Default::default()
2644        };
2645        schema_a
2646            .keys
2647            .insert("custom_field".to_string(), string_override);
2648
2649        let float_override = ValueTypes {
2650            float: Some(FloatValueType {
2651                float_inverted_index: Some(FloatInvertedIndexType {
2652                    enabled: true,
2653                    config: FloatInvertedIndexConfig {},
2654                }),
2655            }),
2656            ..Default::default()
2657        };
2658        schema_b
2659            .keys
2660            .insert("custom_field".to_string(), float_override);
2661
2662        let merged = schema_a.merge(&schema_b).unwrap();
2663        let merged_override = merged.keys.get("custom_field").unwrap();
2664
2665        assert!(merged_override.string.is_some());
2666        assert!(merged_override.float.is_some());
2667        assert!(
2668            merged_override
2669                .string
2670                .as_ref()
2671                .unwrap()
2672                .string_inverted_index
2673                .as_ref()
2674                .unwrap()
2675                .enabled
2676        );
2677        assert!(
2678            merged_override
2679                .float
2680                .as_ref()
2681                .unwrap()
2682                .float_inverted_index
2683                .as_ref()
2684                .unwrap()
2685                .enabled
2686        );
2687    }
2688
2689    #[test]
2690    fn test_add_rejects_different_defaults() {
2691        let schema_a = InternalSchema::new_default(KnnIndex::Hnsw);
2692        let mut schema_b = InternalSchema::new_default(KnnIndex::Hnsw);
2693
2694        if let Some(string_type) = schema_b.defaults.string.as_mut() {
2695            if let Some(string_index) = string_type.string_inverted_index.as_mut() {
2696                string_index.enabled = false;
2697            }
2698        }
2699
2700        let err = schema_a.merge(&schema_b).unwrap_err();
2701        match err {
2702            SchemaError::InvalidSchema { reason } => {
2703                assert_eq!(reason, "Cannot merge schemas with differing defaults")
2704            }
2705            _ => panic!("Expected InvalidSchema error"),
2706        }
2707    }
2708
2709    #[test]
2710    fn test_add_detects_conflicting_value_type_configuration() {
2711        let mut schema_a = InternalSchema::new_default(KnnIndex::Hnsw);
2712        let mut schema_b = InternalSchema::new_default(KnnIndex::Hnsw);
2713
2714        let string_override_enabled = ValueTypes {
2715            string: Some(StringValueType {
2716                string_inverted_index: Some(StringInvertedIndexType {
2717                    enabled: true,
2718                    config: StringInvertedIndexConfig {},
2719                }),
2720                fts_index: None,
2721            }),
2722            ..Default::default()
2723        };
2724        schema_a
2725            .keys
2726            .insert("custom_field".to_string(), string_override_enabled);
2727
2728        let string_override_disabled = ValueTypes {
2729            string: Some(StringValueType {
2730                string_inverted_index: Some(StringInvertedIndexType {
2731                    enabled: false,
2732                    config: StringInvertedIndexConfig {},
2733                }),
2734                fts_index: None,
2735            }),
2736            ..Default::default()
2737        };
2738        schema_b
2739            .keys
2740            .insert("custom_field".to_string(), string_override_disabled);
2741
2742        let err = schema_a.merge(&schema_b).unwrap_err();
2743        match err {
2744            SchemaError::InvalidSchema { reason } => {
2745                assert!(reason.contains("Conflicting configuration"));
2746            }
2747            _ => panic!("Expected InvalidSchema error"),
2748        }
2749    }
2750
2751    // TODO(Sanket): Remove this test once deployed
2752    #[test]
2753    fn test_backward_compatibility_aliases() {
2754        // Test that old format with # and $ prefixes and key_overrides can be deserialized
2755        let old_format_json = r###"{
2756            "defaults": {
2757                "#string": {
2758                    "$fts_index": {
2759                        "enabled": true,
2760                        "config": {}
2761                    }
2762                },
2763                "#int": {
2764                    "$int_inverted_index": {
2765                        "enabled": true,
2766                        "config": {}
2767                    }
2768                },
2769                "#float_list": {
2770                    "$vector_index": {
2771                        "enabled": true,
2772                        "config": {
2773                            "spann": {
2774                                "search_nprobe": 10
2775                            }
2776                        }
2777                    }
2778                }
2779            },
2780            "key_overrides": {
2781                "#document": {
2782                    "#string": {
2783                        "$fts_index": {
2784                            "enabled": false,
2785                            "config": {}
2786                        }
2787                    }
2788                }
2789            }
2790        }"###;
2791
2792        let schema_from_old: InternalSchema = serde_json::from_str(old_format_json).unwrap();
2793
2794        // Test that new format without prefixes and keys can be deserialized
2795        let new_format_json = r###"{
2796            "defaults": {
2797                "string": {
2798                    "fts_index": {
2799                        "enabled": true,
2800                        "config": {}
2801                    }
2802                },
2803                "int": {
2804                    "int_inverted_index": {
2805                        "enabled": true,
2806                        "config": {}
2807                    }
2808                },
2809                "float_list": {
2810                    "vector_index": {
2811                        "enabled": true,
2812                        "config": {
2813                            "spann": {
2814                                "search_nprobe": 10
2815                            }
2816                        }
2817                    }
2818                }
2819            },
2820            "keys": {
2821                "#document": {
2822                    "string": {
2823                        "fts_index": {
2824                            "enabled": false,
2825                            "config": {}
2826                        }
2827                    }
2828                }
2829            }
2830        }"###;
2831
2832        let schema_from_new: InternalSchema = serde_json::from_str(new_format_json).unwrap();
2833
2834        // Both should deserialize to the same structure
2835        assert_eq!(schema_from_old, schema_from_new);
2836
2837        // Verify the deserialized content is correct
2838        assert!(schema_from_old.defaults.string.is_some());
2839        assert!(schema_from_old
2840            .defaults
2841            .string
2842            .as_ref()
2843            .unwrap()
2844            .fts_index
2845            .is_some());
2846        assert!(
2847            schema_from_old
2848                .defaults
2849                .string
2850                .as_ref()
2851                .unwrap()
2852                .fts_index
2853                .as_ref()
2854                .unwrap()
2855                .enabled
2856        );
2857
2858        assert!(schema_from_old.defaults.int.is_some());
2859        assert!(schema_from_old
2860            .defaults
2861            .int
2862            .as_ref()
2863            .unwrap()
2864            .int_inverted_index
2865            .is_some());
2866
2867        assert!(schema_from_old.defaults.float_list.is_some());
2868        assert!(schema_from_old
2869            .defaults
2870            .float_list
2871            .as_ref()
2872            .unwrap()
2873            .vector_index
2874            .is_some());
2875
2876        assert!(schema_from_old.keys.contains_key(DOCUMENT_KEY));
2877        let doc_override = schema_from_old.keys.get(DOCUMENT_KEY).unwrap();
2878        assert!(doc_override.string.is_some());
2879        assert!(
2880            !doc_override
2881                .string
2882                .as_ref()
2883                .unwrap()
2884                .fts_index
2885                .as_ref()
2886                .unwrap()
2887                .enabled
2888        );
2889
2890        // Test that serialization always outputs the new format (without prefixes)
2891        let serialized = serde_json::to_string(&schema_from_old).unwrap();
2892
2893        // Should contain new format keys
2894        assert!(serialized.contains(r#""keys":"#));
2895        assert!(serialized.contains(r#""string":"#));
2896        assert!(serialized.contains(r#""fts_index":"#));
2897        assert!(serialized.contains(r#""int_inverted_index":"#));
2898        assert!(serialized.contains(r#""vector_index":"#));
2899
2900        // Should NOT contain old format keys
2901        assert!(!serialized.contains(r#""key_overrides":"#));
2902        assert!(!serialized.contains(r###""#string":"###));
2903        assert!(!serialized.contains(r###""$fts_index":"###));
2904        assert!(!serialized.contains(r###""$int_inverted_index":"###));
2905        assert!(!serialized.contains(r###""$vector_index":"###));
2906    }
2907}