hermes_core/dsl/sdl/
mod.rs

1//! Schema Definition Language (SDL) for Hermes
2//!
3//! A simple, readable format for defining index schemas using pest parser.
4//!
5//! # Example SDL
6//!
7//! ```text
8//! # Article index schema
9//! index articles {
10//!     # Primary text field for full-text search
11//!     field title: text [indexed, stored]
12//!
13//!     # Body content - indexed but not stored (save space)
14//!     field body: text [indexed]
15//!
16//!     # Author name
17//!     field author: text [indexed, stored]
18//!
19//!     # Publication timestamp
20//!     field published_at: i64 [indexed, stored]
21//!
22//!     # View count
23//!     field views: u64 [indexed, stored]
24//!
25//!     # Rating score
26//!     field rating: f64 [indexed, stored]
27//!
28//!     # Raw content hash (not indexed, just stored)
29//!     field content_hash: bytes [stored]
30//!
31//!     # Dense vector with IVF-RaBitQ index
32//!     field embedding: dense_vector<768> [indexed<rabitq, centroids: "centroids.bin", nprobe: 32>]
33//!
34//!     # Dense vector with ScaNN index and MRL dimension
35//!     field embedding2: dense_vector<1536> [indexed<scann, centroids: "c.bin", codebook: "pq.bin", mrl_dim: 256>]
36//! }
37//! ```
38//!
39//! # Dense Vector Index Configuration
40//!
41//! Index-related parameters for dense vectors are specified in `indexed<...>`:
42//! - `rabitq` or `scann` - index type
43//! - `centroids: "path"` - path to pre-trained centroids file
44//! - `codebook: "path"` - path to PQ codebook (ScaNN only)
45//! - `nprobe: N` - number of clusters to probe (default: 32)
46//! - `mrl_dim: N` - Matryoshka dimension for index (uses truncated vectors)
47
48use pest::Parser;
49use pest_derive::Parser;
50
51use super::query_field_router::{QueryRouterRule, RoutingMode};
52use super::schema::{FieldType, Schema, SchemaBuilder};
53use crate::Result;
54use crate::error::Error;
55
56#[derive(Parser)]
57#[grammar = "dsl/sdl/sdl.pest"]
58pub struct SdlParser;
59
60use super::schema::DenseVectorConfig;
61use crate::structures::{
62    IndexSize, QueryWeighting, SparseQueryConfig, SparseVectorConfig, WeightQuantization,
63};
64
65/// Parsed field definition
66#[derive(Debug, Clone)]
67pub struct FieldDef {
68    pub name: String,
69    pub field_type: FieldType,
70    pub indexed: bool,
71    pub stored: bool,
72    /// Tokenizer name for text fields (e.g., "default", "en_stem", "german")
73    pub tokenizer: Option<String>,
74    /// Whether this field can have multiple values (serialized as array in JSON)
75    pub multi: bool,
76    /// Configuration for sparse vector fields
77    pub sparse_vector_config: Option<SparseVectorConfig>,
78    /// Configuration for dense vector fields
79    pub dense_vector_config: Option<DenseVectorConfig>,
80}
81
82/// Parsed index definition
83#[derive(Debug, Clone)]
84pub struct IndexDef {
85    pub name: String,
86    pub fields: Vec<FieldDef>,
87    pub default_fields: Vec<String>,
88    /// Query router rules for routing queries to specific fields
89    pub query_routers: Vec<QueryRouterRule>,
90}
91
92impl IndexDef {
93    /// Convert to a Schema
94    pub fn to_schema(&self) -> Schema {
95        let mut builder = SchemaBuilder::default();
96
97        for field in &self.fields {
98            let f = match field.field_type {
99                FieldType::Text => {
100                    let tokenizer = field.tokenizer.as_deref().unwrap_or("default");
101                    builder.add_text_field_with_tokenizer(
102                        &field.name,
103                        field.indexed,
104                        field.stored,
105                        tokenizer,
106                    )
107                }
108                FieldType::U64 => builder.add_u64_field(&field.name, field.indexed, field.stored),
109                FieldType::I64 => builder.add_i64_field(&field.name, field.indexed, field.stored),
110                FieldType::F64 => builder.add_f64_field(&field.name, field.indexed, field.stored),
111                FieldType::Bytes => builder.add_bytes_field(&field.name, field.stored),
112                FieldType::Json => builder.add_json_field(&field.name, field.stored),
113                FieldType::SparseVector => {
114                    if let Some(config) = &field.sparse_vector_config {
115                        builder.add_sparse_vector_field_with_config(
116                            &field.name,
117                            field.indexed,
118                            field.stored,
119                            config.clone(),
120                        )
121                    } else {
122                        builder.add_sparse_vector_field(&field.name, field.indexed, field.stored)
123                    }
124                }
125                FieldType::DenseVector => {
126                    // Dense vector dimension must be specified via config
127                    let config = field
128                        .dense_vector_config
129                        .as_ref()
130                        .expect("DenseVector field requires dimension to be specified");
131                    builder.add_dense_vector_field_with_config(
132                        &field.name,
133                        field.indexed,
134                        field.stored,
135                        config.clone(),
136                    )
137                }
138            };
139            if field.multi {
140                builder.set_multi(f, true);
141            }
142        }
143
144        // Set default fields if specified
145        if !self.default_fields.is_empty() {
146            builder.set_default_fields(self.default_fields.clone());
147        }
148
149        // Set query routers if specified
150        if !self.query_routers.is_empty() {
151            builder.set_query_routers(self.query_routers.clone());
152        }
153
154        builder.build()
155    }
156
157    /// Create a QueryFieldRouter from the query router rules
158    ///
159    /// Returns None if there are no query router rules defined.
160    /// Returns Err if any regex pattern is invalid.
161    pub fn to_query_router(&self) -> Result<Option<super::query_field_router::QueryFieldRouter>> {
162        if self.query_routers.is_empty() {
163            return Ok(None);
164        }
165
166        super::query_field_router::QueryFieldRouter::from_rules(&self.query_routers)
167            .map(Some)
168            .map_err(Error::Schema)
169    }
170}
171
172/// Parse field type from string
173fn parse_field_type(type_str: &str) -> Result<FieldType> {
174    match type_str {
175        "text" | "string" | "str" => Ok(FieldType::Text),
176        "u64" | "uint" | "unsigned" => Ok(FieldType::U64),
177        "i64" | "int" | "integer" => Ok(FieldType::I64),
178        "f64" | "float" | "double" => Ok(FieldType::F64),
179        "bytes" | "binary" | "blob" => Ok(FieldType::Bytes),
180        "json" => Ok(FieldType::Json),
181        "sparse_vector" => Ok(FieldType::SparseVector),
182        "dense_vector" | "vector" => Ok(FieldType::DenseVector),
183        _ => Err(Error::Schema(format!("Unknown field type: {}", type_str))),
184    }
185}
186
187/// Index configuration parsed from indexed<...> attribute
188#[derive(Debug, Clone, Default)]
189struct IndexConfig {
190    index_type: Option<super::schema::VectorIndexType>,
191    centroids_path: Option<String>,
192    codebook_path: Option<String>,
193    nprobe: Option<usize>,
194    mrl_dim: Option<usize>,
195    // Sparse vector index params
196    quantization: Option<WeightQuantization>,
197    weight_threshold: Option<f32>,
198    // Sparse vector query-time config
199    query_tokenizer: Option<String>,
200    query_weighting: Option<QueryWeighting>,
201}
202
203/// Parse attributes from pest pair
204/// Returns (indexed, stored, multi, index_config)
205fn parse_attributes(pair: pest::iterators::Pair<Rule>) -> (bool, bool, bool, Option<IndexConfig>) {
206    let mut indexed = false;
207    let mut stored = false;
208    let mut multi = false;
209    let mut index_config = None;
210
211    for attr in pair.into_inner() {
212        if attr.as_rule() == Rule::attribute {
213            // attribute = { indexed_with_config | "indexed" | "stored" | "multi" }
214            // Check if it contains indexed_with_config
215            let mut found_indexed_with_config = false;
216            for inner in attr.clone().into_inner() {
217                if inner.as_rule() == Rule::indexed_with_config {
218                    indexed = true;
219                    index_config = Some(parse_index_config(inner));
220                    found_indexed_with_config = true;
221                    break;
222                }
223            }
224            if !found_indexed_with_config {
225                // Simple attribute
226                match attr.as_str() {
227                    "indexed" => indexed = true,
228                    "stored" => stored = true,
229                    "multi" => multi = true,
230                    _ => {}
231                }
232            }
233        }
234    }
235
236    (indexed, stored, multi, index_config)
237}
238
239/// Parse index configuration from indexed<...> attribute
240fn parse_index_config(pair: pest::iterators::Pair<Rule>) -> IndexConfig {
241    let mut config = IndexConfig::default();
242
243    // indexed_with_config = { "indexed" ~ "<" ~ index_config_params ~ ">" }
244    // index_config_params = { index_config_param ~ ("," ~ index_config_param)* }
245    // index_config_param = { index_type_kwarg | centroids_kwarg | codebook_kwarg | nprobe_kwarg | index_type_spec }
246
247    for inner in pair.into_inner() {
248        if inner.as_rule() == Rule::index_config_params {
249            for param in inner.into_inner() {
250                if param.as_rule() == Rule::index_config_param {
251                    for p in param.into_inner() {
252                        parse_single_index_config_param(&mut config, p);
253                    }
254                }
255            }
256        }
257    }
258
259    config
260}
261
262/// Parse a single index config parameter
263fn parse_single_index_config_param(config: &mut IndexConfig, p: pest::iterators::Pair<Rule>) {
264    use super::schema::VectorIndexType;
265
266    match p.as_rule() {
267        Rule::index_type_spec => {
268            config.index_type = Some(match p.as_str() {
269                "scann" => VectorIndexType::ScaNN,
270                "rabitq" => VectorIndexType::IvfRaBitQ,
271                _ => VectorIndexType::IvfRaBitQ,
272            });
273        }
274        Rule::index_type_kwarg => {
275            // index_type_kwarg = { "index" ~ ":" ~ index_type_spec }
276            if let Some(t) = p.into_inner().next() {
277                config.index_type = Some(match t.as_str() {
278                    "scann" => VectorIndexType::ScaNN,
279                    "rabitq" => VectorIndexType::IvfRaBitQ,
280                    _ => VectorIndexType::IvfRaBitQ,
281                });
282            }
283        }
284        Rule::centroids_kwarg => {
285            // centroids_kwarg = { "centroids" ~ ":" ~ centroids_path }
286            // centroids_path = { "\"" ~ path_chars ~ "\"" }
287            if let Some(path) = p.into_inner().next()
288                && let Some(inner_path) = path.into_inner().next()
289            {
290                config.centroids_path = Some(inner_path.as_str().to_string());
291            }
292        }
293        Rule::codebook_kwarg => {
294            // codebook_kwarg = { "codebook" ~ ":" ~ codebook_path }
295            if let Some(path) = p.into_inner().next()
296                && let Some(inner_path) = path.into_inner().next()
297            {
298                config.codebook_path = Some(inner_path.as_str().to_string());
299            }
300        }
301        Rule::nprobe_kwarg => {
302            // nprobe_kwarg = { "nprobe" ~ ":" ~ nprobe_spec }
303            if let Some(n) = p.into_inner().next() {
304                config.nprobe = Some(n.as_str().parse().unwrap_or(32));
305            }
306        }
307        Rule::mrl_dim_kwarg => {
308            // mrl_dim_kwarg = { "mrl_dim" ~ ":" ~ mrl_dim_spec }
309            if let Some(n) = p.into_inner().next() {
310                config.mrl_dim = Some(n.as_str().parse().unwrap_or(0));
311            }
312        }
313        Rule::quantization_kwarg => {
314            // quantization_kwarg = { "quantization" ~ ":" ~ quantization_spec }
315            if let Some(q) = p.into_inner().next() {
316                config.quantization = Some(match q.as_str() {
317                    "float32" | "f32" => WeightQuantization::Float32,
318                    "float16" | "f16" => WeightQuantization::Float16,
319                    "uint8" | "u8" => WeightQuantization::UInt8,
320                    "uint4" | "u4" => WeightQuantization::UInt4,
321                    _ => WeightQuantization::default(),
322                });
323            }
324        }
325        Rule::weight_threshold_kwarg => {
326            // weight_threshold_kwarg = { "weight_threshold" ~ ":" ~ weight_threshold_spec }
327            if let Some(t) = p.into_inner().next() {
328                config.weight_threshold = Some(t.as_str().parse().unwrap_or(0.0));
329            }
330        }
331        Rule::query_config_block => {
332            // query_config_block = { "query" ~ "<" ~ query_config_params ~ ">" }
333            parse_query_config_block(config, p);
334        }
335        _ => {}
336    }
337}
338
339/// Parse query configuration block: query<tokenizer: "...", weighting: idf>
340fn parse_query_config_block(config: &mut IndexConfig, pair: pest::iterators::Pair<Rule>) {
341    for inner in pair.into_inner() {
342        if inner.as_rule() == Rule::query_config_params {
343            for param in inner.into_inner() {
344                if param.as_rule() == Rule::query_config_param {
345                    for p in param.into_inner() {
346                        match p.as_rule() {
347                            Rule::query_tokenizer_kwarg => {
348                                // query_tokenizer_kwarg = { "tokenizer" ~ ":" ~ tokenizer_path }
349                                if let Some(path) = p.into_inner().next()
350                                    && let Some(inner_path) = path.into_inner().next()
351                                {
352                                    config.query_tokenizer = Some(inner_path.as_str().to_string());
353                                }
354                            }
355                            Rule::query_weighting_kwarg => {
356                                // query_weighting_kwarg = { "weighting" ~ ":" ~ weighting_spec }
357                                if let Some(w) = p.into_inner().next() {
358                                    config.query_weighting = Some(match w.as_str() {
359                                        "one" => QueryWeighting::One,
360                                        "idf" => QueryWeighting::Idf,
361                                        _ => QueryWeighting::One,
362                                    });
363                                }
364                            }
365                            _ => {}
366                        }
367                    }
368                }
369            }
370        }
371    }
372}
373
374/// Parse a field definition from pest pair
375fn parse_field_def(pair: pest::iterators::Pair<Rule>) -> Result<FieldDef> {
376    let mut inner = pair.into_inner();
377
378    let name = inner
379        .next()
380        .ok_or_else(|| Error::Schema("Missing field name".to_string()))?
381        .as_str()
382        .to_string();
383
384    let field_type_str = inner
385        .next()
386        .ok_or_else(|| Error::Schema("Missing field type".to_string()))?
387        .as_str();
388
389    let field_type = parse_field_type(field_type_str)?;
390
391    // Parse optional tokenizer spec, sparse_vector_config, dense_vector_config, and attributes
392    let mut tokenizer = None;
393    let mut sparse_vector_config = None;
394    let mut dense_vector_config = None;
395    let mut indexed = true;
396    let mut stored = true;
397    let mut multi = false;
398    let mut index_config: Option<IndexConfig> = None;
399
400    for item in inner {
401        match item.as_rule() {
402            Rule::tokenizer_spec => {
403                // Extract tokenizer name from <name>
404                if let Some(tok_name) = item.into_inner().next() {
405                    tokenizer = Some(tok_name.as_str().to_string());
406                }
407            }
408            Rule::sparse_vector_config => {
409                // Parse named parameters: <index_size: u16, quantization: uint8, weight_threshold: 0.1>
410                sparse_vector_config = Some(parse_sparse_vector_config(item));
411            }
412            Rule::dense_vector_config => {
413                // Parse dense_vector_params (keyword or positional) - only dims and mrl_dim
414                dense_vector_config = Some(parse_dense_vector_config(item));
415            }
416            Rule::attributes => {
417                let (idx, sto, mul, idx_cfg) = parse_attributes(item);
418                indexed = idx;
419                stored = sto;
420                multi = mul;
421                index_config = idx_cfg;
422            }
423            _ => {}
424        }
425    }
426
427    // Merge index config into vector configs if both exist
428    if let Some(idx_cfg) = index_config {
429        if let Some(ref mut dv_config) = dense_vector_config {
430            apply_index_config_to_dense_vector(dv_config, idx_cfg);
431        } else if field_type == FieldType::SparseVector {
432            // For sparse vectors, create default config if not present and apply index params
433            let sv_config = sparse_vector_config.get_or_insert(SparseVectorConfig::default());
434            apply_index_config_to_sparse_vector(sv_config, idx_cfg);
435        }
436    }
437
438    Ok(FieldDef {
439        name,
440        field_type,
441        indexed,
442        stored,
443        tokenizer,
444        multi,
445        sparse_vector_config,
446        dense_vector_config,
447    })
448}
449
450/// Apply index configuration from indexed<...> to DenseVectorConfig
451fn apply_index_config_to_dense_vector(config: &mut DenseVectorConfig, idx_cfg: IndexConfig) {
452    use super::schema::VectorIndexType;
453
454    let nprobe = idx_cfg.nprobe.unwrap_or(32);
455
456    match idx_cfg.index_type {
457        Some(VectorIndexType::ScaNN) => {
458            config.index_type = VectorIndexType::ScaNN;
459            config.coarse_centroids_path = idx_cfg.centroids_path;
460            config.pq_codebook_path = idx_cfg.codebook_path;
461            config.nprobe = nprobe;
462        }
463        Some(VectorIndexType::IvfRaBitQ) => {
464            config.index_type = VectorIndexType::IvfRaBitQ;
465            config.coarse_centroids_path = idx_cfg.centroids_path;
466            config.nprobe = nprobe;
467        }
468        Some(VectorIndexType::RaBitQ) | None => {
469            // If centroids provided, use IVF-RaBitQ, otherwise plain RaBitQ
470            if idx_cfg.centroids_path.is_some() {
471                config.index_type = VectorIndexType::IvfRaBitQ;
472                config.coarse_centroids_path = idx_cfg.centroids_path;
473                config.nprobe = nprobe;
474            }
475            // else keep default RaBitQ
476        }
477    }
478
479    // Apply mrl_dim if specified
480    if idx_cfg.mrl_dim.is_some() {
481        config.mrl_dim = idx_cfg.mrl_dim;
482    }
483}
484
485/// Parse sparse_vector_config - only index_size (positional)
486/// Example: <u16> or <u32>
487fn parse_sparse_vector_config(pair: pest::iterators::Pair<Rule>) -> SparseVectorConfig {
488    let mut index_size = IndexSize::default();
489
490    // Parse positional index_size_spec
491    for inner in pair.into_inner() {
492        if inner.as_rule() == Rule::index_size_spec {
493            index_size = match inner.as_str() {
494                "u16" => IndexSize::U16,
495                "u32" => IndexSize::U32,
496                _ => IndexSize::default(),
497            };
498        }
499    }
500
501    SparseVectorConfig {
502        index_size,
503        weight_quantization: WeightQuantization::default(),
504        weight_threshold: 0.0,
505        posting_list_pruning: None,
506        query_config: None,
507    }
508}
509
510/// Apply index configuration from indexed<...> to SparseVectorConfig
511fn apply_index_config_to_sparse_vector(config: &mut SparseVectorConfig, idx_cfg: IndexConfig) {
512    if let Some(q) = idx_cfg.quantization {
513        config.weight_quantization = q;
514    }
515    if let Some(t) = idx_cfg.weight_threshold {
516        config.weight_threshold = t;
517    }
518    // Apply query-time configuration if present
519    if idx_cfg.query_tokenizer.is_some() || idx_cfg.query_weighting.is_some() {
520        let query_config = config
521            .query_config
522            .get_or_insert(SparseQueryConfig::default());
523        if let Some(tokenizer) = idx_cfg.query_tokenizer {
524            query_config.tokenizer = Some(tokenizer);
525        }
526        if let Some(weighting) = idx_cfg.query_weighting {
527            query_config.weighting = weighting;
528        }
529    }
530}
531
532/// Parse dense_vector_config - only dims
533/// All index-related params (including mrl_dim) are now in indexed<...> attribute
534fn parse_dense_vector_config(pair: pest::iterators::Pair<Rule>) -> DenseVectorConfig {
535    let mut dim: usize = 0;
536
537    // Navigate to dense_vector_params
538    for params in pair.into_inner() {
539        if params.as_rule() == Rule::dense_vector_params {
540            for inner in params.into_inner() {
541                match inner.as_rule() {
542                    Rule::dense_vector_keyword_params => {
543                        // Parse keyword args: dims: N
544                        for kwarg in inner.into_inner() {
545                            if kwarg.as_rule() == Rule::dims_kwarg
546                                && let Some(d) = kwarg.into_inner().next()
547                            {
548                                dim = d.as_str().parse().unwrap_or(0);
549                            }
550                        }
551                    }
552                    Rule::dense_vector_positional_params => {
553                        // Parse positional: just dimension
554                        if let Some(dim_pair) = inner.into_inner().next() {
555                            dim = dim_pair.as_str().parse().unwrap_or(0);
556                        }
557                    }
558                    _ => {}
559                }
560            }
561        }
562    }
563
564    DenseVectorConfig::new(dim)
565}
566
567/// Parse default_fields definition
568fn parse_default_fields_def(pair: pest::iterators::Pair<Rule>) -> Vec<String> {
569    pair.into_inner().map(|p| p.as_str().to_string()).collect()
570}
571
572/// Parse a query router definition
573fn parse_query_router_def(pair: pest::iterators::Pair<Rule>) -> Result<QueryRouterRule> {
574    let mut pattern = String::new();
575    let mut substitution = String::new();
576    let mut target_field = String::new();
577    let mut mode = RoutingMode::Additional;
578
579    for prop in pair.into_inner() {
580        if prop.as_rule() != Rule::query_router_prop {
581            continue;
582        }
583
584        for inner in prop.into_inner() {
585            match inner.as_rule() {
586                Rule::query_router_pattern => {
587                    if let Some(regex_str) = inner.into_inner().next() {
588                        pattern = parse_string_value(regex_str);
589                    }
590                }
591                Rule::query_router_substitution => {
592                    if let Some(quoted) = inner.into_inner().next() {
593                        substitution = parse_string_value(quoted);
594                    }
595                }
596                Rule::query_router_target => {
597                    if let Some(ident) = inner.into_inner().next() {
598                        target_field = ident.as_str().to_string();
599                    }
600                }
601                Rule::query_router_mode => {
602                    if let Some(mode_val) = inner.into_inner().next() {
603                        mode = match mode_val.as_str() {
604                            "exclusive" => RoutingMode::Exclusive,
605                            "additional" => RoutingMode::Additional,
606                            _ => RoutingMode::Additional,
607                        };
608                    }
609                }
610                _ => {}
611            }
612        }
613    }
614
615    if pattern.is_empty() {
616        return Err(Error::Schema("query_router missing 'pattern'".to_string()));
617    }
618    if substitution.is_empty() {
619        return Err(Error::Schema(
620            "query_router missing 'substitution'".to_string(),
621        ));
622    }
623    if target_field.is_empty() {
624        return Err(Error::Schema(
625            "query_router missing 'target_field'".to_string(),
626        ));
627    }
628
629    Ok(QueryRouterRule {
630        pattern,
631        substitution,
632        target_field,
633        mode,
634    })
635}
636
637/// Parse a string value from quoted_string, raw_string, or regex_string
638fn parse_string_value(pair: pest::iterators::Pair<Rule>) -> String {
639    let s = pair.as_str();
640    match pair.as_rule() {
641        Rule::regex_string => {
642            // regex_string contains either raw_string or quoted_string
643            if let Some(inner) = pair.into_inner().next() {
644                parse_string_value(inner)
645            } else {
646                s.to_string()
647            }
648        }
649        Rule::raw_string => {
650            // r"..." - strip r" prefix and " suffix
651            s[2..s.len() - 1].to_string()
652        }
653        Rule::quoted_string => {
654            // "..." - strip quotes and handle escapes
655            let inner = &s[1..s.len() - 1];
656            // Simple escape handling
657            inner
658                .replace("\\n", "\n")
659                .replace("\\t", "\t")
660                .replace("\\\"", "\"")
661                .replace("\\\\", "\\")
662        }
663        _ => s.to_string(),
664    }
665}
666
667/// Parse an index definition from pest pair
668fn parse_index_def(pair: pest::iterators::Pair<Rule>) -> Result<IndexDef> {
669    let mut inner = pair.into_inner();
670
671    let name = inner
672        .next()
673        .ok_or_else(|| Error::Schema("Missing index name".to_string()))?
674        .as_str()
675        .to_string();
676
677    let mut fields = Vec::new();
678    let mut default_fields = Vec::new();
679    let mut query_routers = Vec::new();
680
681    for item in inner {
682        match item.as_rule() {
683            Rule::field_def => {
684                fields.push(parse_field_def(item)?);
685            }
686            Rule::default_fields_def => {
687                default_fields = parse_default_fields_def(item);
688            }
689            Rule::query_router_def => {
690                query_routers.push(parse_query_router_def(item)?);
691            }
692            _ => {}
693        }
694    }
695
696    Ok(IndexDef {
697        name,
698        fields,
699        default_fields,
700        query_routers,
701    })
702}
703
704/// Parse SDL from a string
705pub fn parse_sdl(input: &str) -> Result<Vec<IndexDef>> {
706    let pairs = SdlParser::parse(Rule::file, input)
707        .map_err(|e| Error::Schema(format!("Parse error: {}", e)))?;
708
709    let mut indexes = Vec::new();
710
711    for pair in pairs {
712        if pair.as_rule() == Rule::file {
713            for inner in pair.into_inner() {
714                if inner.as_rule() == Rule::index_def {
715                    indexes.push(parse_index_def(inner)?);
716                }
717            }
718        }
719    }
720
721    Ok(indexes)
722}
723
724/// Parse SDL and return a single index definition
725pub fn parse_single_index(input: &str) -> Result<IndexDef> {
726    let indexes = parse_sdl(input)?;
727
728    if indexes.is_empty() {
729        return Err(Error::Schema("No index definition found".to_string()));
730    }
731
732    if indexes.len() > 1 {
733        return Err(Error::Schema(
734            "Multiple index definitions found, expected one".to_string(),
735        ));
736    }
737
738    Ok(indexes.into_iter().next().unwrap())
739}
740
741#[cfg(test)]
742mod tests {
743    use super::*;
744
745    #[test]
746    fn test_parse_simple_schema() {
747        let sdl = r#"
748            index articles {
749                field title: text [indexed, stored]
750                field body: text [indexed]
751            }
752        "#;
753
754        let indexes = parse_sdl(sdl).unwrap();
755        assert_eq!(indexes.len(), 1);
756
757        let index = &indexes[0];
758        assert_eq!(index.name, "articles");
759        assert_eq!(index.fields.len(), 2);
760
761        assert_eq!(index.fields[0].name, "title");
762        assert!(matches!(index.fields[0].field_type, FieldType::Text));
763        assert!(index.fields[0].indexed);
764        assert!(index.fields[0].stored);
765
766        assert_eq!(index.fields[1].name, "body");
767        assert!(matches!(index.fields[1].field_type, FieldType::Text));
768        assert!(index.fields[1].indexed);
769        assert!(!index.fields[1].stored);
770    }
771
772    #[test]
773    fn test_parse_all_field_types() {
774        let sdl = r#"
775            index test {
776                field text_field: text [indexed, stored]
777                field u64_field: u64 [indexed, stored]
778                field i64_field: i64 [indexed, stored]
779                field f64_field: f64 [indexed, stored]
780                field bytes_field: bytes [stored]
781            }
782        "#;
783
784        let indexes = parse_sdl(sdl).unwrap();
785        let index = &indexes[0];
786
787        assert!(matches!(index.fields[0].field_type, FieldType::Text));
788        assert!(matches!(index.fields[1].field_type, FieldType::U64));
789        assert!(matches!(index.fields[2].field_type, FieldType::I64));
790        assert!(matches!(index.fields[3].field_type, FieldType::F64));
791        assert!(matches!(index.fields[4].field_type, FieldType::Bytes));
792    }
793
794    #[test]
795    fn test_parse_with_comments() {
796        let sdl = r#"
797            # This is a comment
798            index articles {
799                # Title field
800                field title: text [indexed, stored]
801                field body: text [indexed] # inline comment not supported yet
802            }
803        "#;
804
805        let indexes = parse_sdl(sdl).unwrap();
806        assert_eq!(indexes[0].fields.len(), 2);
807    }
808
809    #[test]
810    fn test_parse_type_aliases() {
811        let sdl = r#"
812            index test {
813                field a: string [indexed]
814                field b: int [indexed]
815                field c: uint [indexed]
816                field d: float [indexed]
817                field e: binary [stored]
818            }
819        "#;
820
821        let indexes = parse_sdl(sdl).unwrap();
822        let index = &indexes[0];
823
824        assert!(matches!(index.fields[0].field_type, FieldType::Text));
825        assert!(matches!(index.fields[1].field_type, FieldType::I64));
826        assert!(matches!(index.fields[2].field_type, FieldType::U64));
827        assert!(matches!(index.fields[3].field_type, FieldType::F64));
828        assert!(matches!(index.fields[4].field_type, FieldType::Bytes));
829    }
830
831    #[test]
832    fn test_to_schema() {
833        let sdl = r#"
834            index articles {
835                field title: text [indexed, stored]
836                field views: u64 [indexed, stored]
837            }
838        "#;
839
840        let indexes = parse_sdl(sdl).unwrap();
841        let schema = indexes[0].to_schema();
842
843        assert!(schema.get_field("title").is_some());
844        assert!(schema.get_field("views").is_some());
845        assert!(schema.get_field("nonexistent").is_none());
846    }
847
848    #[test]
849    fn test_default_attributes() {
850        let sdl = r#"
851            index test {
852                field title: text
853            }
854        "#;
855
856        let indexes = parse_sdl(sdl).unwrap();
857        let field = &indexes[0].fields[0];
858
859        // Default should be indexed and stored
860        assert!(field.indexed);
861        assert!(field.stored);
862    }
863
864    #[test]
865    fn test_multiple_indexes() {
866        let sdl = r#"
867            index articles {
868                field title: text [indexed, stored]
869            }
870
871            index users {
872                field name: text [indexed, stored]
873                field email: text [indexed, stored]
874            }
875        "#;
876
877        let indexes = parse_sdl(sdl).unwrap();
878        assert_eq!(indexes.len(), 2);
879        assert_eq!(indexes[0].name, "articles");
880        assert_eq!(indexes[1].name, "users");
881    }
882
883    #[test]
884    fn test_tokenizer_spec() {
885        let sdl = r#"
886            index articles {
887                field title: text<en_stem> [indexed, stored]
888                field body: text<default> [indexed]
889                field author: text [indexed, stored]
890            }
891        "#;
892
893        let indexes = parse_sdl(sdl).unwrap();
894        let index = &indexes[0];
895
896        assert_eq!(index.fields[0].name, "title");
897        assert_eq!(index.fields[0].tokenizer, Some("en_stem".to_string()));
898
899        assert_eq!(index.fields[1].name, "body");
900        assert_eq!(index.fields[1].tokenizer, Some("default".to_string()));
901
902        assert_eq!(index.fields[2].name, "author");
903        assert_eq!(index.fields[2].tokenizer, None); // No tokenizer specified
904    }
905
906    #[test]
907    fn test_tokenizer_in_schema() {
908        let sdl = r#"
909            index articles {
910                field title: text<german> [indexed, stored]
911                field body: text<en_stem> [indexed]
912            }
913        "#;
914
915        let indexes = parse_sdl(sdl).unwrap();
916        let schema = indexes[0].to_schema();
917
918        let title_field = schema.get_field("title").unwrap();
919        let title_entry = schema.get_field_entry(title_field).unwrap();
920        assert_eq!(title_entry.tokenizer, Some("german".to_string()));
921
922        let body_field = schema.get_field("body").unwrap();
923        let body_entry = schema.get_field_entry(body_field).unwrap();
924        assert_eq!(body_entry.tokenizer, Some("en_stem".to_string()));
925    }
926
927    #[test]
928    fn test_query_router_basic() {
929        let sdl = r#"
930            index documents {
931                field title: text [indexed, stored]
932                field uri: text [indexed, stored]
933
934                query_router {
935                    pattern: "10\\.\\d{4,}/[^\\s]+"
936                    substitution: "doi://{0}"
937                    target_field: uris
938                    mode: exclusive
939                }
940            }
941        "#;
942
943        let indexes = parse_sdl(sdl).unwrap();
944        let index = &indexes[0];
945
946        assert_eq!(index.query_routers.len(), 1);
947        let router = &index.query_routers[0];
948        assert_eq!(router.pattern, r"10\.\d{4,}/[^\s]+");
949        assert_eq!(router.substitution, "doi://{0}");
950        assert_eq!(router.target_field, "uris");
951        assert_eq!(router.mode, RoutingMode::Exclusive);
952    }
953
954    #[test]
955    fn test_query_router_raw_string() {
956        let sdl = r#"
957            index documents {
958                field uris: text [indexed, stored]
959
960                query_router {
961                    pattern: r"^pmid:(\d+)$"
962                    substitution: "pubmed://{1}"
963                    target_field: uris
964                    mode: additional
965                }
966            }
967        "#;
968
969        let indexes = parse_sdl(sdl).unwrap();
970        let router = &indexes[0].query_routers[0];
971
972        assert_eq!(router.pattern, r"^pmid:(\d+)$");
973        assert_eq!(router.substitution, "pubmed://{1}");
974        assert_eq!(router.mode, RoutingMode::Additional);
975    }
976
977    #[test]
978    fn test_multiple_query_routers() {
979        let sdl = r#"
980            index documents {
981                field uris: text [indexed, stored]
982
983                query_router {
984                    pattern: r"^doi:(10\.\d{4,}/[^\s]+)$"
985                    substitution: "doi://{1}"
986                    target_field: uris
987                    mode: exclusive
988                }
989
990                query_router {
991                    pattern: r"^pmid:(\d+)$"
992                    substitution: "pubmed://{1}"
993                    target_field: uris
994                    mode: exclusive
995                }
996
997                query_router {
998                    pattern: r"^arxiv:(\d+\.\d+)$"
999                    substitution: "arxiv://{1}"
1000                    target_field: uris
1001                    mode: additional
1002                }
1003            }
1004        "#;
1005
1006        let indexes = parse_sdl(sdl).unwrap();
1007        assert_eq!(indexes[0].query_routers.len(), 3);
1008    }
1009
1010    #[test]
1011    fn test_query_router_default_mode() {
1012        let sdl = r#"
1013            index documents {
1014                field uris: text [indexed, stored]
1015
1016                query_router {
1017                    pattern: r"test"
1018                    substitution: "{0}"
1019                    target_field: uris
1020                }
1021            }
1022        "#;
1023
1024        let indexes = parse_sdl(sdl).unwrap();
1025        // Default mode should be Additional
1026        assert_eq!(indexes[0].query_routers[0].mode, RoutingMode::Additional);
1027    }
1028
1029    #[test]
1030    fn test_multi_attribute() {
1031        let sdl = r#"
1032            index documents {
1033                field uris: text [indexed, stored, multi]
1034                field title: text [indexed, stored]
1035            }
1036        "#;
1037
1038        let indexes = parse_sdl(sdl).unwrap();
1039        assert_eq!(indexes.len(), 1);
1040
1041        let fields = &indexes[0].fields;
1042        assert_eq!(fields.len(), 2);
1043
1044        // uris should have multi=true
1045        assert_eq!(fields[0].name, "uris");
1046        assert!(fields[0].multi, "uris field should have multi=true");
1047
1048        // title should have multi=false
1049        assert_eq!(fields[1].name, "title");
1050        assert!(!fields[1].multi, "title field should have multi=false");
1051
1052        // Verify schema conversion preserves multi attribute
1053        let schema = indexes[0].to_schema();
1054        let uris_field = schema.get_field("uris").unwrap();
1055        let title_field = schema.get_field("title").unwrap();
1056
1057        assert!(schema.get_field_entry(uris_field).unwrap().multi);
1058        assert!(!schema.get_field_entry(title_field).unwrap().multi);
1059    }
1060
1061    #[test]
1062    fn test_sparse_vector_field() {
1063        let sdl = r#"
1064            index documents {
1065                field embedding: sparse_vector [indexed, stored]
1066            }
1067        "#;
1068
1069        let indexes = parse_sdl(sdl).unwrap();
1070        assert_eq!(indexes.len(), 1);
1071        assert_eq!(indexes[0].fields.len(), 1);
1072        assert_eq!(indexes[0].fields[0].name, "embedding");
1073        assert_eq!(indexes[0].fields[0].field_type, FieldType::SparseVector);
1074        assert!(indexes[0].fields[0].sparse_vector_config.is_none());
1075    }
1076
1077    #[test]
1078    fn test_sparse_vector_with_config() {
1079        let sdl = r#"
1080            index documents {
1081                field embedding: sparse_vector<u16> [indexed<quantization: uint8>, stored]
1082                field dense: sparse_vector<u32> [indexed<quantization: float32>]
1083            }
1084        "#;
1085
1086        let indexes = parse_sdl(sdl).unwrap();
1087        assert_eq!(indexes[0].fields.len(), 2);
1088
1089        // First field: u16 indices, uint8 quantization
1090        let f1 = &indexes[0].fields[0];
1091        assert_eq!(f1.name, "embedding");
1092        let config1 = f1.sparse_vector_config.as_ref().unwrap();
1093        assert_eq!(config1.index_size, IndexSize::U16);
1094        assert_eq!(config1.weight_quantization, WeightQuantization::UInt8);
1095
1096        // Second field: u32 indices, float32 quantization
1097        let f2 = &indexes[0].fields[1];
1098        assert_eq!(f2.name, "dense");
1099        let config2 = f2.sparse_vector_config.as_ref().unwrap();
1100        assert_eq!(config2.index_size, IndexSize::U32);
1101        assert_eq!(config2.weight_quantization, WeightQuantization::Float32);
1102    }
1103
1104    #[test]
1105    fn test_sparse_vector_with_weight_threshold() {
1106        let sdl = r#"
1107            index documents {
1108                field embedding: sparse_vector<u16> [indexed<quantization: uint8, weight_threshold: 0.1>, stored]
1109                field embedding2: sparse_vector<u32> [indexed<quantization: float16, weight_threshold: 0.05>]
1110            }
1111        "#;
1112
1113        let indexes = parse_sdl(sdl).unwrap();
1114        assert_eq!(indexes[0].fields.len(), 2);
1115
1116        // First field: u16 indices, uint8 quantization, threshold 0.1
1117        let f1 = &indexes[0].fields[0];
1118        assert_eq!(f1.name, "embedding");
1119        let config1 = f1.sparse_vector_config.as_ref().unwrap();
1120        assert_eq!(config1.index_size, IndexSize::U16);
1121        assert_eq!(config1.weight_quantization, WeightQuantization::UInt8);
1122        assert!((config1.weight_threshold - 0.1).abs() < 0.001);
1123
1124        // Second field: u32 indices, float16 quantization, threshold 0.05
1125        let f2 = &indexes[0].fields[1];
1126        assert_eq!(f2.name, "embedding2");
1127        let config2 = f2.sparse_vector_config.as_ref().unwrap();
1128        assert_eq!(config2.index_size, IndexSize::U32);
1129        assert_eq!(config2.weight_quantization, WeightQuantization::Float16);
1130        assert!((config2.weight_threshold - 0.05).abs() < 0.001);
1131    }
1132
1133    #[test]
1134    fn test_dense_vector_field() {
1135        let sdl = r#"
1136            index documents {
1137                field embedding: dense_vector<768> [indexed, stored]
1138            }
1139        "#;
1140
1141        let indexes = parse_sdl(sdl).unwrap();
1142        assert_eq!(indexes.len(), 1);
1143        assert_eq!(indexes[0].fields.len(), 1);
1144
1145        let f = &indexes[0].fields[0];
1146        assert_eq!(f.name, "embedding");
1147        assert_eq!(f.field_type, FieldType::DenseVector);
1148
1149        let config = f.dense_vector_config.as_ref().unwrap();
1150        assert_eq!(config.dim, 768);
1151    }
1152
1153    #[test]
1154    fn test_dense_vector_alias() {
1155        let sdl = r#"
1156            index documents {
1157                field embedding: vector<1536> [indexed]
1158            }
1159        "#;
1160
1161        let indexes = parse_sdl(sdl).unwrap();
1162        assert_eq!(indexes[0].fields[0].field_type, FieldType::DenseVector);
1163        assert_eq!(
1164            indexes[0].fields[0]
1165                .dense_vector_config
1166                .as_ref()
1167                .unwrap()
1168                .dim,
1169            1536
1170        );
1171    }
1172
1173    #[test]
1174    fn test_dense_vector_with_centroids() {
1175        let sdl = r#"
1176            index documents {
1177                field embedding: dense_vector<768> [indexed<centroids: "centroids.bin">, stored]
1178            }
1179        "#;
1180
1181        let indexes = parse_sdl(sdl).unwrap();
1182        assert_eq!(indexes.len(), 1);
1183
1184        let f = &indexes[0].fields[0];
1185        assert_eq!(f.name, "embedding");
1186        assert_eq!(f.field_type, FieldType::DenseVector);
1187
1188        let config = f.dense_vector_config.as_ref().unwrap();
1189        assert_eq!(config.dim, 768);
1190        assert_eq!(
1191            config.coarse_centroids_path.as_deref(),
1192            Some("centroids.bin")
1193        );
1194        assert_eq!(config.nprobe, 32); // default
1195    }
1196
1197    #[test]
1198    fn test_dense_vector_with_centroids_and_nprobe() {
1199        let sdl = r#"
1200            index documents {
1201                field embedding: dense_vector<1536> [indexed<centroids: "/path/to/centroids.bin", nprobe: 64>]
1202            }
1203        "#;
1204
1205        let indexes = parse_sdl(sdl).unwrap();
1206        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1207
1208        assert_eq!(config.dim, 1536);
1209        assert_eq!(
1210            config.coarse_centroids_path.as_deref(),
1211            Some("/path/to/centroids.bin")
1212        );
1213        assert_eq!(config.nprobe, 64);
1214    }
1215
1216    #[test]
1217    fn test_dense_vector_keyword_syntax() {
1218        let sdl = r#"
1219            index documents {
1220                field embedding: dense_vector<dims: 1536> [indexed, stored]
1221            }
1222        "#;
1223
1224        let indexes = parse_sdl(sdl).unwrap();
1225        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1226
1227        assert_eq!(config.dim, 1536);
1228        assert!(config.coarse_centroids_path.is_none());
1229    }
1230
1231    #[test]
1232    fn test_dense_vector_keyword_syntax_full() {
1233        let sdl = r#"
1234            index documents {
1235                field embedding: dense_vector<dims: 1536> [indexed<centroids: "/path/to/centroids.bin", nprobe: 64>]
1236            }
1237        "#;
1238
1239        let indexes = parse_sdl(sdl).unwrap();
1240        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1241
1242        assert_eq!(config.dim, 1536);
1243        assert_eq!(
1244            config.coarse_centroids_path.as_deref(),
1245            Some("/path/to/centroids.bin")
1246        );
1247        assert_eq!(config.nprobe, 64);
1248    }
1249
1250    #[test]
1251    fn test_dense_vector_keyword_syntax_partial() {
1252        let sdl = r#"
1253            index documents {
1254                field embedding: dense_vector<dims: 768> [indexed<centroids: "centroids.bin">]
1255            }
1256        "#;
1257
1258        let indexes = parse_sdl(sdl).unwrap();
1259        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1260
1261        assert_eq!(config.dim, 768);
1262        assert_eq!(
1263            config.coarse_centroids_path.as_deref(),
1264            Some("centroids.bin")
1265        );
1266        assert_eq!(config.nprobe, 32); // default
1267    }
1268
1269    #[test]
1270    fn test_dense_vector_scann_index() {
1271        use crate::dsl::schema::VectorIndexType;
1272
1273        let sdl = r#"
1274            index documents {
1275                field embedding: dense_vector<dims: 768> [indexed<scann, centroids: "centroids.bin", codebook: "pq_codebook.bin", nprobe: 64>]
1276            }
1277        "#;
1278
1279        let indexes = parse_sdl(sdl).unwrap();
1280        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1281
1282        assert_eq!(config.dim, 768);
1283        assert_eq!(config.index_type, VectorIndexType::ScaNN);
1284        assert_eq!(
1285            config.coarse_centroids_path.as_deref(),
1286            Some("centroids.bin")
1287        );
1288        assert_eq!(config.pq_codebook_path.as_deref(), Some("pq_codebook.bin"));
1289        assert_eq!(config.nprobe, 64);
1290    }
1291
1292    #[test]
1293    fn test_dense_vector_rabitq_index() {
1294        use crate::dsl::schema::VectorIndexType;
1295
1296        let sdl = r#"
1297            index documents {
1298                field embedding: dense_vector<dims: 1536> [indexed<rabitq, centroids: "centroids.bin">]
1299            }
1300        "#;
1301
1302        let indexes = parse_sdl(sdl).unwrap();
1303        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1304
1305        assert_eq!(config.dim, 1536);
1306        assert_eq!(config.index_type, VectorIndexType::IvfRaBitQ);
1307        assert_eq!(
1308            config.coarse_centroids_path.as_deref(),
1309            Some("centroids.bin")
1310        );
1311        assert!(config.pq_codebook_path.is_none());
1312    }
1313
1314    #[test]
1315    fn test_dense_vector_rabitq_no_centroids() {
1316        use crate::dsl::schema::VectorIndexType;
1317
1318        let sdl = r#"
1319            index documents {
1320                field embedding: dense_vector<dims: 768> [indexed<rabitq>]
1321            }
1322        "#;
1323
1324        let indexes = parse_sdl(sdl).unwrap();
1325        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1326
1327        assert_eq!(config.dim, 768);
1328        assert_eq!(config.index_type, VectorIndexType::IvfRaBitQ);
1329        assert!(config.coarse_centroids_path.is_none());
1330    }
1331
1332    #[test]
1333    fn test_dense_vector_default_index_type() {
1334        use crate::dsl::schema::VectorIndexType;
1335
1336        // When no index type specified, should default to RaBitQ (basic)
1337        let sdl = r#"
1338            index documents {
1339                field embedding: dense_vector<dims: 768> [indexed]
1340            }
1341        "#;
1342
1343        let indexes = parse_sdl(sdl).unwrap();
1344        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1345
1346        assert_eq!(config.dim, 768);
1347        assert_eq!(config.index_type, VectorIndexType::RaBitQ);
1348    }
1349
1350    #[test]
1351    fn test_dense_vector_mrl_dim() {
1352        // Test matryoshka/MRL dimension trimming (new syntax: mrl_dim in indexed<...>)
1353        let sdl = r#"
1354            index documents {
1355                field embedding: dense_vector<1536> [indexed<mrl_dim: 256>]
1356            }
1357        "#;
1358
1359        let indexes = parse_sdl(sdl).unwrap();
1360        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1361
1362        assert_eq!(config.dim, 1536);
1363        assert_eq!(config.mrl_dim, Some(256));
1364        assert_eq!(config.index_dim(), 256);
1365    }
1366
1367    #[test]
1368    fn test_dense_vector_mrl_dim_with_centroids() {
1369        // Test mrl_dim combined with other index options
1370        let sdl = r#"
1371            index documents {
1372                field embedding: dense_vector<768> [indexed<centroids: "centroids.bin", nprobe: 64, mrl_dim: 128>]
1373            }
1374        "#;
1375
1376        let indexes = parse_sdl(sdl).unwrap();
1377        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1378
1379        assert_eq!(config.dim, 768);
1380        assert_eq!(config.mrl_dim, Some(128));
1381        assert_eq!(config.index_dim(), 128);
1382        assert_eq!(
1383            config.coarse_centroids_path.as_deref(),
1384            Some("centroids.bin")
1385        );
1386        assert_eq!(config.nprobe, 64);
1387    }
1388
1389    #[test]
1390    fn test_dense_vector_no_mrl_dim() {
1391        // Test that index_dim() returns full dim when mrl_dim is not set
1392        let sdl = r#"
1393            index documents {
1394                field embedding: dense_vector<dims: 768> [indexed]
1395            }
1396        "#;
1397
1398        let indexes = parse_sdl(sdl).unwrap();
1399        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1400
1401        assert_eq!(config.dim, 768);
1402        assert_eq!(config.mrl_dim, None);
1403        assert_eq!(config.index_dim(), 768);
1404    }
1405
1406    #[test]
1407    fn test_json_field_type() {
1408        let sdl = r#"
1409            index documents {
1410                field title: text [indexed, stored]
1411                field metadata: json [stored]
1412                field extra: json
1413            }
1414        "#;
1415
1416        let indexes = parse_sdl(sdl).unwrap();
1417        let index = &indexes[0];
1418
1419        assert_eq!(index.fields.len(), 3);
1420
1421        // Check JSON field
1422        assert_eq!(index.fields[1].name, "metadata");
1423        assert!(matches!(index.fields[1].field_type, FieldType::Json));
1424        assert!(index.fields[1].stored);
1425        // JSON fields should not be indexed (enforced by add_json_field)
1426
1427        // Check default attributes for JSON field
1428        assert_eq!(index.fields[2].name, "extra");
1429        assert!(matches!(index.fields[2].field_type, FieldType::Json));
1430
1431        // Verify schema conversion
1432        let schema = index.to_schema();
1433        let metadata_field = schema.get_field("metadata").unwrap();
1434        let entry = schema.get_field_entry(metadata_field).unwrap();
1435        assert_eq!(entry.field_type, FieldType::Json);
1436        assert!(!entry.indexed); // JSON fields are never indexed
1437        assert!(entry.stored);
1438    }
1439
1440    #[test]
1441    fn test_sparse_vector_query_config() {
1442        use crate::structures::QueryWeighting;
1443
1444        let sdl = r#"
1445            index documents {
1446                field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<tokenizer: "Alibaba-NLP/gte-Qwen2-1.5B-instruct", weighting: idf>>]
1447            }
1448        "#;
1449
1450        let indexes = parse_sdl(sdl).unwrap();
1451        let index = &indexes[0];
1452
1453        assert_eq!(index.fields.len(), 1);
1454        assert_eq!(index.fields[0].name, "embedding");
1455        assert!(matches!(
1456            index.fields[0].field_type,
1457            FieldType::SparseVector
1458        ));
1459
1460        let config = index.fields[0].sparse_vector_config.as_ref().unwrap();
1461        assert_eq!(config.index_size, IndexSize::U16);
1462        assert_eq!(config.weight_quantization, WeightQuantization::UInt8);
1463
1464        // Check query config
1465        let query_config = config.query_config.as_ref().unwrap();
1466        assert_eq!(
1467            query_config.tokenizer.as_deref(),
1468            Some("Alibaba-NLP/gte-Qwen2-1.5B-instruct")
1469        );
1470        assert_eq!(query_config.weighting, QueryWeighting::Idf);
1471
1472        // Verify schema conversion preserves query config
1473        let schema = index.to_schema();
1474        let embedding_field = schema.get_field("embedding").unwrap();
1475        let entry = schema.get_field_entry(embedding_field).unwrap();
1476        let sv_config = entry.sparse_vector_config.as_ref().unwrap();
1477        let qc = sv_config.query_config.as_ref().unwrap();
1478        assert_eq!(
1479            qc.tokenizer.as_deref(),
1480            Some("Alibaba-NLP/gte-Qwen2-1.5B-instruct")
1481        );
1482        assert_eq!(qc.weighting, QueryWeighting::Idf);
1483    }
1484
1485    #[test]
1486    fn test_sparse_vector_query_config_weighting_one() {
1487        use crate::structures::QueryWeighting;
1488
1489        let sdl = r#"
1490            index documents {
1491                field embedding: sparse_vector [indexed<query<weighting: one>>]
1492            }
1493        "#;
1494
1495        let indexes = parse_sdl(sdl).unwrap();
1496        let config = indexes[0].fields[0].sparse_vector_config.as_ref().unwrap();
1497
1498        let query_config = config.query_config.as_ref().unwrap();
1499        assert!(query_config.tokenizer.is_none());
1500        assert_eq!(query_config.weighting, QueryWeighting::One);
1501    }
1502}