Skip to main content

hermes_core/dsl/sdl/
mod.rs

1//! Schema Definition Language (SDL) for Hermes
2//!
3//! A simple, readable format for defining index schemas using pest parser.
4//!
5//! # Example SDL
6//!
7//! ```text
8//! # Article index schema
9//! index articles {
10//!     # Primary text field for full-text search
11//!     field title: text [indexed, stored]
12//!
13//!     # Body content - indexed but not stored (save space)
14//!     field body: text [indexed]
15//!
16//!     # Author name
17//!     field author: text [indexed, stored]
18//!
19//!     # Publication timestamp
20//!     field published_at: i64 [indexed, stored]
21//!
22//!     # View count
23//!     field views: u64 [indexed, stored]
24//!
25//!     # Rating score
26//!     field rating: f64 [indexed, stored]
27//!
28//!     # Raw content hash (not indexed, just stored)
29//!     field content_hash: bytes [stored]
30//!
31//!     # Dense vector with IVF-RaBitQ index
32//!     field embedding: dense_vector<768> [indexed<rabitq, centroids: "centroids.bin", nprobe: 32>]
33//!
34//!     # Dense vector with ScaNN index and MRL dimension
35//!     field embedding2: dense_vector<1536> [indexed<scann, centroids: "c.bin", codebook: "pq.bin", mrl_dim: 256>]
36//! }
37//! ```
38//!
39//! # Dense Vector Index Configuration
40//!
//! Index-related parameters for dense vectors are specified in `indexed<...>`:
//! - `flat`, `rabitq`, `ivf_rabitq`, or `scann` - index type
//! - `centroids: "path"` - path to pre-trained centroids file
//! - `codebook: "path"` - path to PQ codebook (ScaNN only)
//! - `num_clusters: N` - number of clusters for IVF-based indexes
//! - `nprobe: N` - number of clusters to probe (default: 32)
//! - `mrl_dim: N` - Matryoshka dimension for index (uses truncated vectors)
//! - `build_threshold: N` - build threshold for the index
47
48use pest::Parser;
49use pest_derive::Parser;
50
51use super::query_field_router::{QueryRouterRule, RoutingMode};
52use super::schema::{FieldType, Schema, SchemaBuilder};
53use crate::Result;
54use crate::error::Error;
55
56#[derive(Parser)]
57#[grammar = "dsl/sdl/sdl.pest"]
58pub struct SdlParser;
59
60use super::schema::DenseVectorConfig;
61use crate::structures::{
62    IndexSize, QueryWeighting, SparseQueryConfig, SparseVectorConfig, WeightQuantization,
63};
64
65/// Parsed field definition
66#[derive(Debug, Clone)]
67pub struct FieldDef {
68    pub name: String,
69    pub field_type: FieldType,
70    pub indexed: bool,
71    pub stored: bool,
72    /// Tokenizer name for text fields (e.g., "default", "en_stem", "german")
73    pub tokenizer: Option<String>,
74    /// Whether this field can have multiple values (serialized as array in JSON)
75    pub multi: bool,
76    /// Position tracking mode for phrase queries and multi-field element tracking
77    pub positions: Option<super::schema::PositionMode>,
78    /// Configuration for sparse vector fields
79    pub sparse_vector_config: Option<SparseVectorConfig>,
80    /// Configuration for dense vector fields
81    pub dense_vector_config: Option<DenseVectorConfig>,
82}
83
84/// Parsed index definition
85#[derive(Debug, Clone)]
86pub struct IndexDef {
87    pub name: String,
88    pub fields: Vec<FieldDef>,
89    pub default_fields: Vec<String>,
90    /// Query router rules for routing queries to specific fields
91    pub query_routers: Vec<QueryRouterRule>,
92}
93
94impl IndexDef {
95    /// Convert to a Schema
96    pub fn to_schema(&self) -> Schema {
97        let mut builder = SchemaBuilder::default();
98
99        for field in &self.fields {
100            let f = match field.field_type {
101                FieldType::Text => {
102                    let tokenizer = field.tokenizer.as_deref().unwrap_or("default");
103                    builder.add_text_field_with_tokenizer(
104                        &field.name,
105                        field.indexed,
106                        field.stored,
107                        tokenizer,
108                    )
109                }
110                FieldType::U64 => builder.add_u64_field(&field.name, field.indexed, field.stored),
111                FieldType::I64 => builder.add_i64_field(&field.name, field.indexed, field.stored),
112                FieldType::F64 => builder.add_f64_field(&field.name, field.indexed, field.stored),
113                FieldType::Bytes => builder.add_bytes_field(&field.name, field.stored),
114                FieldType::Json => builder.add_json_field(&field.name, field.stored),
115                FieldType::SparseVector => {
116                    if let Some(config) = &field.sparse_vector_config {
117                        builder.add_sparse_vector_field_with_config(
118                            &field.name,
119                            field.indexed,
120                            field.stored,
121                            config.clone(),
122                        )
123                    } else {
124                        builder.add_sparse_vector_field(&field.name, field.indexed, field.stored)
125                    }
126                }
127                FieldType::DenseVector => {
128                    // Dense vector dimension must be specified via config
129                    let config = field
130                        .dense_vector_config
131                        .as_ref()
132                        .expect("DenseVector field requires dimension to be specified");
133                    builder.add_dense_vector_field_with_config(
134                        &field.name,
135                        field.indexed,
136                        field.stored,
137                        config.clone(),
138                    )
139                }
140            };
141            if field.multi {
142                builder.set_multi(f, true);
143            }
144            // Set positions: explicit > auto (ordinal for multi vectors)
145            let positions = field.positions.or({
146                // Auto-set ordinal positions for multi-valued vector fields
147                if field.multi
148                    && matches!(
149                        field.field_type,
150                        FieldType::SparseVector | FieldType::DenseVector
151                    )
152                {
153                    Some(super::schema::PositionMode::Ordinal)
154                } else {
155                    None
156                }
157            });
158            if let Some(mode) = positions {
159                builder.set_positions(f, mode);
160            }
161        }
162
163        // Set default fields if specified
164        if !self.default_fields.is_empty() {
165            builder.set_default_fields(self.default_fields.clone());
166        }
167
168        // Set query routers if specified
169        if !self.query_routers.is_empty() {
170            builder.set_query_routers(self.query_routers.clone());
171        }
172
173        builder.build()
174    }
175
176    /// Create a QueryFieldRouter from the query router rules
177    ///
178    /// Returns None if there are no query router rules defined.
179    /// Returns Err if any regex pattern is invalid.
180    pub fn to_query_router(&self) -> Result<Option<super::query_field_router::QueryFieldRouter>> {
181        if self.query_routers.is_empty() {
182            return Ok(None);
183        }
184
185        super::query_field_router::QueryFieldRouter::from_rules(&self.query_routers)
186            .map(Some)
187            .map_err(Error::Schema)
188    }
189}
190
191/// Parse field type from string
192fn parse_field_type(type_str: &str) -> Result<FieldType> {
193    match type_str {
194        "text" | "string" | "str" => Ok(FieldType::Text),
195        "u64" | "uint" | "unsigned" => Ok(FieldType::U64),
196        "i64" | "int" | "integer" => Ok(FieldType::I64),
197        "f64" | "float" | "double" => Ok(FieldType::F64),
198        "bytes" | "binary" | "blob" => Ok(FieldType::Bytes),
199        "json" => Ok(FieldType::Json),
200        "sparse_vector" => Ok(FieldType::SparseVector),
201        "dense_vector" | "vector" => Ok(FieldType::DenseVector),
202        _ => Err(Error::Schema(format!("Unknown field type: {}", type_str))),
203    }
204}
205
206/// Index configuration parsed from indexed<...> attribute
207#[derive(Debug, Clone, Default)]
208struct IndexConfig {
209    index_type: Option<super::schema::VectorIndexType>,
210    num_clusters: Option<usize>,
211    nprobe: Option<usize>,
212    mrl_dim: Option<usize>,
213    build_threshold: Option<usize>,
214    // Sparse vector index params
215    quantization: Option<WeightQuantization>,
216    weight_threshold: Option<f32>,
217    block_size: Option<usize>,
218    posting_list_pruning: Option<f32>,
219    // Sparse vector query-time config
220    query_tokenizer: Option<String>,
221    query_weighting: Option<QueryWeighting>,
222    // Position tracking mode for phrase queries
223    positions: Option<super::schema::PositionMode>,
224}
225
226/// Parse attributes from pest pair
227/// Returns (indexed, stored, multi, index_config)
228/// positions is now inside index_config (via indexed<positions> or indexed<ordinal> etc.)
229/// multi is now inside stored<multi>
230fn parse_attributes(pair: pest::iterators::Pair<Rule>) -> (bool, bool, bool, Option<IndexConfig>) {
231    let mut indexed = false;
232    let mut stored = false;
233    let mut multi = false;
234    let mut index_config = None;
235
236    for attr in pair.into_inner() {
237        if attr.as_rule() == Rule::attribute {
238            // attribute = { indexed_with_config | "indexed" | stored_with_config | "stored" }
239            let mut found_config = false;
240            for inner in attr.clone().into_inner() {
241                match inner.as_rule() {
242                    Rule::indexed_with_config => {
243                        indexed = true;
244                        index_config = Some(parse_index_config(inner));
245                        found_config = true;
246                        break;
247                    }
248                    Rule::stored_with_config => {
249                        stored = true;
250                        multi = true; // stored<multi>
251                        found_config = true;
252                        break;
253                    }
254                    _ => {}
255                }
256            }
257            if !found_config {
258                // Simple attribute
259                match attr.as_str() {
260                    "indexed" => indexed = true,
261                    "stored" => stored = true,
262                    _ => {}
263                }
264            }
265        }
266    }
267
268    (indexed, stored, multi, index_config)
269}
270
271/// Parse index configuration from indexed<...> attribute
272fn parse_index_config(pair: pest::iterators::Pair<Rule>) -> IndexConfig {
273    let mut config = IndexConfig::default();
274
275    // indexed_with_config = { "indexed" ~ "<" ~ index_config_params ~ ">" }
276    // index_config_params = { index_config_param ~ ("," ~ index_config_param)* }
277    // index_config_param = { index_type_kwarg | centroids_kwarg | codebook_kwarg | nprobe_kwarg | index_type_spec }
278
279    for inner in pair.into_inner() {
280        if inner.as_rule() == Rule::index_config_params {
281            for param in inner.into_inner() {
282                if param.as_rule() == Rule::index_config_param {
283                    for p in param.into_inner() {
284                        parse_single_index_config_param(&mut config, p);
285                    }
286                }
287            }
288        }
289    }
290
291    config
292}
293
294/// Parse a single index config parameter
295fn parse_single_index_config_param(config: &mut IndexConfig, p: pest::iterators::Pair<Rule>) {
296    use super::schema::VectorIndexType;
297
298    match p.as_rule() {
299        Rule::index_type_spec => {
300            config.index_type = Some(match p.as_str() {
301                "flat" => VectorIndexType::Flat,
302                "rabitq" => VectorIndexType::RaBitQ,
303                "ivf_rabitq" => VectorIndexType::IvfRaBitQ,
304                "scann" => VectorIndexType::ScaNN,
305                _ => VectorIndexType::RaBitQ,
306            });
307        }
308        Rule::index_type_kwarg => {
309            // index_type_kwarg = { "index" ~ ":" ~ index_type_spec }
310            if let Some(t) = p.into_inner().next() {
311                config.index_type = Some(match t.as_str() {
312                    "flat" => VectorIndexType::Flat,
313                    "rabitq" => VectorIndexType::RaBitQ,
314                    "ivf_rabitq" => VectorIndexType::IvfRaBitQ,
315                    "scann" => VectorIndexType::ScaNN,
316                    _ => VectorIndexType::RaBitQ,
317                });
318            }
319        }
320        Rule::num_clusters_kwarg => {
321            // num_clusters_kwarg = { "num_clusters" ~ ":" ~ num_clusters_spec }
322            if let Some(n) = p.into_inner().next() {
323                config.num_clusters = Some(n.as_str().parse().unwrap_or(256));
324            }
325        }
326        Rule::build_threshold_kwarg => {
327            // build_threshold_kwarg = { "build_threshold" ~ ":" ~ build_threshold_spec }
328            if let Some(n) = p.into_inner().next() {
329                config.build_threshold = Some(n.as_str().parse().unwrap_or(10000));
330            }
331        }
332        Rule::nprobe_kwarg => {
333            // nprobe_kwarg = { "nprobe" ~ ":" ~ nprobe_spec }
334            if let Some(n) = p.into_inner().next() {
335                config.nprobe = Some(n.as_str().parse().unwrap_or(32));
336            }
337        }
338        Rule::mrl_dim_kwarg => {
339            // mrl_dim_kwarg = { "mrl_dim" ~ ":" ~ mrl_dim_spec }
340            if let Some(n) = p.into_inner().next() {
341                config.mrl_dim = Some(n.as_str().parse().unwrap_or(0));
342            }
343        }
344        Rule::quantization_kwarg => {
345            // quantization_kwarg = { "quantization" ~ ":" ~ quantization_spec }
346            if let Some(q) = p.into_inner().next() {
347                config.quantization = Some(match q.as_str() {
348                    "float32" | "f32" => WeightQuantization::Float32,
349                    "float16" | "f16" => WeightQuantization::Float16,
350                    "uint8" | "u8" => WeightQuantization::UInt8,
351                    "uint4" | "u4" => WeightQuantization::UInt4,
352                    _ => WeightQuantization::default(),
353                });
354            }
355        }
356        Rule::weight_threshold_kwarg => {
357            // weight_threshold_kwarg = { "weight_threshold" ~ ":" ~ weight_threshold_spec }
358            if let Some(t) = p.into_inner().next() {
359                config.weight_threshold = Some(t.as_str().parse().unwrap_or(0.0));
360            }
361        }
362        Rule::block_size_kwarg => {
363            // block_size_kwarg = { "block_size" ~ ":" ~ block_size_spec }
364            if let Some(n) = p.into_inner().next() {
365                config.block_size = Some(n.as_str().parse().unwrap_or(128));
366            }
367        }
368        Rule::pruning_kwarg => {
369            // pruning_kwarg = { "pruning" ~ ":" ~ pruning_spec }
370            if let Some(f) = p.into_inner().next() {
371                config.posting_list_pruning = Some(f.as_str().parse().unwrap_or(1.0));
372            }
373        }
374        Rule::query_config_block => {
375            // query_config_block = { "query" ~ "<" ~ query_config_params ~ ">" }
376            parse_query_config_block(config, p);
377        }
378        Rule::positions_kwarg => {
379            // positions_kwarg = { "positions" | "ordinal" | "token_position" }
380            use super::schema::PositionMode;
381            config.positions = Some(match p.as_str() {
382                "ordinal" => PositionMode::Ordinal,
383                "token_position" => PositionMode::TokenPosition,
384                _ => PositionMode::Full, // "positions" or any other value defaults to Full
385            });
386        }
387        _ => {}
388    }
389}
390
391/// Parse query configuration block: query<tokenizer: "...", weighting: idf>
392fn parse_query_config_block(config: &mut IndexConfig, pair: pest::iterators::Pair<Rule>) {
393    for inner in pair.into_inner() {
394        if inner.as_rule() == Rule::query_config_params {
395            for param in inner.into_inner() {
396                if param.as_rule() == Rule::query_config_param {
397                    for p in param.into_inner() {
398                        match p.as_rule() {
399                            Rule::query_tokenizer_kwarg => {
400                                // query_tokenizer_kwarg = { "tokenizer" ~ ":" ~ tokenizer_path }
401                                if let Some(path) = p.into_inner().next()
402                                    && let Some(inner_path) = path.into_inner().next()
403                                {
404                                    config.query_tokenizer = Some(inner_path.as_str().to_string());
405                                }
406                            }
407                            Rule::query_weighting_kwarg => {
408                                // query_weighting_kwarg = { "weighting" ~ ":" ~ weighting_spec }
409                                if let Some(w) = p.into_inner().next() {
410                                    config.query_weighting = Some(match w.as_str() {
411                                        "one" => QueryWeighting::One,
412                                        "idf" => QueryWeighting::Idf,
413                                        _ => QueryWeighting::One,
414                                    });
415                                }
416                            }
417                            _ => {}
418                        }
419                    }
420                }
421            }
422        }
423    }
424}
425
426/// Parse a field definition from pest pair
427fn parse_field_def(pair: pest::iterators::Pair<Rule>) -> Result<FieldDef> {
428    let mut inner = pair.into_inner();
429
430    let name = inner
431        .next()
432        .ok_or_else(|| Error::Schema("Missing field name".to_string()))?
433        .as_str()
434        .to_string();
435
436    let field_type_str = inner
437        .next()
438        .ok_or_else(|| Error::Schema("Missing field type".to_string()))?
439        .as_str();
440
441    let field_type = parse_field_type(field_type_str)?;
442
443    // Parse optional tokenizer spec, sparse_vector_config, dense_vector_config, and attributes
444    let mut tokenizer = None;
445    let mut sparse_vector_config = None;
446    let mut dense_vector_config = None;
447    let mut indexed = true;
448    let mut stored = true;
449    let mut multi = false;
450    let mut index_config: Option<IndexConfig> = None;
451
452    for item in inner {
453        match item.as_rule() {
454            Rule::tokenizer_spec => {
455                // Extract tokenizer name from <name>
456                if let Some(tok_name) = item.into_inner().next() {
457                    tokenizer = Some(tok_name.as_str().to_string());
458                }
459            }
460            Rule::sparse_vector_config => {
461                // Parse named parameters: <index_size: u16, quantization: uint8, weight_threshold: 0.1>
462                sparse_vector_config = Some(parse_sparse_vector_config(item));
463            }
464            Rule::dense_vector_config => {
465                // Parse dense_vector_params (keyword or positional) - only dims and mrl_dim
466                dense_vector_config = Some(parse_dense_vector_config(item));
467            }
468            Rule::attributes => {
469                let (idx, sto, mul, idx_cfg) = parse_attributes(item);
470                indexed = idx;
471                stored = sto;
472                multi = mul;
473                index_config = idx_cfg;
474            }
475            _ => {}
476        }
477    }
478
479    // Merge index config into vector configs if both exist
480    let mut positions = None;
481    if let Some(idx_cfg) = index_config {
482        positions = idx_cfg.positions;
483        if let Some(ref mut dv_config) = dense_vector_config {
484            apply_index_config_to_dense_vector(dv_config, idx_cfg);
485        } else if field_type == FieldType::SparseVector {
486            // For sparse vectors, create default config if not present and apply index params
487            let sv_config = sparse_vector_config.get_or_insert(SparseVectorConfig::default());
488            apply_index_config_to_sparse_vector(sv_config, idx_cfg);
489        }
490    }
491
492    Ok(FieldDef {
493        name,
494        field_type,
495        indexed,
496        stored,
497        tokenizer,
498        multi,
499        positions,
500        sparse_vector_config,
501        dense_vector_config,
502    })
503}
504
505/// Apply index configuration from indexed<...> to DenseVectorConfig
506fn apply_index_config_to_dense_vector(config: &mut DenseVectorConfig, idx_cfg: IndexConfig) {
507    // Apply index type if specified
508    if let Some(index_type) = idx_cfg.index_type {
509        config.index_type = index_type;
510    }
511
512    // Apply num_clusters for IVF-based indexes
513    if idx_cfg.num_clusters.is_some() {
514        config.num_clusters = idx_cfg.num_clusters;
515    }
516
517    // Apply nprobe if specified
518    if let Some(nprobe) = idx_cfg.nprobe {
519        config.nprobe = nprobe;
520    }
521
522    // Apply mrl_dim if specified
523    if idx_cfg.mrl_dim.is_some() {
524        config.mrl_dim = idx_cfg.mrl_dim;
525    }
526
527    // Apply build_threshold if specified
528    if idx_cfg.build_threshold.is_some() {
529        config.build_threshold = idx_cfg.build_threshold;
530    }
531}
532
533/// Parse sparse_vector_config - only index_size (positional)
534/// Example: <u16> or <u32>
535fn parse_sparse_vector_config(pair: pest::iterators::Pair<Rule>) -> SparseVectorConfig {
536    let mut index_size = IndexSize::default();
537
538    // Parse positional index_size_spec
539    for inner in pair.into_inner() {
540        if inner.as_rule() == Rule::index_size_spec {
541            index_size = match inner.as_str() {
542                "u16" => IndexSize::U16,
543                "u32" => IndexSize::U32,
544                _ => IndexSize::default(),
545            };
546        }
547    }
548
549    SparseVectorConfig {
550        index_size,
551        weight_quantization: WeightQuantization::default(),
552        weight_threshold: 0.0,
553        block_size: 128,
554        posting_list_pruning: None,
555        query_config: None,
556    }
557}
558
559/// Apply index configuration from indexed<...> to SparseVectorConfig
560fn apply_index_config_to_sparse_vector(config: &mut SparseVectorConfig, idx_cfg: IndexConfig) {
561    if let Some(q) = idx_cfg.quantization {
562        config.weight_quantization = q;
563    }
564    if let Some(t) = idx_cfg.weight_threshold {
565        config.weight_threshold = t;
566    }
567    if let Some(bs) = idx_cfg.block_size {
568        let adjusted = bs.next_power_of_two();
569        if adjusted != bs {
570            log::warn!(
571                "block_size {} adjusted to next power of two: {}",
572                bs,
573                adjusted
574            );
575        }
576        config.block_size = adjusted;
577    }
578    if let Some(p) = idx_cfg.posting_list_pruning {
579        let clamped = p.clamp(0.0, 1.0);
580        if (clamped - p).abs() > f32::EPSILON {
581            log::warn!(
582                "pruning {} clamped to valid range [0.0, 1.0]: {}",
583                p,
584                clamped
585            );
586        }
587        config.posting_list_pruning = Some(clamped);
588    }
589    // Apply query-time configuration if present
590    if idx_cfg.query_tokenizer.is_some() || idx_cfg.query_weighting.is_some() {
591        let query_config = config
592            .query_config
593            .get_or_insert(SparseQueryConfig::default());
594        if let Some(tokenizer) = idx_cfg.query_tokenizer {
595            query_config.tokenizer = Some(tokenizer);
596        }
597        if let Some(weighting) = idx_cfg.query_weighting {
598            query_config.weighting = weighting;
599        }
600    }
601}
602
603/// Parse dense_vector_config - only dims
604/// All index-related params (including mrl_dim) are now in indexed<...> attribute
605fn parse_dense_vector_config(pair: pest::iterators::Pair<Rule>) -> DenseVectorConfig {
606    let mut dim: usize = 0;
607
608    // Navigate to dense_vector_params
609    for params in pair.into_inner() {
610        if params.as_rule() == Rule::dense_vector_params {
611            for inner in params.into_inner() {
612                match inner.as_rule() {
613                    Rule::dense_vector_keyword_params => {
614                        // Parse keyword args: dims: N
615                        for kwarg in inner.into_inner() {
616                            if kwarg.as_rule() == Rule::dims_kwarg
617                                && let Some(d) = kwarg.into_inner().next()
618                            {
619                                dim = d.as_str().parse().unwrap_or(0);
620                            }
621                        }
622                    }
623                    Rule::dense_vector_positional_params => {
624                        // Parse positional: just dimension
625                        if let Some(dim_pair) = inner.into_inner().next() {
626                            dim = dim_pair.as_str().parse().unwrap_or(0);
627                        }
628                    }
629                    _ => {}
630                }
631            }
632        }
633    }
634
635    DenseVectorConfig::new(dim)
636}
637
638/// Parse default_fields definition
639fn parse_default_fields_def(pair: pest::iterators::Pair<Rule>) -> Vec<String> {
640    pair.into_inner().map(|p| p.as_str().to_string()).collect()
641}
642
643/// Parse a query router definition
644fn parse_query_router_def(pair: pest::iterators::Pair<Rule>) -> Result<QueryRouterRule> {
645    let mut pattern = String::new();
646    let mut substitution = String::new();
647    let mut target_field = String::new();
648    let mut mode = RoutingMode::Additional;
649
650    for prop in pair.into_inner() {
651        if prop.as_rule() != Rule::query_router_prop {
652            continue;
653        }
654
655        for inner in prop.into_inner() {
656            match inner.as_rule() {
657                Rule::query_router_pattern => {
658                    if let Some(regex_str) = inner.into_inner().next() {
659                        pattern = parse_string_value(regex_str);
660                    }
661                }
662                Rule::query_router_substitution => {
663                    if let Some(quoted) = inner.into_inner().next() {
664                        substitution = parse_string_value(quoted);
665                    }
666                }
667                Rule::query_router_target => {
668                    if let Some(ident) = inner.into_inner().next() {
669                        target_field = ident.as_str().to_string();
670                    }
671                }
672                Rule::query_router_mode => {
673                    if let Some(mode_val) = inner.into_inner().next() {
674                        mode = match mode_val.as_str() {
675                            "exclusive" => RoutingMode::Exclusive,
676                            "additional" => RoutingMode::Additional,
677                            _ => RoutingMode::Additional,
678                        };
679                    }
680                }
681                _ => {}
682            }
683        }
684    }
685
686    if pattern.is_empty() {
687        return Err(Error::Schema("query_router missing 'pattern'".to_string()));
688    }
689    if substitution.is_empty() {
690        return Err(Error::Schema(
691            "query_router missing 'substitution'".to_string(),
692        ));
693    }
694    if target_field.is_empty() {
695        return Err(Error::Schema(
696            "query_router missing 'target_field'".to_string(),
697        ));
698    }
699
700    Ok(QueryRouterRule {
701        pattern,
702        substitution,
703        target_field,
704        mode,
705    })
706}
707
708/// Parse a string value from quoted_string, raw_string, or regex_string
709fn parse_string_value(pair: pest::iterators::Pair<Rule>) -> String {
710    let s = pair.as_str();
711    match pair.as_rule() {
712        Rule::regex_string => {
713            // regex_string contains either raw_string or quoted_string
714            if let Some(inner) = pair.into_inner().next() {
715                parse_string_value(inner)
716            } else {
717                s.to_string()
718            }
719        }
720        Rule::raw_string => {
721            // r"..." - strip r" prefix and " suffix
722            s[2..s.len() - 1].to_string()
723        }
724        Rule::quoted_string => {
725            // "..." - strip quotes and handle escapes
726            let inner = &s[1..s.len() - 1];
727            // Simple escape handling
728            inner
729                .replace("\\n", "\n")
730                .replace("\\t", "\t")
731                .replace("\\\"", "\"")
732                .replace("\\\\", "\\")
733        }
734        _ => s.to_string(),
735    }
736}
737
738/// Parse an index definition from pest pair
739fn parse_index_def(pair: pest::iterators::Pair<Rule>) -> Result<IndexDef> {
740    let mut inner = pair.into_inner();
741
742    let name = inner
743        .next()
744        .ok_or_else(|| Error::Schema("Missing index name".to_string()))?
745        .as_str()
746        .to_string();
747
748    let mut fields = Vec::new();
749    let mut default_fields = Vec::new();
750    let mut query_routers = Vec::new();
751
752    for item in inner {
753        match item.as_rule() {
754            Rule::field_def => {
755                fields.push(parse_field_def(item)?);
756            }
757            Rule::default_fields_def => {
758                default_fields = parse_default_fields_def(item);
759            }
760            Rule::query_router_def => {
761                query_routers.push(parse_query_router_def(item)?);
762            }
763            _ => {}
764        }
765    }
766
767    Ok(IndexDef {
768        name,
769        fields,
770        default_fields,
771        query_routers,
772    })
773}
774
775/// Parse SDL from a string
776pub fn parse_sdl(input: &str) -> Result<Vec<IndexDef>> {
777    let pairs = SdlParser::parse(Rule::file, input)
778        .map_err(|e| Error::Schema(format!("Parse error: {}", e)))?;
779
780    let mut indexes = Vec::new();
781
782    for pair in pairs {
783        if pair.as_rule() == Rule::file {
784            for inner in pair.into_inner() {
785                if inner.as_rule() == Rule::index_def {
786                    indexes.push(parse_index_def(inner)?);
787                }
788            }
789        }
790    }
791
792    Ok(indexes)
793}
794
795/// Parse SDL and return a single index definition
796pub fn parse_single_index(input: &str) -> Result<IndexDef> {
797    let indexes = parse_sdl(input)?;
798
799    if indexes.is_empty() {
800        return Err(Error::Schema("No index definition found".to_string()));
801    }
802
803    if indexes.len() > 1 {
804        return Err(Error::Schema(
805            "Multiple index definitions found, expected one".to_string(),
806        ));
807    }
808
809    Ok(indexes.into_iter().next().unwrap())
810}
811
#[cfg(test)]
mod tests {
    //! Unit tests for the SDL parser: field types and aliases, attributes,
    //! tokenizers, query routers, sparse/dense vector configuration, and the
    //! single-index convenience entry point.

    use super::*;
    use crate::dsl::schema::VectorIndexType;
    use crate::structures::QueryWeighting;

    /// Two text fields with explicit attribute lists parse into the expected
    /// names, types and indexed/stored flags.
    #[test]
    fn test_parse_simple_schema() {
        let sdl = r#"
            index articles {
                field title: text [indexed, stored]
                field body: text [indexed]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        assert_eq!(indexes.len(), 1);

        let index = &indexes[0];
        assert_eq!(index.name, "articles");
        assert_eq!(index.fields.len(), 2);

        assert_eq!(index.fields[0].name, "title");
        assert!(matches!(index.fields[0].field_type, FieldType::Text));
        assert!(index.fields[0].indexed);
        assert!(index.fields[0].stored);

        assert_eq!(index.fields[1].name, "body");
        assert!(matches!(index.fields[1].field_type, FieldType::Text));
        assert!(index.fields[1].indexed);
        assert!(!index.fields[1].stored);
    }

    /// Each canonical scalar type keyword maps to its `FieldType` variant.
    #[test]
    fn test_parse_all_field_types() {
        let sdl = r#"
            index test {
                field text_field: text [indexed, stored]
                field u64_field: u64 [indexed, stored]
                field i64_field: i64 [indexed, stored]
                field f64_field: f64 [indexed, stored]
                field bytes_field: bytes [stored]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let index = &indexes[0];

        assert!(matches!(index.fields[0].field_type, FieldType::Text));
        assert!(matches!(index.fields[1].field_type, FieldType::U64));
        assert!(matches!(index.fields[2].field_type, FieldType::I64));
        assert!(matches!(index.fields[3].field_type, FieldType::F64));
        assert!(matches!(index.fields[4].field_type, FieldType::Bytes));
    }

    /// `#` comments — on their own line and trailing a field definition — do
    /// not disturb the parsed fields.
    #[test]
    fn test_parse_with_comments() {
        // NOTE: the fixture's trailing comment says "not supported yet", but
        // this test requires that very line to parse, so the wording is stale.
        let sdl = r#"
            # This is a comment
            index articles {
                # Title field
                field title: text [indexed, stored]
                field body: text [indexed] # inline comment not supported yet
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        assert_eq!(indexes[0].fields.len(), 2);

        // The field followed by the trailing comment keeps its own attributes.
        let body = &indexes[0].fields[1];
        assert_eq!(body.name, "body");
        assert!(body.indexed);
        assert!(!body.stored);
    }

    /// Alias type keywords resolve to the same `FieldType` as the canonical
    /// names.
    #[test]
    fn test_parse_type_aliases() {
        let sdl = r#"
            index test {
                field a: string [indexed]
                field b: int [indexed]
                field c: uint [indexed]
                field d: float [indexed]
                field e: binary [stored]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let index = &indexes[0];

        assert!(matches!(index.fields[0].field_type, FieldType::Text));
        assert!(matches!(index.fields[1].field_type, FieldType::I64));
        assert!(matches!(index.fields[2].field_type, FieldType::U64));
        assert!(matches!(index.fields[3].field_type, FieldType::F64));
        assert!(matches!(index.fields[4].field_type, FieldType::Bytes));
    }

    /// Converting an index definition to a schema exposes exactly the
    /// declared fields by name.
    #[test]
    fn test_to_schema() {
        let sdl = r#"
            index articles {
                field title: text [indexed, stored]
                field views: u64 [indexed, stored]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let schema = indexes[0].to_schema();

        assert!(schema.get_field("title").is_some());
        assert!(schema.get_field("views").is_some());
        assert!(schema.get_field("nonexistent").is_none());
    }

    /// A field without an attribute list is both indexed and stored.
    #[test]
    fn test_default_attributes() {
        let sdl = r#"
            index test {
                field title: text
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let field = &indexes[0].fields[0];

        // Default should be indexed and stored
        assert!(field.indexed);
        assert!(field.stored);
    }

    /// Several index blocks in one document are returned in source order.
    #[test]
    fn test_multiple_indexes() {
        let sdl = r#"
            index articles {
                field title: text [indexed, stored]
            }

            index users {
                field name: text [indexed, stored]
                field email: text [indexed, stored]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        assert_eq!(indexes.len(), 2);
        assert_eq!(indexes[0].name, "articles");
        assert_eq!(indexes[1].name, "users");
    }

    /// `parse_single_index` returns the definition when exactly one index is
    /// declared.
    #[test]
    fn test_parse_single_index_ok() {
        let sdl = r#"
            index articles {
                field title: text [indexed, stored]
            }
        "#;

        let index = parse_single_index(sdl).unwrap();
        assert_eq!(index.name, "articles");
        assert_eq!(index.fields.len(), 1);
    }

    /// `parse_single_index` rejects input that yields no index definition
    /// (whether the grammar rejects it or it parses to an empty list).
    #[test]
    fn test_parse_single_index_empty_input() {
        assert!(parse_single_index("").is_err());
    }

    /// `parse_single_index` rejects documents declaring more than one index.
    #[test]
    fn test_parse_single_index_multiple() {
        let sdl = r#"
            index articles {
                field title: text [indexed, stored]
            }

            index users {
                field name: text [indexed, stored]
            }
        "#;

        match parse_single_index(sdl) {
            Err(Error::Schema(msg)) => {
                assert!(msg.contains("Multiple index definitions"));
            }
            _ => panic!("expected a schema error for multiple index definitions"),
        }
    }

    /// `text<name>` records the tokenizer name; plain `text` leaves it unset.
    #[test]
    fn test_tokenizer_spec() {
        let sdl = r#"
            index articles {
                field title: text<en_stem> [indexed, stored]
                field body: text<default> [indexed]
                field author: text [indexed, stored]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let index = &indexes[0];

        assert_eq!(index.fields[0].name, "title");
        assert_eq!(index.fields[0].tokenizer, Some("en_stem".to_string()));

        assert_eq!(index.fields[1].name, "body");
        assert_eq!(index.fields[1].tokenizer, Some("default".to_string()));

        assert_eq!(index.fields[2].name, "author");
        assert_eq!(index.fields[2].tokenizer, None); // No tokenizer specified
    }

    /// Tokenizer names survive the conversion to a schema field entry.
    #[test]
    fn test_tokenizer_in_schema() {
        let sdl = r#"
            index articles {
                field title: text<german> [indexed, stored]
                field body: text<en_stem> [indexed]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let schema = indexes[0].to_schema();

        let title_field = schema.get_field("title").unwrap();
        let title_entry = schema.get_field_entry(title_field).unwrap();
        assert_eq!(title_entry.tokenizer, Some("german".to_string()));

        let body_field = schema.get_field("body").unwrap();
        let body_entry = schema.get_field_entry(body_field).unwrap();
        assert_eq!(body_entry.tokenizer, Some("en_stem".to_string()));
    }

    /// A query router with an escaped (non-raw) pattern string parses into
    /// the unescaped pattern, substitution, target field and mode.
    #[test]
    fn test_query_router_basic() {
        // The router targets `uris`, so the fixture declares that field.
        let sdl = r#"
            index documents {
                field title: text [indexed, stored]
                field uris: text [indexed, stored]

                query_router {
                    pattern: "10\\.\\d{4,}/[^\\s]+"
                    substitution: "doi://{0}"
                    target_field: uris
                    mode: exclusive
                }
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let index = &indexes[0];

        assert_eq!(index.query_routers.len(), 1);
        let router = &index.query_routers[0];
        assert_eq!(router.pattern, r"10\.\d{4,}/[^\s]+");
        assert_eq!(router.substitution, "doi://{0}");
        assert_eq!(router.target_field, "uris");
        assert_eq!(router.mode, RoutingMode::Exclusive);
    }

    /// A raw-string (`r"..."`) pattern is taken verbatim.
    #[test]
    fn test_query_router_raw_string() {
        let sdl = r#"
            index documents {
                field uris: text [indexed, stored]

                query_router {
                    pattern: r"^pmid:(\d+)$"
                    substitution: "pubmed://{1}"
                    target_field: uris
                    mode: additional
                }
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let router = &indexes[0].query_routers[0];

        assert_eq!(router.pattern, r"^pmid:(\d+)$");
        assert_eq!(router.substitution, "pubmed://{1}");
        assert_eq!(router.mode, RoutingMode::Additional);
    }

    /// Every `query_router` block in an index is collected.
    #[test]
    fn test_multiple_query_routers() {
        let sdl = r#"
            index documents {
                field uris: text [indexed, stored]

                query_router {
                    pattern: r"^doi:(10\.\d{4,}/[^\s]+)$"
                    substitution: "doi://{1}"
                    target_field: uris
                    mode: exclusive
                }

                query_router {
                    pattern: r"^pmid:(\d+)$"
                    substitution: "pubmed://{1}"
                    target_field: uris
                    mode: exclusive
                }

                query_router {
                    pattern: r"^arxiv:(\d+\.\d+)$"
                    substitution: "arxiv://{1}"
                    target_field: uris
                    mode: additional
                }
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        assert_eq!(indexes[0].query_routers.len(), 3);
    }

    /// Omitting `mode` yields `RoutingMode::Additional`.
    #[test]
    fn test_query_router_default_mode() {
        let sdl = r#"
            index documents {
                field uris: text [indexed, stored]

                query_router {
                    pattern: r"test"
                    substitution: "{0}"
                    target_field: uris
                }
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        // Default mode should be Additional
        assert_eq!(indexes[0].query_routers[0].mode, RoutingMode::Additional);
    }

    /// `stored<multi>` sets the `multi` flag, and the flag is preserved in
    /// the converted schema.
    #[test]
    fn test_multi_attribute() {
        let sdl = r#"
            index documents {
                field uris: text [indexed, stored<multi>]
                field title: text [indexed, stored]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        assert_eq!(indexes.len(), 1);

        let fields = &indexes[0].fields;
        assert_eq!(fields.len(), 2);

        // uris should have multi=true
        assert_eq!(fields[0].name, "uris");
        assert!(fields[0].multi, "uris field should have multi=true");

        // title should have multi=false
        assert_eq!(fields[1].name, "title");
        assert!(!fields[1].multi, "title field should have multi=false");

        // Verify schema conversion preserves multi attribute
        let schema = indexes[0].to_schema();
        let uris_field = schema.get_field("uris").unwrap();
        let title_field = schema.get_field("title").unwrap();

        assert!(schema.get_field_entry(uris_field).unwrap().multi);
        assert!(!schema.get_field_entry(title_field).unwrap().multi);
    }

    /// A bare `sparse_vector` field has the sparse type and no explicit
    /// configuration.
    #[test]
    fn test_sparse_vector_field() {
        let sdl = r#"
            index documents {
                field embedding: sparse_vector [indexed, stored]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        assert_eq!(indexes.len(), 1);
        assert_eq!(indexes[0].fields.len(), 1);
        assert_eq!(indexes[0].fields[0].name, "embedding");
        assert_eq!(indexes[0].fields[0].field_type, FieldType::SparseVector);
        assert!(indexes[0].fields[0].sparse_vector_config.is_none());
    }

    /// Index size (`<u16>` / `<u32>`) and `quantization` are captured in the
    /// sparse vector configuration.
    #[test]
    fn test_sparse_vector_with_config() {
        let sdl = r#"
            index documents {
                field embedding: sparse_vector<u16> [indexed<quantization: uint8>, stored]
                field dense: sparse_vector<u32> [indexed<quantization: float32>]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        assert_eq!(indexes[0].fields.len(), 2);

        // First field: u16 indices, uint8 quantization
        let f1 = &indexes[0].fields[0];
        assert_eq!(f1.name, "embedding");
        let config1 = f1.sparse_vector_config.as_ref().unwrap();
        assert_eq!(config1.index_size, IndexSize::U16);
        assert_eq!(config1.weight_quantization, WeightQuantization::UInt8);

        // Second field: u32 indices, float32 quantization
        let f2 = &indexes[0].fields[1];
        assert_eq!(f2.name, "dense");
        let config2 = f2.sparse_vector_config.as_ref().unwrap();
        assert_eq!(config2.index_size, IndexSize::U32);
        assert_eq!(config2.weight_quantization, WeightQuantization::Float32);
    }

    /// `weight_threshold` is parsed alongside quantization.
    #[test]
    fn test_sparse_vector_with_weight_threshold() {
        let sdl = r#"
            index documents {
                field embedding: sparse_vector<u16> [indexed<quantization: uint8, weight_threshold: 0.1>, stored]
                field embedding2: sparse_vector<u32> [indexed<quantization: float16, weight_threshold: 0.05>]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        assert_eq!(indexes[0].fields.len(), 2);

        // First field: u16 indices, uint8 quantization, threshold 0.1
        let f1 = &indexes[0].fields[0];
        assert_eq!(f1.name, "embedding");
        let config1 = f1.sparse_vector_config.as_ref().unwrap();
        assert_eq!(config1.index_size, IndexSize::U16);
        assert_eq!(config1.weight_quantization, WeightQuantization::UInt8);
        assert!((config1.weight_threshold - 0.1).abs() < 0.001);

        // Second field: u32 indices, float16 quantization, threshold 0.05
        let f2 = &indexes[0].fields[1];
        assert_eq!(f2.name, "embedding2");
        let config2 = f2.sparse_vector_config.as_ref().unwrap();
        assert_eq!(config2.index_size, IndexSize::U32);
        assert_eq!(config2.weight_quantization, WeightQuantization::Float16);
        assert!((config2.weight_threshold - 0.05).abs() < 0.001);
    }

    /// `pruning` is parsed into `posting_list_pruning`.
    #[test]
    fn test_sparse_vector_with_pruning() {
        let sdl = r#"
            index documents {
                field embedding: sparse_vector [indexed<quantization: uint8, pruning: 0.1>, stored]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let f = &indexes[0].fields[0];
        assert_eq!(f.name, "embedding");
        let config = f.sparse_vector_config.as_ref().unwrap();
        assert_eq!(config.weight_quantization, WeightQuantization::UInt8);

        // Compare with a tolerance, matching the weight_threshold tests.
        let pruning = config
            .posting_list_pruning
            .expect("pruning should be set");
        assert!((pruning - 0.1).abs() < 0.001);
    }

    /// `dense_vector<N>` produces a dense vector field of dimension `N`.
    #[test]
    fn test_dense_vector_field() {
        let sdl = r#"
            index documents {
                field embedding: dense_vector<768> [indexed, stored]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        assert_eq!(indexes.len(), 1);
        assert_eq!(indexes[0].fields.len(), 1);

        let f = &indexes[0].fields[0];
        assert_eq!(f.name, "embedding");
        assert_eq!(f.field_type, FieldType::DenseVector);

        let config = f.dense_vector_config.as_ref().unwrap();
        assert_eq!(config.dim, 768);
    }

    /// `vector<N>` is accepted as an alias for `dense_vector<N>`.
    #[test]
    fn test_dense_vector_alias() {
        let sdl = r#"
            index documents {
                field embedding: vector<1536> [indexed]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        assert_eq!(indexes[0].fields[0].field_type, FieldType::DenseVector);
        assert_eq!(
            indexes[0].fields[0]
                .dense_vector_config
                .as_ref()
                .unwrap()
                .dim,
            1536
        );
    }

    /// `num_clusters` is captured and `nprobe` falls back to 32 when omitted.
    #[test]
    fn test_dense_vector_with_num_clusters() {
        let sdl = r#"
            index documents {
                field embedding: dense_vector<768> [indexed<ivf_rabitq, num_clusters: 256>, stored]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        assert_eq!(indexes.len(), 1);

        let f = &indexes[0].fields[0];
        assert_eq!(f.name, "embedding");
        assert_eq!(f.field_type, FieldType::DenseVector);

        let config = f.dense_vector_config.as_ref().unwrap();
        assert_eq!(config.dim, 768);
        assert_eq!(config.num_clusters, Some(256));
        assert_eq!(config.nprobe, 32); // default
    }

    /// Explicit `num_clusters` and `nprobe` are both honoured.
    #[test]
    fn test_dense_vector_with_num_clusters_and_nprobe() {
        let sdl = r#"
            index documents {
                field embedding: dense_vector<1536> [indexed<ivf_rabitq, num_clusters: 512, nprobe: 64>]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();

        assert_eq!(config.dim, 1536);
        assert_eq!(config.num_clusters, Some(512));
        assert_eq!(config.nprobe, 64);
    }

    /// The keyword form `dense_vector<dims: N>` sets the dimension.
    #[test]
    fn test_dense_vector_keyword_syntax() {
        let sdl = r#"
            index documents {
                field embedding: dense_vector<dims: 1536> [indexed, stored]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();

        assert_eq!(config.dim, 1536);
        assert!(config.num_clusters.is_none());
    }

    /// Keyword dimension syntax combined with a full `indexed<...>` list.
    #[test]
    fn test_dense_vector_keyword_syntax_full() {
        let sdl = r#"
            index documents {
                field embedding: dense_vector<dims: 1536> [indexed<ivf_rabitq, num_clusters: 256, nprobe: 64>]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();

        assert_eq!(config.dim, 1536);
        assert_eq!(config.num_clusters, Some(256));
        assert_eq!(config.nprobe, 64);
    }

    /// Keyword dimension syntax with only `num_clusters`; `nprobe` defaults.
    #[test]
    fn test_dense_vector_keyword_syntax_partial() {
        let sdl = r#"
            index documents {
                field embedding: dense_vector<dims: 768> [indexed<ivf_rabitq, num_clusters: 128>]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();

        assert_eq!(config.dim, 768);
        assert_eq!(config.num_clusters, Some(128));
        assert_eq!(config.nprobe, 32); // default
    }

    /// `indexed<scann, ...>` selects `VectorIndexType::ScaNN`.
    #[test]
    fn test_dense_vector_scann_index() {
        let sdl = r#"
            index documents {
                field embedding: dense_vector<dims: 768> [indexed<scann, num_clusters: 256, nprobe: 64>]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();

        assert_eq!(config.dim, 768);
        assert_eq!(config.index_type, VectorIndexType::ScaNN);
        assert_eq!(config.num_clusters, Some(256));
        assert_eq!(config.nprobe, 64);
    }

    /// `indexed<ivf_rabitq, ...>` selects `VectorIndexType::IvfRaBitQ`.
    #[test]
    fn test_dense_vector_ivf_rabitq_index() {
        let sdl = r#"
            index documents {
                field embedding: dense_vector<dims: 1536> [indexed<ivf_rabitq, num_clusters: 512>]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();

        assert_eq!(config.dim, 1536);
        assert_eq!(config.index_type, VectorIndexType::IvfRaBitQ);
        assert_eq!(config.num_clusters, Some(512));
    }

    /// `indexed<rabitq>` selects `VectorIndexType::RaBitQ` with no clusters.
    #[test]
    fn test_dense_vector_rabitq_no_clusters() {
        let sdl = r#"
            index documents {
                field embedding: dense_vector<dims: 768> [indexed<rabitq>]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();

        assert_eq!(config.dim, 768);
        assert_eq!(config.index_type, VectorIndexType::RaBitQ);
        assert!(config.num_clusters.is_none());
    }

    /// `indexed<flat>` selects `VectorIndexType::Flat`.
    #[test]
    fn test_dense_vector_flat_index() {
        let sdl = r#"
            index documents {
                field embedding: dense_vector<dims: 768> [indexed<flat>]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();

        assert_eq!(config.dim, 768);
        assert_eq!(config.index_type, VectorIndexType::Flat);
    }

    /// A bare `indexed` attribute on a dense vector defaults the index type.
    #[test]
    fn test_dense_vector_default_index_type() {
        // When no index type specified, should default to RaBitQ (basic)
        let sdl = r#"
            index documents {
                field embedding: dense_vector<dims: 768> [indexed]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();

        assert_eq!(config.dim, 768);
        assert_eq!(config.index_type, VectorIndexType::RaBitQ);
    }

    /// `mrl_dim` in `indexed<...>` sets the index dimension without changing
    /// the stored dimension.
    #[test]
    fn test_dense_vector_mrl_dim() {
        // Test matryoshka/MRL dimension trimming (new syntax: mrl_dim in indexed<...>)
        let sdl = r#"
            index documents {
                field embedding: dense_vector<1536> [indexed<mrl_dim: 256>]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();

        assert_eq!(config.dim, 1536);
        assert_eq!(config.mrl_dim, Some(256));
        assert_eq!(config.index_dim(), 256);
    }

    /// `mrl_dim` can be combined with the other index options.
    #[test]
    fn test_dense_vector_mrl_dim_with_num_clusters() {
        // Test mrl_dim combined with other index options
        let sdl = r#"
            index documents {
                field embedding: dense_vector<768> [indexed<ivf_rabitq, num_clusters: 256, nprobe: 64, mrl_dim: 128>]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();

        assert_eq!(config.dim, 768);
        assert_eq!(config.mrl_dim, Some(128));
        assert_eq!(config.index_dim(), 128);
        assert_eq!(config.num_clusters, Some(256));
        assert_eq!(config.nprobe, 64);
    }

    /// Without `mrl_dim`, `index_dim()` equals the full dimension.
    #[test]
    fn test_dense_vector_no_mrl_dim() {
        // Test that index_dim() returns full dim when mrl_dim is not set
        let sdl = r#"
            index documents {
                field embedding: dense_vector<dims: 768> [indexed]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();

        assert_eq!(config.dim, 768);
        assert_eq!(config.mrl_dim, None);
        assert_eq!(config.index_dim(), 768);
    }

    /// `json` fields parse with or without attributes and are stored but not
    /// indexed after schema conversion.
    #[test]
    fn test_json_field_type() {
        let sdl = r#"
            index documents {
                field title: text [indexed, stored]
                field metadata: json [stored]
                field extra: json
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let index = &indexes[0];

        assert_eq!(index.fields.len(), 3);

        // Check JSON field
        assert_eq!(index.fields[1].name, "metadata");
        assert!(matches!(index.fields[1].field_type, FieldType::Json));
        assert!(index.fields[1].stored);
        // JSON fields should not be indexed (enforced by add_json_field)

        // Check default attributes for JSON field
        assert_eq!(index.fields[2].name, "extra");
        assert!(matches!(index.fields[2].field_type, FieldType::Json));

        // Verify schema conversion
        let schema = index.to_schema();
        let metadata_field = schema.get_field("metadata").unwrap();
        let entry = schema.get_field_entry(metadata_field).unwrap();
        assert_eq!(entry.field_type, FieldType::Json);
        assert!(!entry.indexed); // JSON fields are never indexed
        assert!(entry.stored);
    }

    /// A nested `query<...>` block sets the query tokenizer and weighting,
    /// and both survive schema conversion.
    #[test]
    fn test_sparse_vector_query_config() {
        let sdl = r#"
            index documents {
                field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<tokenizer: "Alibaba-NLP/gte-Qwen2-1.5B-instruct", weighting: idf>>]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let index = &indexes[0];

        assert_eq!(index.fields.len(), 1);
        assert_eq!(index.fields[0].name, "embedding");
        assert!(matches!(
            index.fields[0].field_type,
            FieldType::SparseVector
        ));

        let config = index.fields[0].sparse_vector_config.as_ref().unwrap();
        assert_eq!(config.index_size, IndexSize::U16);
        assert_eq!(config.weight_quantization, WeightQuantization::UInt8);

        // Check query config
        let query_config = config.query_config.as_ref().unwrap();
        assert_eq!(
            query_config.tokenizer.as_deref(),
            Some("Alibaba-NLP/gte-Qwen2-1.5B-instruct")
        );
        assert_eq!(query_config.weighting, QueryWeighting::Idf);

        // Verify schema conversion preserves query config
        let schema = index.to_schema();
        let embedding_field = schema.get_field("embedding").unwrap();
        let entry = schema.get_field_entry(embedding_field).unwrap();
        let sv_config = entry.sparse_vector_config.as_ref().unwrap();
        let qc = sv_config.query_config.as_ref().unwrap();
        assert_eq!(
            qc.tokenizer.as_deref(),
            Some("Alibaba-NLP/gte-Qwen2-1.5B-instruct")
        );
        assert_eq!(qc.weighting, QueryWeighting::Idf);
    }

    /// `query<weighting: one>` without a tokenizer leaves the tokenizer unset.
    #[test]
    fn test_sparse_vector_query_config_weighting_one() {
        let sdl = r#"
            index documents {
                field embedding: sparse_vector [indexed<query<weighting: one>>]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let config = indexes[0].fields[0].sparse_vector_config.as_ref().unwrap();

        let query_config = config.query_config.as_ref().unwrap();
        assert!(query_config.tokenizer.is_none());
        assert_eq!(query_config.weighting, QueryWeighting::One);
    }
}