Skip to main content

hermes_core/dsl/sdl/
mod.rs

1//! Schema Definition Language (SDL) for Hermes
2//!
3//! A simple, readable format for defining index schemas using pest parser.
4//!
5//! # Example SDL
6//!
7//! ```text
8//! # Article index schema
9//! index articles {
10//!     # Primary text field for full-text search
11//!     field title: text [indexed, stored]
12//!
13//!     # Body content - indexed but not stored (save space)
14//!     field body: text [indexed]
15//!
16//!     # Author name
17//!     field author: text [indexed, stored]
18//!
19//!     # Publication timestamp
20//!     field published_at: i64 [indexed, stored]
21//!
22//!     # View count
23//!     field views: u64 [indexed, stored]
24//!
25//!     # Rating score
26//!     field rating: f64 [indexed, stored]
27//!
28//!     # Raw content hash (not indexed, just stored)
29//!     field content_hash: bytes [stored]
30//!
31//!     # Dense vector with IVF-RaBitQ index
32//!     field embedding: dense_vector<768> [indexed<rabitq, centroids: "centroids.bin", nprobe: 32>]
33//!
34//!     # Dense vector with ScaNN index and MRL dimension
35//!     field embedding2: dense_vector<1536> [indexed<scann, centroids: "c.bin", codebook: "pq.bin", mrl_dim: 256>]
36//! }
37//! ```
38//!
39//! # Dense Vector Index Configuration
40//!
41//! Index-related parameters for dense vectors are specified in `indexed<...>`:
//! - `flat`, `rabitq`, `ivf_rabitq`, or `scann` - index type
43//! - `centroids: "path"` - path to pre-trained centroids file
44//! - `codebook: "path"` - path to PQ codebook (ScaNN only)
45//! - `nprobe: N` - number of clusters to probe (default: 32)
46//! - `mrl_dim: N` - Matryoshka dimension for index (uses truncated vectors)
47
48use pest::Parser;
49use pest_derive::Parser;
50
51use super::query_field_router::{QueryRouterRule, RoutingMode};
52use super::schema::{FieldType, Schema, SchemaBuilder};
53use crate::Result;
54use crate::error::Error;
55
56#[derive(Parser)]
57#[grammar = "dsl/sdl/sdl.pest"]
58pub struct SdlParser;
59
60use super::schema::DenseVectorConfig;
61use crate::structures::{
62    IndexSize, QueryWeighting, SparseQueryConfig, SparseVectorConfig, WeightQuantization,
63};
64
65/// Parsed field definition
66#[derive(Debug, Clone)]
67pub struct FieldDef {
68    pub name: String,
69    pub field_type: FieldType,
70    pub indexed: bool,
71    pub stored: bool,
72    /// Tokenizer name for text fields (e.g., "default", "en_stem", "german")
73    pub tokenizer: Option<String>,
74    /// Whether this field can have multiple values (serialized as array in JSON)
75    pub multi: bool,
76    /// Position tracking mode for phrase queries and multi-field element tracking
77    pub positions: Option<super::schema::PositionMode>,
78    /// Configuration for sparse vector fields
79    pub sparse_vector_config: Option<SparseVectorConfig>,
80    /// Configuration for dense vector fields
81    pub dense_vector_config: Option<DenseVectorConfig>,
82}
83
84/// Parsed index definition
85#[derive(Debug, Clone)]
86pub struct IndexDef {
87    pub name: String,
88    pub fields: Vec<FieldDef>,
89    pub default_fields: Vec<String>,
90    /// Query router rules for routing queries to specific fields
91    pub query_routers: Vec<QueryRouterRule>,
92}
93
94impl IndexDef {
95    /// Convert to a Schema
96    pub fn to_schema(&self) -> Schema {
97        let mut builder = SchemaBuilder::default();
98
99        for field in &self.fields {
100            let f = match field.field_type {
101                FieldType::Text => {
102                    let tokenizer = field.tokenizer.as_deref().unwrap_or("default");
103                    builder.add_text_field_with_tokenizer(
104                        &field.name,
105                        field.indexed,
106                        field.stored,
107                        tokenizer,
108                    )
109                }
110                FieldType::U64 => builder.add_u64_field(&field.name, field.indexed, field.stored),
111                FieldType::I64 => builder.add_i64_field(&field.name, field.indexed, field.stored),
112                FieldType::F64 => builder.add_f64_field(&field.name, field.indexed, field.stored),
113                FieldType::Bytes => builder.add_bytes_field(&field.name, field.stored),
114                FieldType::Json => builder.add_json_field(&field.name, field.stored),
115                FieldType::SparseVector => {
116                    if let Some(config) = &field.sparse_vector_config {
117                        builder.add_sparse_vector_field_with_config(
118                            &field.name,
119                            field.indexed,
120                            field.stored,
121                            config.clone(),
122                        )
123                    } else {
124                        builder.add_sparse_vector_field(&field.name, field.indexed, field.stored)
125                    }
126                }
127                FieldType::DenseVector => {
128                    // Dense vector dimension must be specified via config
129                    let config = field
130                        .dense_vector_config
131                        .as_ref()
132                        .expect("DenseVector field requires dimension to be specified");
133                    builder.add_dense_vector_field_with_config(
134                        &field.name,
135                        field.indexed,
136                        field.stored,
137                        config.clone(),
138                    )
139                }
140            };
141            if field.multi {
142                builder.set_multi(f, true);
143            }
144            // Set positions: explicit > auto (ordinal for multi vectors)
145            let positions = field.positions.or({
146                // Auto-set ordinal positions for multi-valued vector fields
147                if field.multi
148                    && matches!(
149                        field.field_type,
150                        FieldType::SparseVector | FieldType::DenseVector
151                    )
152                {
153                    Some(super::schema::PositionMode::Ordinal)
154                } else {
155                    None
156                }
157            });
158            if let Some(mode) = positions {
159                builder.set_positions(f, mode);
160            }
161        }
162
163        // Set default fields if specified
164        if !self.default_fields.is_empty() {
165            builder.set_default_fields(self.default_fields.clone());
166        }
167
168        // Set query routers if specified
169        if !self.query_routers.is_empty() {
170            builder.set_query_routers(self.query_routers.clone());
171        }
172
173        builder.build()
174    }
175
176    /// Create a QueryFieldRouter from the query router rules
177    ///
178    /// Returns None if there are no query router rules defined.
179    /// Returns Err if any regex pattern is invalid.
180    pub fn to_query_router(&self) -> Result<Option<super::query_field_router::QueryFieldRouter>> {
181        if self.query_routers.is_empty() {
182            return Ok(None);
183        }
184
185        super::query_field_router::QueryFieldRouter::from_rules(&self.query_routers)
186            .map(Some)
187            .map_err(Error::Schema)
188    }
189}
190
191/// Parse field type from string
192fn parse_field_type(type_str: &str) -> Result<FieldType> {
193    match type_str {
194        "text" | "string" | "str" => Ok(FieldType::Text),
195        "u64" | "uint" | "unsigned" => Ok(FieldType::U64),
196        "i64" | "int" | "integer" => Ok(FieldType::I64),
197        "f64" | "float" | "double" => Ok(FieldType::F64),
198        "bytes" | "binary" | "blob" => Ok(FieldType::Bytes),
199        "json" => Ok(FieldType::Json),
200        "sparse_vector" => Ok(FieldType::SparseVector),
201        "dense_vector" | "vector" => Ok(FieldType::DenseVector),
202        _ => Err(Error::Schema(format!("Unknown field type: {}", type_str))),
203    }
204}
205
206/// Index configuration parsed from indexed<...> attribute
207#[derive(Debug, Clone, Default)]
208struct IndexConfig {
209    index_type: Option<super::schema::VectorIndexType>,
210    num_clusters: Option<usize>,
211    nprobe: Option<usize>,
212    mrl_dim: Option<usize>,
213    build_threshold: Option<usize>,
214    // Sparse vector index params
215    quantization: Option<WeightQuantization>,
216    weight_threshold: Option<f32>,
217    block_size: Option<usize>,
218    posting_list_pruning: Option<f32>,
219    // Sparse vector query-time config
220    query_tokenizer: Option<String>,
221    query_weighting: Option<QueryWeighting>,
222    // Position tracking mode for phrase queries
223    positions: Option<super::schema::PositionMode>,
224}
225
226/// Parse attributes from pest pair
227/// Returns (indexed, stored, multi, index_config)
228/// positions is now inside index_config (via indexed<positions> or indexed<ordinal> etc.)
229/// multi is now inside stored<multi>
230fn parse_attributes(pair: pest::iterators::Pair<Rule>) -> (bool, bool, bool, Option<IndexConfig>) {
231    let mut indexed = false;
232    let mut stored = false;
233    let mut multi = false;
234    let mut index_config = None;
235
236    for attr in pair.into_inner() {
237        if attr.as_rule() == Rule::attribute {
238            // attribute = { indexed_with_config | "indexed" | stored_with_config | "stored" }
239            let mut found_config = false;
240            for inner in attr.clone().into_inner() {
241                match inner.as_rule() {
242                    Rule::indexed_with_config => {
243                        indexed = true;
244                        index_config = Some(parse_index_config(inner));
245                        found_config = true;
246                        break;
247                    }
248                    Rule::stored_with_config => {
249                        stored = true;
250                        multi = true; // stored<multi>
251                        found_config = true;
252                        break;
253                    }
254                    _ => {}
255                }
256            }
257            if !found_config {
258                // Simple attribute
259                match attr.as_str() {
260                    "indexed" => indexed = true,
261                    "stored" => stored = true,
262                    _ => {}
263                }
264            }
265        }
266    }
267
268    (indexed, stored, multi, index_config)
269}
270
271/// Parse index configuration from indexed<...> attribute
272fn parse_index_config(pair: pest::iterators::Pair<Rule>) -> IndexConfig {
273    let mut config = IndexConfig::default();
274
275    // indexed_with_config = { "indexed" ~ "<" ~ index_config_params ~ ">" }
276    // index_config_params = { index_config_param ~ ("," ~ index_config_param)* }
277    // index_config_param = { index_type_kwarg | centroids_kwarg | codebook_kwarg | nprobe_kwarg | index_type_spec }
278
279    for inner in pair.into_inner() {
280        if inner.as_rule() == Rule::index_config_params {
281            for param in inner.into_inner() {
282                if param.as_rule() == Rule::index_config_param {
283                    for p in param.into_inner() {
284                        parse_single_index_config_param(&mut config, p);
285                    }
286                }
287            }
288        }
289    }
290
291    config
292}
293
294/// Parse a single index config parameter
295fn parse_single_index_config_param(config: &mut IndexConfig, p: pest::iterators::Pair<Rule>) {
296    use super::schema::VectorIndexType;
297
298    match p.as_rule() {
299        Rule::index_type_spec => {
300            config.index_type = Some(match p.as_str() {
301                "flat" => VectorIndexType::Flat,
302                "rabitq" => VectorIndexType::RaBitQ,
303                "ivf_rabitq" => VectorIndexType::IvfRaBitQ,
304                "scann" => VectorIndexType::ScaNN,
305                _ => VectorIndexType::RaBitQ,
306            });
307        }
308        Rule::index_type_kwarg => {
309            // index_type_kwarg = { "index" ~ ":" ~ index_type_spec }
310            if let Some(t) = p.into_inner().next() {
311                config.index_type = Some(match t.as_str() {
312                    "flat" => VectorIndexType::Flat,
313                    "rabitq" => VectorIndexType::RaBitQ,
314                    "ivf_rabitq" => VectorIndexType::IvfRaBitQ,
315                    "scann" => VectorIndexType::ScaNN,
316                    _ => VectorIndexType::RaBitQ,
317                });
318            }
319        }
320        Rule::num_clusters_kwarg => {
321            // num_clusters_kwarg = { "num_clusters" ~ ":" ~ num_clusters_spec }
322            if let Some(n) = p.into_inner().next() {
323                config.num_clusters = Some(n.as_str().parse().unwrap_or(256));
324            }
325        }
326        Rule::build_threshold_kwarg => {
327            // build_threshold_kwarg = { "build_threshold" ~ ":" ~ build_threshold_spec }
328            if let Some(n) = p.into_inner().next() {
329                config.build_threshold = Some(n.as_str().parse().unwrap_or(10000));
330            }
331        }
332        Rule::nprobe_kwarg => {
333            // nprobe_kwarg = { "nprobe" ~ ":" ~ nprobe_spec }
334            if let Some(n) = p.into_inner().next() {
335                config.nprobe = Some(n.as_str().parse().unwrap_or(32));
336            }
337        }
338        Rule::mrl_dim_kwarg => {
339            // mrl_dim_kwarg = { "mrl_dim" ~ ":" ~ mrl_dim_spec }
340            if let Some(n) = p.into_inner().next() {
341                config.mrl_dim = Some(n.as_str().parse().unwrap_or(0));
342            }
343        }
344        Rule::quantization_kwarg => {
345            // quantization_kwarg = { "quantization" ~ ":" ~ quantization_spec }
346            if let Some(q) = p.into_inner().next() {
347                config.quantization = Some(match q.as_str() {
348                    "float32" | "f32" => WeightQuantization::Float32,
349                    "float16" | "f16" => WeightQuantization::Float16,
350                    "uint8" | "u8" => WeightQuantization::UInt8,
351                    "uint4" | "u4" => WeightQuantization::UInt4,
352                    _ => WeightQuantization::default(),
353                });
354            }
355        }
356        Rule::weight_threshold_kwarg => {
357            // weight_threshold_kwarg = { "weight_threshold" ~ ":" ~ weight_threshold_spec }
358            if let Some(t) = p.into_inner().next() {
359                config.weight_threshold = Some(t.as_str().parse().unwrap_or(0.0));
360            }
361        }
362        Rule::block_size_kwarg => {
363            // block_size_kwarg = { "block_size" ~ ":" ~ block_size_spec }
364            if let Some(n) = p.into_inner().next() {
365                config.block_size = Some(n.as_str().parse().unwrap_or(128));
366            }
367        }
368        Rule::pruning_kwarg => {
369            // pruning_kwarg = { "pruning" ~ ":" ~ pruning_spec }
370            if let Some(f) = p.into_inner().next() {
371                config.posting_list_pruning = Some(f.as_str().parse().unwrap_or(1.0));
372            }
373        }
374        Rule::query_config_block => {
375            // query_config_block = { "query" ~ "<" ~ query_config_params ~ ">" }
376            parse_query_config_block(config, p);
377        }
378        Rule::positions_kwarg => {
379            // positions_kwarg = { "positions" | "ordinal" | "token_position" }
380            use super::schema::PositionMode;
381            config.positions = Some(match p.as_str() {
382                "ordinal" => PositionMode::Ordinal,
383                "token_position" => PositionMode::TokenPosition,
384                _ => PositionMode::Full, // "positions" or any other value defaults to Full
385            });
386        }
387        _ => {}
388    }
389}
390
391/// Parse query configuration block: query<tokenizer: "...", weighting: idf>
392fn parse_query_config_block(config: &mut IndexConfig, pair: pest::iterators::Pair<Rule>) {
393    for inner in pair.into_inner() {
394        if inner.as_rule() == Rule::query_config_params {
395            for param in inner.into_inner() {
396                if param.as_rule() == Rule::query_config_param {
397                    for p in param.into_inner() {
398                        match p.as_rule() {
399                            Rule::query_tokenizer_kwarg => {
400                                // query_tokenizer_kwarg = { "tokenizer" ~ ":" ~ tokenizer_path }
401                                if let Some(path) = p.into_inner().next()
402                                    && let Some(inner_path) = path.into_inner().next()
403                                {
404                                    config.query_tokenizer = Some(inner_path.as_str().to_string());
405                                }
406                            }
407                            Rule::query_weighting_kwarg => {
408                                // query_weighting_kwarg = { "weighting" ~ ":" ~ weighting_spec }
409                                if let Some(w) = p.into_inner().next() {
410                                    config.query_weighting = Some(match w.as_str() {
411                                        "one" => QueryWeighting::One,
412                                        "idf" => QueryWeighting::Idf,
413                                        "idf_file" => QueryWeighting::IdfFile,
414                                        _ => QueryWeighting::One,
415                                    });
416                                }
417                            }
418                            _ => {}
419                        }
420                    }
421                }
422            }
423        }
424    }
425}
426
427/// Parse a field definition from pest pair
428fn parse_field_def(pair: pest::iterators::Pair<Rule>) -> Result<FieldDef> {
429    let mut inner = pair.into_inner();
430
431    let name = inner
432        .next()
433        .ok_or_else(|| Error::Schema("Missing field name".to_string()))?
434        .as_str()
435        .to_string();
436
437    let field_type_str = inner
438        .next()
439        .ok_or_else(|| Error::Schema("Missing field type".to_string()))?
440        .as_str();
441
442    let field_type = parse_field_type(field_type_str)?;
443
444    // Parse optional tokenizer spec, sparse_vector_config, dense_vector_config, and attributes
445    let mut tokenizer = None;
446    let mut sparse_vector_config = None;
447    let mut dense_vector_config = None;
448    let mut indexed = true;
449    let mut stored = true;
450    let mut multi = false;
451    let mut index_config: Option<IndexConfig> = None;
452
453    for item in inner {
454        match item.as_rule() {
455            Rule::tokenizer_spec => {
456                // Extract tokenizer name from <name>
457                if let Some(tok_name) = item.into_inner().next() {
458                    tokenizer = Some(tok_name.as_str().to_string());
459                }
460            }
461            Rule::sparse_vector_config => {
462                // Parse named parameters: <index_size: u16, quantization: uint8, weight_threshold: 0.1>
463                sparse_vector_config = Some(parse_sparse_vector_config(item));
464            }
465            Rule::dense_vector_config => {
466                // Parse dense_vector_params (keyword or positional) - only dims and mrl_dim
467                dense_vector_config = Some(parse_dense_vector_config(item));
468            }
469            Rule::attributes => {
470                let (idx, sto, mul, idx_cfg) = parse_attributes(item);
471                indexed = idx;
472                stored = sto;
473                multi = mul;
474                index_config = idx_cfg;
475            }
476            _ => {}
477        }
478    }
479
480    // Merge index config into vector configs if both exist
481    let mut positions = None;
482    if let Some(idx_cfg) = index_config {
483        positions = idx_cfg.positions;
484        if let Some(ref mut dv_config) = dense_vector_config {
485            apply_index_config_to_dense_vector(dv_config, idx_cfg);
486        } else if field_type == FieldType::SparseVector {
487            // For sparse vectors, create default config if not present and apply index params
488            let sv_config = sparse_vector_config.get_or_insert(SparseVectorConfig::default());
489            apply_index_config_to_sparse_vector(sv_config, idx_cfg);
490        }
491    }
492
493    Ok(FieldDef {
494        name,
495        field_type,
496        indexed,
497        stored,
498        tokenizer,
499        multi,
500        positions,
501        sparse_vector_config,
502        dense_vector_config,
503    })
504}
505
506/// Apply index configuration from indexed<...> to DenseVectorConfig
507fn apply_index_config_to_dense_vector(config: &mut DenseVectorConfig, idx_cfg: IndexConfig) {
508    // Apply index type if specified
509    if let Some(index_type) = idx_cfg.index_type {
510        config.index_type = index_type;
511    }
512
513    // Apply num_clusters for IVF-based indexes
514    if idx_cfg.num_clusters.is_some() {
515        config.num_clusters = idx_cfg.num_clusters;
516    }
517
518    // Apply nprobe if specified
519    if let Some(nprobe) = idx_cfg.nprobe {
520        config.nprobe = nprobe;
521    }
522
523    // Apply mrl_dim if specified
524    if idx_cfg.mrl_dim.is_some() {
525        config.mrl_dim = idx_cfg.mrl_dim;
526    }
527
528    // Apply build_threshold if specified
529    if idx_cfg.build_threshold.is_some() {
530        config.build_threshold = idx_cfg.build_threshold;
531    }
532}
533
534/// Parse sparse_vector_config - only index_size (positional)
535/// Example: <u16> or <u32>
536fn parse_sparse_vector_config(pair: pest::iterators::Pair<Rule>) -> SparseVectorConfig {
537    let mut index_size = IndexSize::default();
538
539    // Parse positional index_size_spec
540    for inner in pair.into_inner() {
541        if inner.as_rule() == Rule::index_size_spec {
542            index_size = match inner.as_str() {
543                "u16" => IndexSize::U16,
544                "u32" => IndexSize::U32,
545                _ => IndexSize::default(),
546            };
547        }
548    }
549
550    SparseVectorConfig {
551        index_size,
552        weight_quantization: WeightQuantization::default(),
553        weight_threshold: 0.0,
554        block_size: 128,
555        posting_list_pruning: None,
556        query_config: None,
557    }
558}
559
560/// Apply index configuration from indexed<...> to SparseVectorConfig
561fn apply_index_config_to_sparse_vector(config: &mut SparseVectorConfig, idx_cfg: IndexConfig) {
562    if let Some(q) = idx_cfg.quantization {
563        config.weight_quantization = q;
564    }
565    if let Some(t) = idx_cfg.weight_threshold {
566        config.weight_threshold = t;
567    }
568    if let Some(bs) = idx_cfg.block_size {
569        let adjusted = bs.next_power_of_two();
570        if adjusted != bs {
571            log::warn!(
572                "block_size {} adjusted to next power of two: {}",
573                bs,
574                adjusted
575            );
576        }
577        config.block_size = adjusted;
578    }
579    if let Some(p) = idx_cfg.posting_list_pruning {
580        let clamped = p.clamp(0.0, 1.0);
581        if (clamped - p).abs() > f32::EPSILON {
582            log::warn!(
583                "pruning {} clamped to valid range [0.0, 1.0]: {}",
584                p,
585                clamped
586            );
587        }
588        config.posting_list_pruning = Some(clamped);
589    }
590    // Apply query-time configuration if present
591    if idx_cfg.query_tokenizer.is_some() || idx_cfg.query_weighting.is_some() {
592        let query_config = config
593            .query_config
594            .get_or_insert(SparseQueryConfig::default());
595        if let Some(tokenizer) = idx_cfg.query_tokenizer {
596            query_config.tokenizer = Some(tokenizer);
597        }
598        if let Some(weighting) = idx_cfg.query_weighting {
599            query_config.weighting = weighting;
600        }
601    }
602}
603
604/// Parse dense_vector_config - only dims
605/// All index-related params (including mrl_dim) are now in indexed<...> attribute
606fn parse_dense_vector_config(pair: pest::iterators::Pair<Rule>) -> DenseVectorConfig {
607    let mut dim: usize = 0;
608
609    // Navigate to dense_vector_params
610    for params in pair.into_inner() {
611        if params.as_rule() == Rule::dense_vector_params {
612            for inner in params.into_inner() {
613                match inner.as_rule() {
614                    Rule::dense_vector_keyword_params => {
615                        // Parse keyword args: dims: N
616                        for kwarg in inner.into_inner() {
617                            if kwarg.as_rule() == Rule::dims_kwarg
618                                && let Some(d) = kwarg.into_inner().next()
619                            {
620                                dim = d.as_str().parse().unwrap_or(0);
621                            }
622                        }
623                    }
624                    Rule::dense_vector_positional_params => {
625                        // Parse positional: just dimension
626                        if let Some(dim_pair) = inner.into_inner().next() {
627                            dim = dim_pair.as_str().parse().unwrap_or(0);
628                        }
629                    }
630                    _ => {}
631                }
632            }
633        }
634    }
635
636    DenseVectorConfig::new(dim)
637}
638
639/// Parse default_fields definition
640fn parse_default_fields_def(pair: pest::iterators::Pair<Rule>) -> Vec<String> {
641    pair.into_inner().map(|p| p.as_str().to_string()).collect()
642}
643
644/// Parse a query router definition
645fn parse_query_router_def(pair: pest::iterators::Pair<Rule>) -> Result<QueryRouterRule> {
646    let mut pattern = String::new();
647    let mut substitution = String::new();
648    let mut target_field = String::new();
649    let mut mode = RoutingMode::Additional;
650
651    for prop in pair.into_inner() {
652        if prop.as_rule() != Rule::query_router_prop {
653            continue;
654        }
655
656        for inner in prop.into_inner() {
657            match inner.as_rule() {
658                Rule::query_router_pattern => {
659                    if let Some(regex_str) = inner.into_inner().next() {
660                        pattern = parse_string_value(regex_str);
661                    }
662                }
663                Rule::query_router_substitution => {
664                    if let Some(quoted) = inner.into_inner().next() {
665                        substitution = parse_string_value(quoted);
666                    }
667                }
668                Rule::query_router_target => {
669                    if let Some(ident) = inner.into_inner().next() {
670                        target_field = ident.as_str().to_string();
671                    }
672                }
673                Rule::query_router_mode => {
674                    if let Some(mode_val) = inner.into_inner().next() {
675                        mode = match mode_val.as_str() {
676                            "exclusive" => RoutingMode::Exclusive,
677                            "additional" => RoutingMode::Additional,
678                            _ => RoutingMode::Additional,
679                        };
680                    }
681                }
682                _ => {}
683            }
684        }
685    }
686
687    if pattern.is_empty() {
688        return Err(Error::Schema("query_router missing 'pattern'".to_string()));
689    }
690    if substitution.is_empty() {
691        return Err(Error::Schema(
692            "query_router missing 'substitution'".to_string(),
693        ));
694    }
695    if target_field.is_empty() {
696        return Err(Error::Schema(
697            "query_router missing 'target_field'".to_string(),
698        ));
699    }
700
701    Ok(QueryRouterRule {
702        pattern,
703        substitution,
704        target_field,
705        mode,
706    })
707}
708
709/// Parse a string value from quoted_string, raw_string, or regex_string
710fn parse_string_value(pair: pest::iterators::Pair<Rule>) -> String {
711    let s = pair.as_str();
712    match pair.as_rule() {
713        Rule::regex_string => {
714            // regex_string contains either raw_string or quoted_string
715            if let Some(inner) = pair.into_inner().next() {
716                parse_string_value(inner)
717            } else {
718                s.to_string()
719            }
720        }
721        Rule::raw_string => {
722            // r"..." - strip r" prefix and " suffix
723            s[2..s.len() - 1].to_string()
724        }
725        Rule::quoted_string => {
726            // "..." - strip quotes and handle escapes
727            let inner = &s[1..s.len() - 1];
728            // Simple escape handling
729            inner
730                .replace("\\n", "\n")
731                .replace("\\t", "\t")
732                .replace("\\\"", "\"")
733                .replace("\\\\", "\\")
734        }
735        _ => s.to_string(),
736    }
737}
738
739/// Parse an index definition from pest pair
740fn parse_index_def(pair: pest::iterators::Pair<Rule>) -> Result<IndexDef> {
741    let mut inner = pair.into_inner();
742
743    let name = inner
744        .next()
745        .ok_or_else(|| Error::Schema("Missing index name".to_string()))?
746        .as_str()
747        .to_string();
748
749    let mut fields = Vec::new();
750    let mut default_fields = Vec::new();
751    let mut query_routers = Vec::new();
752
753    for item in inner {
754        match item.as_rule() {
755            Rule::field_def => {
756                fields.push(parse_field_def(item)?);
757            }
758            Rule::default_fields_def => {
759                default_fields = parse_default_fields_def(item);
760            }
761            Rule::query_router_def => {
762                query_routers.push(parse_query_router_def(item)?);
763            }
764            _ => {}
765        }
766    }
767
768    Ok(IndexDef {
769        name,
770        fields,
771        default_fields,
772        query_routers,
773    })
774}
775
776/// Parse SDL from a string
777pub fn parse_sdl(input: &str) -> Result<Vec<IndexDef>> {
778    let pairs = SdlParser::parse(Rule::file, input)
779        .map_err(|e| Error::Schema(format!("Parse error: {}", e)))?;
780
781    let mut indexes = Vec::new();
782
783    for pair in pairs {
784        if pair.as_rule() == Rule::file {
785            for inner in pair.into_inner() {
786                if inner.as_rule() == Rule::index_def {
787                    indexes.push(parse_index_def(inner)?);
788                }
789            }
790        }
791    }
792
793    Ok(indexes)
794}
795
796/// Parse SDL and return a single index definition
797pub fn parse_single_index(input: &str) -> Result<IndexDef> {
798    let indexes = parse_sdl(input)?;
799
800    if indexes.is_empty() {
801        return Err(Error::Schema("No index definition found".to_string()));
802    }
803
804    if indexes.len() > 1 {
805        return Err(Error::Schema(
806            "Multiple index definitions found, expected one".to_string(),
807        ));
808    }
809
810    Ok(indexes.into_iter().next().unwrap())
811}
812
#[cfg(test)]
mod tests {
    use super::*;
    use crate::dsl::schema::VectorIndexType;
    use crate::structures::QueryWeighting;

    /// Parses `source` and returns every index definition, panicking on a parse error.
    fn parse_all(source: &str) -> Vec<IndexDef> {
        parse_sdl(source).unwrap()
    }

    /// Parses `source` and returns its first index definition.
    fn first_index(source: &str) -> IndexDef {
        parse_all(source)
            .into_iter()
            .next()
            .expect("SDL should define at least one index")
    }

    /// Two text fields with explicit attribute lists.
    #[test]
    fn test_parse_simple_schema() {
        let source = r#"
            index articles {
                field title: text [indexed, stored]
                field body: text [indexed]
            }
        "#;

        let parsed = parse_all(source);
        assert_eq!(parsed.len(), 1);

        let articles = &parsed[0];
        assert_eq!(articles.name, "articles");
        assert_eq!(articles.fields.len(), 2);

        let title = &articles.fields[0];
        assert_eq!(title.name, "title");
        assert!(matches!(title.field_type, FieldType::Text));
        assert!(title.indexed);
        assert!(title.stored);

        let body = &articles.fields[1];
        assert_eq!(body.name, "body");
        assert!(matches!(body.field_type, FieldType::Text));
        assert!(body.indexed);
        assert!(!body.stored);
    }

    /// Each canonical scalar type name maps to its `FieldType` variant.
    #[test]
    fn test_parse_all_field_types() {
        let source = r#"
            index test {
                field text_field: text [indexed, stored]
                field u64_field: u64 [indexed, stored]
                field i64_field: i64 [indexed, stored]
                field f64_field: f64 [indexed, stored]
                field bytes_field: bytes [stored]
            }
        "#;

        let index = first_index(source);
        let expected_types = [
            FieldType::Text,
            FieldType::U64,
            FieldType::I64,
            FieldType::F64,
            FieldType::Bytes,
        ];

        for (position, expected) in expected_types.iter().enumerate() {
            assert_eq!(&index.fields[position].field_type, expected);
        }
    }

    /// `#` comments around and after declarations do not affect the parsed fields.
    #[test]
    fn test_parse_with_comments() {
        let source = r#"
            # This is a comment
            index articles {
                # Title field
                field title: text [indexed, stored]
                field body: text [indexed] # inline comment not supported yet
            }
        "#;

        let index = first_index(source);
        assert_eq!(index.fields.len(), 2);
    }

    /// Alias type names resolve to the same variants as the canonical names.
    #[test]
    fn test_parse_type_aliases() {
        let source = r#"
            index test {
                field a: string [indexed]
                field b: int [indexed]
                field c: uint [indexed]
                field d: float [indexed]
                field e: binary [stored]
            }
        "#;

        let index = first_index(source);
        let expected_types = [
            FieldType::Text,
            FieldType::I64,
            FieldType::U64,
            FieldType::F64,
            FieldType::Bytes,
        ];

        for (position, expected) in expected_types.iter().enumerate() {
            assert_eq!(&index.fields[position].field_type, expected);
        }
    }

    /// Declared fields are resolvable by name on the converted schema.
    #[test]
    fn test_to_schema() {
        let source = r#"
            index articles {
                field title: text [indexed, stored]
                field views: u64 [indexed, stored]
            }
        "#;

        let schema = first_index(source).to_schema();

        assert!(schema.get_field("title").is_some());
        assert!(schema.get_field("views").is_some());
        assert!(schema.get_field("nonexistent").is_none());
    }

    /// A field without an attribute list is both indexed and stored.
    #[test]
    fn test_default_attributes() {
        let source = r#"
            index test {
                field title: text
            }
        "#;

        let index = first_index(source);
        let title = &index.fields[0];

        assert!(title.indexed);
        assert!(title.stored);
    }

    /// Several `index` blocks in one document are all returned, in order.
    #[test]
    fn test_multiple_indexes() {
        let source = r#"
            index articles {
                field title: text [indexed, stored]
            }

            index users {
                field name: text [indexed, stored]
                field email: text [indexed, stored]
            }
        "#;

        let parsed = parse_all(source);
        assert_eq!(parsed.len(), 2);
        assert_eq!(parsed[0].name, "articles");
        assert_eq!(parsed[1].name, "users");
    }

    /// `text<name>` records the tokenizer; plain `text` leaves it unset.
    #[test]
    fn test_tokenizer_spec() {
        let source = r#"
            index articles {
                field title: text<en_stem> [indexed, stored]
                field body: text<default> [indexed]
                field author: text [indexed, stored]
            }
        "#;

        let index = first_index(source);

        let title = &index.fields[0];
        assert_eq!(title.name, "title");
        assert_eq!(title.tokenizer.as_deref(), Some("en_stem"));

        let body = &index.fields[1];
        assert_eq!(body.name, "body");
        assert_eq!(body.tokenizer.as_deref(), Some("default"));

        let author = &index.fields[2];
        assert_eq!(author.name, "author");
        assert!(author.tokenizer.is_none());
    }

    /// The tokenizer name is carried over into the schema's field entries.
    #[test]
    fn test_tokenizer_in_schema() {
        let source = r#"
            index articles {
                field title: text<german> [indexed, stored]
                field body: text<en_stem> [indexed]
            }
        "#;

        let schema = first_index(source).to_schema();

        for (field_name, expected_tokenizer) in [("title", "german"), ("body", "en_stem")] {
            let field = schema.get_field(field_name).unwrap();
            let entry = schema.get_field_entry(field).unwrap();
            assert_eq!(entry.tokenizer.as_deref(), Some(expected_tokenizer));
        }
    }

    /// A router with an escaped (non-raw) pattern string and an explicit mode.
    #[test]
    fn test_query_router_basic() {
        let source = r#"
            index documents {
                field title: text [indexed, stored]
                field uri: text [indexed, stored]

                query_router {
                    pattern: "10\\.\\d{4,}/[^\\s]+"
                    substitution: "doi://{0}"
                    target_field: uris
                    mode: exclusive
                }
            }
        "#;

        let index = first_index(source);
        assert_eq!(index.query_routers.len(), 1);

        // The doubled backslashes in the SDL collapse to single ones.
        let rule = &index.query_routers[0];
        assert_eq!(rule.pattern, r"10\.\d{4,}/[^\s]+");
        assert_eq!(rule.substitution, "doi://{0}");
        assert_eq!(rule.target_field, "uris");
        assert_eq!(rule.mode, RoutingMode::Exclusive);
    }

    /// A router whose pattern uses the `r"..."` raw-string form.
    #[test]
    fn test_query_router_raw_string() {
        let source = r#"
            index documents {
                field uris: text [indexed, stored]

                query_router {
                    pattern: r"^pmid:(\d+)$"
                    substitution: "pubmed://{1}"
                    target_field: uris
                    mode: additional
                }
            }
        "#;

        let index = first_index(source);
        let rule = &index.query_routers[0];

        assert_eq!(rule.pattern, r"^pmid:(\d+)$");
        assert_eq!(rule.substitution, "pubmed://{1}");
        assert_eq!(rule.mode, RoutingMode::Additional);
    }

    /// Every `query_router` block in an index is collected.
    #[test]
    fn test_multiple_query_routers() {
        let source = r#"
            index documents {
                field uris: text [indexed, stored]

                query_router {
                    pattern: r"^doi:(10\.\d{4,}/[^\s]+)$"
                    substitution: "doi://{1}"
                    target_field: uris
                    mode: exclusive
                }

                query_router {
                    pattern: r"^pmid:(\d+)$"
                    substitution: "pubmed://{1}"
                    target_field: uris
                    mode: exclusive
                }

                query_router {
                    pattern: r"^arxiv:(\d+\.\d+)$"
                    substitution: "arxiv://{1}"
                    target_field: uris
                    mode: additional
                }
            }
        "#;

        let index = first_index(source);
        assert_eq!(index.query_routers.len(), 3);
    }

    /// Omitting `mode` yields `RoutingMode::Additional`.
    #[test]
    fn test_query_router_default_mode() {
        let source = r#"
            index documents {
                field uris: text [indexed, stored]

                query_router {
                    pattern: r"test"
                    substitution: "{0}"
                    target_field: uris
                }
            }
        "#;

        let index = first_index(source);
        assert_eq!(index.query_routers[0].mode, RoutingMode::Additional);
    }

    /// `stored<multi>` sets the multi flag, both on the parsed field and in the schema.
    #[test]
    fn test_multi_attribute() {
        let source = r#"
            index documents {
                field uris: text [indexed, stored<multi>]
                field title: text [indexed, stored]
            }
        "#;

        let parsed = parse_all(source);
        assert_eq!(parsed.len(), 1);

        let documents = &parsed[0];
        assert_eq!(documents.fields.len(), 2);

        let uris = &documents.fields[0];
        assert_eq!(uris.name, "uris");
        assert!(uris.multi, "uris field should have multi=true");

        let title = &documents.fields[1];
        assert_eq!(title.name, "title");
        assert!(!title.multi, "title field should have multi=false");

        // The flag must survive conversion to a schema.
        let schema = documents.to_schema();
        let uris_field = schema.get_field("uris").unwrap();
        let title_field = schema.get_field("title").unwrap();

        assert!(schema.get_field_entry(uris_field).unwrap().multi);
        assert!(!schema.get_field_entry(title_field).unwrap().multi);
    }

    /// A bare `sparse_vector` field carries no sparse-vector config.
    #[test]
    fn test_sparse_vector_field() {
        let source = r#"
            index documents {
                field embedding: sparse_vector [indexed, stored]
            }
        "#;

        let parsed = parse_all(source);
        assert_eq!(parsed.len(), 1);
        assert_eq!(parsed[0].fields.len(), 1);

        let embedding = &parsed[0].fields[0];
        assert_eq!(embedding.name, "embedding");
        assert_eq!(embedding.field_type, FieldType::SparseVector);
        assert!(embedding.sparse_vector_config.is_none());
    }

    /// Index size and weight quantization are read from the type and `indexed<...>`.
    #[test]
    fn test_sparse_vector_with_config() {
        let source = r#"
            index documents {
                field embedding: sparse_vector<u16> [indexed<quantization: uint8>, stored]
                field dense: sparse_vector<u32> [indexed<quantization: float32>]
            }
        "#;

        let index = first_index(source);
        assert_eq!(index.fields.len(), 2);

        // u16 indices with uint8 weights.
        let first = &index.fields[0];
        assert_eq!(first.name, "embedding");
        let first_config = first.sparse_vector_config.as_ref().unwrap();
        assert_eq!(first_config.index_size, IndexSize::U16);
        assert_eq!(first_config.weight_quantization, WeightQuantization::UInt8);

        // u32 indices with float32 weights.
        let second = &index.fields[1];
        assert_eq!(second.name, "dense");
        let second_config = second.sparse_vector_config.as_ref().unwrap();
        assert_eq!(second_config.index_size, IndexSize::U32);
        assert_eq!(
            second_config.weight_quantization,
            WeightQuantization::Float32
        );
    }

    /// `weight_threshold` is parsed alongside the quantization setting.
    #[test]
    fn test_sparse_vector_with_weight_threshold() {
        let source = r#"
            index documents {
                field embedding: sparse_vector<u16> [indexed<quantization: uint8, weight_threshold: 0.1>, stored]
                field embedding2: sparse_vector<u32> [indexed<quantization: float16, weight_threshold: 0.05>]
            }
        "#;

        let index = first_index(source);
        assert_eq!(index.fields.len(), 2);

        // u16 indices, uint8 weights, threshold 0.1.
        let first = &index.fields[0];
        assert_eq!(first.name, "embedding");
        let first_config = first.sparse_vector_config.as_ref().unwrap();
        assert_eq!(first_config.index_size, IndexSize::U16);
        assert_eq!(first_config.weight_quantization, WeightQuantization::UInt8);
        assert!((first_config.weight_threshold - 0.1).abs() < 0.001);

        // u32 indices, float16 weights, threshold 0.05.
        let second = &index.fields[1];
        assert_eq!(second.name, "embedding2");
        let second_config = second.sparse_vector_config.as_ref().unwrap();
        assert_eq!(second_config.index_size, IndexSize::U32);
        assert_eq!(
            second_config.weight_quantization,
            WeightQuantization::Float16
        );
        assert!((second_config.weight_threshold - 0.05).abs() < 0.001);
    }

    /// `pruning` populates `posting_list_pruning`.
    #[test]
    fn test_sparse_vector_with_pruning() {
        let source = r#"
            index documents {
                field embedding: sparse_vector [indexed<quantization: uint8, pruning: 0.1>, stored]
            }
        "#;

        let index = first_index(source);
        let embedding = &index.fields[0];
        assert_eq!(embedding.name, "embedding");

        let config = embedding.sparse_vector_config.as_ref().unwrap();
        assert_eq!(config.weight_quantization, WeightQuantization::UInt8);
        assert_eq!(config.posting_list_pruning, Some(0.1));
    }

    /// `dense_vector<N>` produces a dense-vector field of dimension `N`.
    #[test]
    fn test_dense_vector_field() {
        let source = r#"
            index documents {
                field embedding: dense_vector<768> [indexed, stored]
            }
        "#;

        let parsed = parse_all(source);
        assert_eq!(parsed.len(), 1);
        assert_eq!(parsed[0].fields.len(), 1);

        let embedding = &parsed[0].fields[0];
        assert_eq!(embedding.name, "embedding");
        assert_eq!(embedding.field_type, FieldType::DenseVector);

        let config = embedding.dense_vector_config.as_ref().unwrap();
        assert_eq!(config.dim, 768);
    }

    /// `vector<N>` is accepted as an alias for `dense_vector<N>`.
    #[test]
    fn test_dense_vector_alias() {
        let source = r#"
            index documents {
                field embedding: vector<1536> [indexed]
            }
        "#;

        let index = first_index(source);
        let embedding = &index.fields[0];
        assert_eq!(embedding.field_type, FieldType::DenseVector);

        let config = embedding.dense_vector_config.as_ref().unwrap();
        assert_eq!(config.dim, 1536);
    }

    /// `num_clusters` is read; `nprobe` falls back to 32 when omitted.
    #[test]
    fn test_dense_vector_with_num_clusters() {
        let source = r#"
            index documents {
                field embedding: dense_vector<768> [indexed<ivf_rabitq, num_clusters: 256>, stored]
            }
        "#;

        let parsed = parse_all(source);
        assert_eq!(parsed.len(), 1);

        let embedding = &parsed[0].fields[0];
        assert_eq!(embedding.name, "embedding");
        assert_eq!(embedding.field_type, FieldType::DenseVector);

        let config = embedding.dense_vector_config.as_ref().unwrap();
        assert_eq!(config.dim, 768);
        assert_eq!(config.num_clusters, Some(256));
        assert_eq!(config.nprobe, 32);
    }

    /// Both `num_clusters` and `nprobe` are read when given explicitly.
    #[test]
    fn test_dense_vector_with_num_clusters_and_nprobe() {
        let source = r#"
            index documents {
                field embedding: dense_vector<1536> [indexed<ivf_rabitq, num_clusters: 512, nprobe: 64>]
            }
        "#;

        let index = first_index(source);
        let config = index.fields[0].dense_vector_config.as_ref().unwrap();

        assert_eq!(config.dim, 1536);
        assert_eq!(config.num_clusters, Some(512));
        assert_eq!(config.nprobe, 64);
    }

    /// The keyword form `dense_vector<dims: N>` sets the dimension.
    #[test]
    fn test_dense_vector_keyword_syntax() {
        let source = r#"
            index documents {
                field embedding: dense_vector<dims: 1536> [indexed, stored]
            }
        "#;

        let index = first_index(source);
        let config = index.fields[0].dense_vector_config.as_ref().unwrap();

        assert_eq!(config.dim, 1536);
        assert!(config.num_clusters.is_none());
    }

    /// Keyword dimension combined with a full set of index options.
    #[test]
    fn test_dense_vector_keyword_syntax_full() {
        let source = r#"
            index documents {
                field embedding: dense_vector<dims: 1536> [indexed<ivf_rabitq, num_clusters: 256, nprobe: 64>]
            }
        "#;

        let index = first_index(source);
        let config = index.fields[0].dense_vector_config.as_ref().unwrap();

        assert_eq!(config.dim, 1536);
        assert_eq!(config.num_clusters, Some(256));
        assert_eq!(config.nprobe, 64);
    }

    /// Keyword dimension with only `num_clusters`; `nprobe` keeps its default of 32.
    #[test]
    fn test_dense_vector_keyword_syntax_partial() {
        let source = r#"
            index documents {
                field embedding: dense_vector<dims: 768> [indexed<ivf_rabitq, num_clusters: 128>]
            }
        "#;

        let index = first_index(source);
        let config = index.fields[0].dense_vector_config.as_ref().unwrap();

        assert_eq!(config.dim, 768);
        assert_eq!(config.num_clusters, Some(128));
        assert_eq!(config.nprobe, 32);
    }

    /// `indexed<scann, ...>` selects `VectorIndexType::ScaNN`.
    #[test]
    fn test_dense_vector_scann_index() {
        let source = r#"
            index documents {
                field embedding: dense_vector<dims: 768> [indexed<scann, num_clusters: 256, nprobe: 64>]
            }
        "#;

        let index = first_index(source);
        let config = index.fields[0].dense_vector_config.as_ref().unwrap();

        assert_eq!(config.dim, 768);
        assert_eq!(config.index_type, VectorIndexType::ScaNN);
        assert_eq!(config.num_clusters, Some(256));
        assert_eq!(config.nprobe, 64);
    }

    /// `indexed<ivf_rabitq, ...>` selects `VectorIndexType::IvfRaBitQ`.
    #[test]
    fn test_dense_vector_ivf_rabitq_index() {
        let source = r#"
            index documents {
                field embedding: dense_vector<dims: 1536> [indexed<ivf_rabitq, num_clusters: 512>]
            }
        "#;

        let index = first_index(source);
        let config = index.fields[0].dense_vector_config.as_ref().unwrap();

        assert_eq!(config.dim, 1536);
        assert_eq!(config.index_type, VectorIndexType::IvfRaBitQ);
        assert_eq!(config.num_clusters, Some(512));
    }

    /// `indexed<rabitq>` selects `VectorIndexType::RaBitQ` and leaves clusters unset.
    #[test]
    fn test_dense_vector_rabitq_no_clusters() {
        let source = r#"
            index documents {
                field embedding: dense_vector<dims: 768> [indexed<rabitq>]
            }
        "#;

        let index = first_index(source);
        let config = index.fields[0].dense_vector_config.as_ref().unwrap();

        assert_eq!(config.dim, 768);
        assert_eq!(config.index_type, VectorIndexType::RaBitQ);
        assert!(config.num_clusters.is_none());
    }

    /// `indexed<flat>` selects `VectorIndexType::Flat`.
    #[test]
    fn test_dense_vector_flat_index() {
        let source = r#"
            index documents {
                field embedding: dense_vector<dims: 768> [indexed<flat>]
            }
        "#;

        let index = first_index(source);
        let config = index.fields[0].dense_vector_config.as_ref().unwrap();

        assert_eq!(config.dim, 768);
        assert_eq!(config.index_type, VectorIndexType::Flat);
    }

    /// Plain `indexed` without an index type falls back to `VectorIndexType::RaBitQ`.
    #[test]
    fn test_dense_vector_default_index_type() {
        let source = r#"
            index documents {
                field embedding: dense_vector<dims: 768> [indexed]
            }
        "#;

        let index = first_index(source);
        let config = index.fields[0].dense_vector_config.as_ref().unwrap();

        assert_eq!(config.dim, 768);
        assert_eq!(config.index_type, VectorIndexType::RaBitQ);
    }

    /// `mrl_dim` inside `indexed<...>` sets the Matryoshka dimension used by `index_dim()`.
    #[test]
    fn test_dense_vector_mrl_dim() {
        let source = r#"
            index documents {
                field embedding: dense_vector<1536> [indexed<mrl_dim: 256>]
            }
        "#;

        let index = first_index(source);
        let config = index.fields[0].dense_vector_config.as_ref().unwrap();

        assert_eq!(config.dim, 1536);
        assert_eq!(config.mrl_dim, Some(256));
        assert_eq!(config.index_dim(), 256);
    }

    /// `mrl_dim` can be combined with the other index options.
    #[test]
    fn test_dense_vector_mrl_dim_with_num_clusters() {
        let source = r#"
            index documents {
                field embedding: dense_vector<768> [indexed<ivf_rabitq, num_clusters: 256, nprobe: 64, mrl_dim: 128>]
            }
        "#;

        let index = first_index(source);
        let config = index.fields[0].dense_vector_config.as_ref().unwrap();

        assert_eq!(config.dim, 768);
        assert_eq!(config.mrl_dim, Some(128));
        assert_eq!(config.index_dim(), 128);
        assert_eq!(config.num_clusters, Some(256));
        assert_eq!(config.nprobe, 64);
    }

    /// Without `mrl_dim`, `index_dim()` equals the full vector dimension.
    #[test]
    fn test_dense_vector_no_mrl_dim() {
        let source = r#"
            index documents {
                field embedding: dense_vector<dims: 768> [indexed]
            }
        "#;

        let index = first_index(source);
        let config = index.fields[0].dense_vector_config.as_ref().unwrap();

        assert_eq!(config.dim, 768);
        assert_eq!(config.mrl_dim, None);
        assert_eq!(config.index_dim(), 768);
    }

    /// `json` fields parse with or without attributes and are never indexed in the schema.
    #[test]
    fn test_json_field_type() {
        let source = r#"
            index documents {
                field title: text [indexed, stored]
                field metadata: json [stored]
                field extra: json
            }
        "#;

        let index = first_index(source);
        assert_eq!(index.fields.len(), 3);

        // Explicit `[stored]` JSON field.
        let metadata = &index.fields[1];
        assert_eq!(metadata.name, "metadata");
        assert!(matches!(metadata.field_type, FieldType::Json));
        assert!(metadata.stored);

        // JSON field relying on default attributes.
        let extra = &index.fields[2];
        assert_eq!(extra.name, "extra");
        assert!(matches!(extra.field_type, FieldType::Json));

        // After conversion the entry is stored but not indexed.
        let schema = index.to_schema();
        let metadata_field = schema.get_field("metadata").unwrap();
        let metadata_entry = schema.get_field_entry(metadata_field).unwrap();
        assert_eq!(metadata_entry.field_type, FieldType::Json);
        assert!(!metadata_entry.indexed);
        assert!(metadata_entry.stored);
    }

    /// A nested `query<...>` block sets the tokenizer and weighting, and survives `to_schema`.
    #[test]
    fn test_sparse_vector_query_config() {
        let source = r#"
            index documents {
                field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<tokenizer: "Alibaba-NLP/gte-Qwen2-1.5B-instruct", weighting: idf>>]
            }
        "#;

        let index = first_index(source);
        assert_eq!(index.fields.len(), 1);

        let embedding = &index.fields[0];
        assert_eq!(embedding.name, "embedding");
        assert!(matches!(embedding.field_type, FieldType::SparseVector));

        let config = embedding.sparse_vector_config.as_ref().unwrap();
        assert_eq!(config.index_size, IndexSize::U16);
        assert_eq!(config.weight_quantization, WeightQuantization::UInt8);

        // Query-time settings on the parsed field.
        let parsed_query = config.query_config.as_ref().unwrap();
        assert_eq!(
            parsed_query.tokenizer.as_deref(),
            Some("Alibaba-NLP/gte-Qwen2-1.5B-instruct")
        );
        assert_eq!(parsed_query.weighting, QueryWeighting::Idf);

        // The same settings on the converted schema entry.
        let schema = index.to_schema();
        let embedding_field = schema.get_field("embedding").unwrap();
        let entry = schema.get_field_entry(embedding_field).unwrap();
        let schema_config = entry.sparse_vector_config.as_ref().unwrap();
        let schema_query = schema_config.query_config.as_ref().unwrap();
        assert_eq!(
            schema_query.tokenizer.as_deref(),
            Some("Alibaba-NLP/gte-Qwen2-1.5B-instruct")
        );
        assert_eq!(schema_query.weighting, QueryWeighting::Idf);
    }

    /// `weighting: one` without a tokenizer leaves the tokenizer unset.
    #[test]
    fn test_sparse_vector_query_config_weighting_one() {
        let source = r#"
            index documents {
                field embedding: sparse_vector [indexed<query<weighting: one>>]
            }
        "#;

        let index = first_index(source);
        let config = index.fields[0].sparse_vector_config.as_ref().unwrap();

        let query = config.query_config.as_ref().unwrap();
        assert!(query.tokenizer.is_none());
        assert_eq!(query.weighting, QueryWeighting::One);
    }

    /// `weighting: idf_file` maps to `QueryWeighting::IdfFile`, also after `to_schema`.
    #[test]
    fn test_sparse_vector_query_config_weighting_idf_file() {
        let source = r#"
            index documents {
                field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<tokenizer: "opensearch-neural-sparse-encoding-v1", weighting: idf_file>>]
            }
        "#;

        let index = first_index(source);
        let config = index.fields[0].sparse_vector_config.as_ref().unwrap();

        let parsed_query = config.query_config.as_ref().unwrap();
        assert_eq!(
            parsed_query.tokenizer.as_deref(),
            Some("opensearch-neural-sparse-encoding-v1")
        );
        assert_eq!(parsed_query.weighting, QueryWeighting::IdfFile);

        // The weighting must be preserved by schema conversion.
        let schema = index.to_schema();
        let embedding_field = schema.get_field("embedding").unwrap();
        let entry = schema.get_field_entry(embedding_field).unwrap();
        let schema_config = entry.sparse_vector_config.as_ref().unwrap();
        let schema_query = schema_config.query_config.as_ref().unwrap();
        assert_eq!(schema_query.weighting, QueryWeighting::IdfFile);
    }
}