Skip to main content

hermes_core/dsl/sdl/
mod.rs

1//! Schema Definition Language (SDL) for Hermes
2//!
3//! A simple, readable format for defining index schemas using pest parser.
4//!
5//! # Example SDL
6//!
7//! ```text
8//! # Article index schema
9//! index articles {
10//!     # Primary text field for full-text search
11//!     field title: text [indexed, stored]
12//!
13//!     # Body content - indexed but not stored (save space)
14//!     field body: text [indexed]
15//!
16//!     # Author name
17//!     field author: text [indexed, stored]
18//!
19//!     # Publication timestamp
20//!     field published_at: i64 [indexed, stored]
21//!
22//!     # View count
23//!     field views: u64 [indexed, stored]
24//!
25//!     # Rating score
26//!     field rating: f64 [indexed, stored]
27//!
28//!     # Raw content hash (not indexed, just stored)
29//!     field content_hash: bytes [stored]
30//!
31//!     # Dense vector with IVF-RaBitQ index
32//!     field embedding: dense_vector<768> [indexed<rabitq, centroids: "centroids.bin", nprobe: 32>]
33//!
34//!     # Dense vector with ScaNN index and MRL dimension
35//!     field embedding2: dense_vector<1536> [indexed<scann, centroids: "c.bin", codebook: "pq.bin", mrl_dim: 256>]
36//! }
37//! ```
38//!
39//! # Dense Vector Index Configuration
40//!
//! Index-related parameters for dense vectors are specified in `indexed<...>`:
//! - `flat`, `rabitq`, `ivf_rabitq` or `scann` - index type
//! - `centroids: "path"` - path to pre-trained centroids file
//! - `codebook: "path"` - path to PQ codebook (ScaNN only)
//! - `num_clusters: N` - number of clusters for IVF-based indexes
//! - `nprobe: N` - number of clusters to probe (default: 32)
//! - `mrl_dim: N` - Matryoshka dimension for index (uses truncated vectors)
//! - `build_threshold: N` - build threshold passed to the dense vector config
47
48use pest::Parser;
49use pest_derive::Parser;
50
51use super::query_field_router::{QueryRouterRule, RoutingMode};
52use super::schema::{FieldType, Schema, SchemaBuilder};
53use crate::Result;
54use crate::error::Error;
55
/// Pest parser for the SDL grammar.
///
/// The `Parser` derive generates the parsing implementation from the grammar
/// file named in the `grammar` attribute; the `Rule` values matched throughout
/// this module come from that grammar.
#[derive(Parser)]
#[grammar = "dsl/sdl/sdl.pest"]
pub struct SdlParser;
59
60use super::schema::DenseVectorConfig;
61use crate::structures::{
62    IndexSize, QueryWeighting, SparseQueryConfig, SparseVectorConfig, WeightQuantization,
63};
64
/// Parsed field definition
///
/// One `field ...` entry of an index, as produced by `parse_field_def` and
/// consumed by `IndexDef::to_schema`.
#[derive(Debug, Clone)]
pub struct FieldDef {
    /// Field name as written in the SDL
    pub name: String,
    /// Value type of the field
    pub field_type: FieldType,
    /// Whether the field is indexed
    pub indexed: bool,
    /// Whether the field is stored
    pub stored: bool,
    /// Tokenizer name for text fields (e.g., "default", "en_stem", "german")
    pub tokenizer: Option<String>,
    /// Whether this field can have multiple values (serialized as array in JSON)
    pub multi: bool,
    /// Position tracking mode for phrase queries and multi-field element tracking
    pub positions: Option<super::schema::PositionMode>,
    /// Configuration for sparse vector fields
    pub sparse_vector_config: Option<SparseVectorConfig>,
    /// Configuration for dense vector fields
    pub dense_vector_config: Option<DenseVectorConfig>,
}
83
/// Parsed index definition
///
/// One `index name { ... }` block of an SDL document.
#[derive(Debug, Clone)]
pub struct IndexDef {
    /// Index name as written after the `index` keyword
    pub name: String,
    /// Field definitions, in the order they appear in the SDL
    pub fields: Vec<FieldDef>,
    /// Names of the default fields; empty when none were declared
    pub default_fields: Vec<String>,
    /// Query router rules for routing queries to specific fields
    pub query_routers: Vec<QueryRouterRule>,
}
93
94impl IndexDef {
95    /// Convert to a Schema
96    pub fn to_schema(&self) -> Schema {
97        let mut builder = SchemaBuilder::default();
98
99        for field in &self.fields {
100            let f = match field.field_type {
101                FieldType::Text => {
102                    let tokenizer = field.tokenizer.as_deref().unwrap_or("default");
103                    builder.add_text_field_with_tokenizer(
104                        &field.name,
105                        field.indexed,
106                        field.stored,
107                        tokenizer,
108                    )
109                }
110                FieldType::U64 => builder.add_u64_field(&field.name, field.indexed, field.stored),
111                FieldType::I64 => builder.add_i64_field(&field.name, field.indexed, field.stored),
112                FieldType::F64 => builder.add_f64_field(&field.name, field.indexed, field.stored),
113                FieldType::Bytes => builder.add_bytes_field(&field.name, field.stored),
114                FieldType::Json => builder.add_json_field(&field.name, field.stored),
115                FieldType::SparseVector => {
116                    if let Some(config) = &field.sparse_vector_config {
117                        builder.add_sparse_vector_field_with_config(
118                            &field.name,
119                            field.indexed,
120                            field.stored,
121                            config.clone(),
122                        )
123                    } else {
124                        builder.add_sparse_vector_field(&field.name, field.indexed, field.stored)
125                    }
126                }
127                FieldType::DenseVector => {
128                    // Dense vector dimension must be specified via config
129                    let config = field
130                        .dense_vector_config
131                        .as_ref()
132                        .expect("DenseVector field requires dimension to be specified");
133                    builder.add_dense_vector_field_with_config(
134                        &field.name,
135                        field.indexed,
136                        field.stored,
137                        config.clone(),
138                    )
139                }
140            };
141            if field.multi {
142                builder.set_multi(f, true);
143            }
144            // Set positions: explicit > auto (ordinal for multi vectors)
145            let positions = field.positions.or({
146                // Auto-set ordinal positions for multi-valued vector fields
147                if field.multi
148                    && matches!(
149                        field.field_type,
150                        FieldType::SparseVector | FieldType::DenseVector
151                    )
152                {
153                    Some(super::schema::PositionMode::Ordinal)
154                } else {
155                    None
156                }
157            });
158            if let Some(mode) = positions {
159                builder.set_positions(f, mode);
160            }
161        }
162
163        // Set default fields if specified
164        if !self.default_fields.is_empty() {
165            builder.set_default_fields(self.default_fields.clone());
166        }
167
168        // Set query routers if specified
169        if !self.query_routers.is_empty() {
170            builder.set_query_routers(self.query_routers.clone());
171        }
172
173        builder.build()
174    }
175
176    /// Create a QueryFieldRouter from the query router rules
177    ///
178    /// Returns None if there are no query router rules defined.
179    /// Returns Err if any regex pattern is invalid.
180    pub fn to_query_router(&self) -> Result<Option<super::query_field_router::QueryFieldRouter>> {
181        if self.query_routers.is_empty() {
182            return Ok(None);
183        }
184
185        super::query_field_router::QueryFieldRouter::from_rules(&self.query_routers)
186            .map(Some)
187            .map_err(Error::Schema)
188    }
189}
190
191/// Parse field type from string
192fn parse_field_type(type_str: &str) -> Result<FieldType> {
193    match type_str {
194        "text" | "string" | "str" => Ok(FieldType::Text),
195        "u64" | "uint" | "unsigned" => Ok(FieldType::U64),
196        "i64" | "int" | "integer" => Ok(FieldType::I64),
197        "f64" | "float" | "double" => Ok(FieldType::F64),
198        "bytes" | "binary" | "blob" => Ok(FieldType::Bytes),
199        "json" => Ok(FieldType::Json),
200        "sparse_vector" => Ok(FieldType::SparseVector),
201        "dense_vector" | "vector" => Ok(FieldType::DenseVector),
202        _ => Err(Error::Schema(format!("Unknown field type: {}", type_str))),
203    }
204}
205
/// Index configuration parsed from indexed<...> attribute
///
/// Every member is optional: `None` means the parameter was not given, so the
/// target vector config keeps whatever value it already has when this is
/// merged into it.
#[derive(Debug, Clone, Default)]
struct IndexConfig {
    // Dense vector index params
    /// Index type, given positionally or as `index: <type>`
    index_type: Option<super::schema::VectorIndexType>,
    /// Value of `num_clusters: N`
    num_clusters: Option<usize>,
    /// Value of `nprobe: N`
    nprobe: Option<usize>,
    /// Value of `mrl_dim: N`
    mrl_dim: Option<usize>,
    /// Value of `build_threshold: N`
    build_threshold: Option<usize>,
    // Sparse vector index params
    /// Value of `quantization: <spec>`
    quantization: Option<WeightQuantization>,
    /// Value of `weight_threshold: X`
    weight_threshold: Option<f32>,
    /// Value of `block_size: N` (rounded up to a power of two when applied)
    block_size: Option<usize>,
    // Sparse vector query-time config
    /// Tokenizer from `query<tokenizer: "...">`
    query_tokenizer: Option<String>,
    /// Weighting from `query<weighting: ...>`
    query_weighting: Option<QueryWeighting>,
    // Position tracking mode for phrase queries
    positions: Option<super::schema::PositionMode>,
}
224
225/// Parse attributes from pest pair
226/// Returns (indexed, stored, multi, index_config)
227/// positions is now inside index_config (via indexed<positions> or indexed<ordinal> etc.)
228/// multi is now inside stored<multi>
229fn parse_attributes(pair: pest::iterators::Pair<Rule>) -> (bool, bool, bool, Option<IndexConfig>) {
230    let mut indexed = false;
231    let mut stored = false;
232    let mut multi = false;
233    let mut index_config = None;
234
235    for attr in pair.into_inner() {
236        if attr.as_rule() == Rule::attribute {
237            // attribute = { indexed_with_config | "indexed" | stored_with_config | "stored" }
238            let mut found_config = false;
239            for inner in attr.clone().into_inner() {
240                match inner.as_rule() {
241                    Rule::indexed_with_config => {
242                        indexed = true;
243                        index_config = Some(parse_index_config(inner));
244                        found_config = true;
245                        break;
246                    }
247                    Rule::stored_with_config => {
248                        stored = true;
249                        multi = true; // stored<multi>
250                        found_config = true;
251                        break;
252                    }
253                    _ => {}
254                }
255            }
256            if !found_config {
257                // Simple attribute
258                match attr.as_str() {
259                    "indexed" => indexed = true,
260                    "stored" => stored = true,
261                    _ => {}
262                }
263            }
264        }
265    }
266
267    (indexed, stored, multi, index_config)
268}
269
270/// Parse index configuration from indexed<...> attribute
271fn parse_index_config(pair: pest::iterators::Pair<Rule>) -> IndexConfig {
272    let mut config = IndexConfig::default();
273
274    // indexed_with_config = { "indexed" ~ "<" ~ index_config_params ~ ">" }
275    // index_config_params = { index_config_param ~ ("," ~ index_config_param)* }
276    // index_config_param = { index_type_kwarg | centroids_kwarg | codebook_kwarg | nprobe_kwarg | index_type_spec }
277
278    for inner in pair.into_inner() {
279        if inner.as_rule() == Rule::index_config_params {
280            for param in inner.into_inner() {
281                if param.as_rule() == Rule::index_config_param {
282                    for p in param.into_inner() {
283                        parse_single_index_config_param(&mut config, p);
284                    }
285                }
286            }
287        }
288    }
289
290    config
291}
292
293/// Parse a single index config parameter
294fn parse_single_index_config_param(config: &mut IndexConfig, p: pest::iterators::Pair<Rule>) {
295    use super::schema::VectorIndexType;
296
297    match p.as_rule() {
298        Rule::index_type_spec => {
299            config.index_type = Some(match p.as_str() {
300                "flat" => VectorIndexType::Flat,
301                "rabitq" => VectorIndexType::RaBitQ,
302                "ivf_rabitq" => VectorIndexType::IvfRaBitQ,
303                "scann" => VectorIndexType::ScaNN,
304                _ => VectorIndexType::RaBitQ,
305            });
306        }
307        Rule::index_type_kwarg => {
308            // index_type_kwarg = { "index" ~ ":" ~ index_type_spec }
309            if let Some(t) = p.into_inner().next() {
310                config.index_type = Some(match t.as_str() {
311                    "flat" => VectorIndexType::Flat,
312                    "rabitq" => VectorIndexType::RaBitQ,
313                    "ivf_rabitq" => VectorIndexType::IvfRaBitQ,
314                    "scann" => VectorIndexType::ScaNN,
315                    _ => VectorIndexType::RaBitQ,
316                });
317            }
318        }
319        Rule::num_clusters_kwarg => {
320            // num_clusters_kwarg = { "num_clusters" ~ ":" ~ num_clusters_spec }
321            if let Some(n) = p.into_inner().next() {
322                config.num_clusters = Some(n.as_str().parse().unwrap_or(256));
323            }
324        }
325        Rule::build_threshold_kwarg => {
326            // build_threshold_kwarg = { "build_threshold" ~ ":" ~ build_threshold_spec }
327            if let Some(n) = p.into_inner().next() {
328                config.build_threshold = Some(n.as_str().parse().unwrap_or(10000));
329            }
330        }
331        Rule::nprobe_kwarg => {
332            // nprobe_kwarg = { "nprobe" ~ ":" ~ nprobe_spec }
333            if let Some(n) = p.into_inner().next() {
334                config.nprobe = Some(n.as_str().parse().unwrap_or(32));
335            }
336        }
337        Rule::mrl_dim_kwarg => {
338            // mrl_dim_kwarg = { "mrl_dim" ~ ":" ~ mrl_dim_spec }
339            if let Some(n) = p.into_inner().next() {
340                config.mrl_dim = Some(n.as_str().parse().unwrap_or(0));
341            }
342        }
343        Rule::quantization_kwarg => {
344            // quantization_kwarg = { "quantization" ~ ":" ~ quantization_spec }
345            if let Some(q) = p.into_inner().next() {
346                config.quantization = Some(match q.as_str() {
347                    "float32" | "f32" => WeightQuantization::Float32,
348                    "float16" | "f16" => WeightQuantization::Float16,
349                    "uint8" | "u8" => WeightQuantization::UInt8,
350                    "uint4" | "u4" => WeightQuantization::UInt4,
351                    _ => WeightQuantization::default(),
352                });
353            }
354        }
355        Rule::weight_threshold_kwarg => {
356            // weight_threshold_kwarg = { "weight_threshold" ~ ":" ~ weight_threshold_spec }
357            if let Some(t) = p.into_inner().next() {
358                config.weight_threshold = Some(t.as_str().parse().unwrap_or(0.0));
359            }
360        }
361        Rule::block_size_kwarg => {
362            // block_size_kwarg = { "block_size" ~ ":" ~ block_size_spec }
363            if let Some(n) = p.into_inner().next() {
364                config.block_size = Some(n.as_str().parse().unwrap_or(128));
365            }
366        }
367        Rule::query_config_block => {
368            // query_config_block = { "query" ~ "<" ~ query_config_params ~ ">" }
369            parse_query_config_block(config, p);
370        }
371        Rule::positions_kwarg => {
372            // positions_kwarg = { "positions" | "ordinal" | "token_position" }
373            use super::schema::PositionMode;
374            config.positions = Some(match p.as_str() {
375                "ordinal" => PositionMode::Ordinal,
376                "token_position" => PositionMode::TokenPosition,
377                _ => PositionMode::Full, // "positions" or any other value defaults to Full
378            });
379        }
380        _ => {}
381    }
382}
383
384/// Parse query configuration block: query<tokenizer: "...", weighting: idf>
385fn parse_query_config_block(config: &mut IndexConfig, pair: pest::iterators::Pair<Rule>) {
386    for inner in pair.into_inner() {
387        if inner.as_rule() == Rule::query_config_params {
388            for param in inner.into_inner() {
389                if param.as_rule() == Rule::query_config_param {
390                    for p in param.into_inner() {
391                        match p.as_rule() {
392                            Rule::query_tokenizer_kwarg => {
393                                // query_tokenizer_kwarg = { "tokenizer" ~ ":" ~ tokenizer_path }
394                                if let Some(path) = p.into_inner().next()
395                                    && let Some(inner_path) = path.into_inner().next()
396                                {
397                                    config.query_tokenizer = Some(inner_path.as_str().to_string());
398                                }
399                            }
400                            Rule::query_weighting_kwarg => {
401                                // query_weighting_kwarg = { "weighting" ~ ":" ~ weighting_spec }
402                                if let Some(w) = p.into_inner().next() {
403                                    config.query_weighting = Some(match w.as_str() {
404                                        "one" => QueryWeighting::One,
405                                        "idf" => QueryWeighting::Idf,
406                                        _ => QueryWeighting::One,
407                                    });
408                                }
409                            }
410                            _ => {}
411                        }
412                    }
413                }
414            }
415        }
416    }
417}
418
419/// Parse a field definition from pest pair
420fn parse_field_def(pair: pest::iterators::Pair<Rule>) -> Result<FieldDef> {
421    let mut inner = pair.into_inner();
422
423    let name = inner
424        .next()
425        .ok_or_else(|| Error::Schema("Missing field name".to_string()))?
426        .as_str()
427        .to_string();
428
429    let field_type_str = inner
430        .next()
431        .ok_or_else(|| Error::Schema("Missing field type".to_string()))?
432        .as_str();
433
434    let field_type = parse_field_type(field_type_str)?;
435
436    // Parse optional tokenizer spec, sparse_vector_config, dense_vector_config, and attributes
437    let mut tokenizer = None;
438    let mut sparse_vector_config = None;
439    let mut dense_vector_config = None;
440    let mut indexed = true;
441    let mut stored = true;
442    let mut multi = false;
443    let mut index_config: Option<IndexConfig> = None;
444
445    for item in inner {
446        match item.as_rule() {
447            Rule::tokenizer_spec => {
448                // Extract tokenizer name from <name>
449                if let Some(tok_name) = item.into_inner().next() {
450                    tokenizer = Some(tok_name.as_str().to_string());
451                }
452            }
453            Rule::sparse_vector_config => {
454                // Parse named parameters: <index_size: u16, quantization: uint8, weight_threshold: 0.1>
455                sparse_vector_config = Some(parse_sparse_vector_config(item));
456            }
457            Rule::dense_vector_config => {
458                // Parse dense_vector_params (keyword or positional) - only dims and mrl_dim
459                dense_vector_config = Some(parse_dense_vector_config(item));
460            }
461            Rule::attributes => {
462                let (idx, sto, mul, idx_cfg) = parse_attributes(item);
463                indexed = idx;
464                stored = sto;
465                multi = mul;
466                index_config = idx_cfg;
467            }
468            _ => {}
469        }
470    }
471
472    // Merge index config into vector configs if both exist
473    let mut positions = None;
474    if let Some(idx_cfg) = index_config {
475        positions = idx_cfg.positions;
476        if let Some(ref mut dv_config) = dense_vector_config {
477            apply_index_config_to_dense_vector(dv_config, idx_cfg);
478        } else if field_type == FieldType::SparseVector {
479            // For sparse vectors, create default config if not present and apply index params
480            let sv_config = sparse_vector_config.get_or_insert(SparseVectorConfig::default());
481            apply_index_config_to_sparse_vector(sv_config, idx_cfg);
482        }
483    }
484
485    Ok(FieldDef {
486        name,
487        field_type,
488        indexed,
489        stored,
490        tokenizer,
491        multi,
492        positions,
493        sparse_vector_config,
494        dense_vector_config,
495    })
496}
497
498/// Apply index configuration from indexed<...> to DenseVectorConfig
499fn apply_index_config_to_dense_vector(config: &mut DenseVectorConfig, idx_cfg: IndexConfig) {
500    // Apply index type if specified
501    if let Some(index_type) = idx_cfg.index_type {
502        config.index_type = index_type;
503    }
504
505    // Apply num_clusters for IVF-based indexes
506    if idx_cfg.num_clusters.is_some() {
507        config.num_clusters = idx_cfg.num_clusters;
508    }
509
510    // Apply nprobe if specified
511    if let Some(nprobe) = idx_cfg.nprobe {
512        config.nprobe = nprobe;
513    }
514
515    // Apply mrl_dim if specified
516    if idx_cfg.mrl_dim.is_some() {
517        config.mrl_dim = idx_cfg.mrl_dim;
518    }
519
520    // Apply build_threshold if specified
521    if idx_cfg.build_threshold.is_some() {
522        config.build_threshold = idx_cfg.build_threshold;
523    }
524}
525
526/// Parse sparse_vector_config - only index_size (positional)
527/// Example: <u16> or <u32>
528fn parse_sparse_vector_config(pair: pest::iterators::Pair<Rule>) -> SparseVectorConfig {
529    let mut index_size = IndexSize::default();
530
531    // Parse positional index_size_spec
532    for inner in pair.into_inner() {
533        if inner.as_rule() == Rule::index_size_spec {
534            index_size = match inner.as_str() {
535                "u16" => IndexSize::U16,
536                "u32" => IndexSize::U32,
537                _ => IndexSize::default(),
538            };
539        }
540    }
541
542    SparseVectorConfig {
543        index_size,
544        weight_quantization: WeightQuantization::default(),
545        weight_threshold: 0.0,
546        block_size: 128,
547        posting_list_pruning: None,
548        query_config: None,
549    }
550}
551
552/// Apply index configuration from indexed<...> to SparseVectorConfig
553fn apply_index_config_to_sparse_vector(config: &mut SparseVectorConfig, idx_cfg: IndexConfig) {
554    if let Some(q) = idx_cfg.quantization {
555        config.weight_quantization = q;
556    }
557    if let Some(t) = idx_cfg.weight_threshold {
558        config.weight_threshold = t;
559    }
560    if let Some(bs) = idx_cfg.block_size {
561        config.block_size = bs.next_power_of_two();
562    }
563    // Apply query-time configuration if present
564    if idx_cfg.query_tokenizer.is_some() || idx_cfg.query_weighting.is_some() {
565        let query_config = config
566            .query_config
567            .get_or_insert(SparseQueryConfig::default());
568        if let Some(tokenizer) = idx_cfg.query_tokenizer {
569            query_config.tokenizer = Some(tokenizer);
570        }
571        if let Some(weighting) = idx_cfg.query_weighting {
572            query_config.weighting = weighting;
573        }
574    }
575}
576
577/// Parse dense_vector_config - only dims
578/// All index-related params (including mrl_dim) are now in indexed<...> attribute
579fn parse_dense_vector_config(pair: pest::iterators::Pair<Rule>) -> DenseVectorConfig {
580    let mut dim: usize = 0;
581
582    // Navigate to dense_vector_params
583    for params in pair.into_inner() {
584        if params.as_rule() == Rule::dense_vector_params {
585            for inner in params.into_inner() {
586                match inner.as_rule() {
587                    Rule::dense_vector_keyword_params => {
588                        // Parse keyword args: dims: N
589                        for kwarg in inner.into_inner() {
590                            if kwarg.as_rule() == Rule::dims_kwarg
591                                && let Some(d) = kwarg.into_inner().next()
592                            {
593                                dim = d.as_str().parse().unwrap_or(0);
594                            }
595                        }
596                    }
597                    Rule::dense_vector_positional_params => {
598                        // Parse positional: just dimension
599                        if let Some(dim_pair) = inner.into_inner().next() {
600                            dim = dim_pair.as_str().parse().unwrap_or(0);
601                        }
602                    }
603                    _ => {}
604                }
605            }
606        }
607    }
608
609    DenseVectorConfig::new(dim)
610}
611
612/// Parse default_fields definition
613fn parse_default_fields_def(pair: pest::iterators::Pair<Rule>) -> Vec<String> {
614    pair.into_inner().map(|p| p.as_str().to_string()).collect()
615}
616
617/// Parse a query router definition
618fn parse_query_router_def(pair: pest::iterators::Pair<Rule>) -> Result<QueryRouterRule> {
619    let mut pattern = String::new();
620    let mut substitution = String::new();
621    let mut target_field = String::new();
622    let mut mode = RoutingMode::Additional;
623
624    for prop in pair.into_inner() {
625        if prop.as_rule() != Rule::query_router_prop {
626            continue;
627        }
628
629        for inner in prop.into_inner() {
630            match inner.as_rule() {
631                Rule::query_router_pattern => {
632                    if let Some(regex_str) = inner.into_inner().next() {
633                        pattern = parse_string_value(regex_str);
634                    }
635                }
636                Rule::query_router_substitution => {
637                    if let Some(quoted) = inner.into_inner().next() {
638                        substitution = parse_string_value(quoted);
639                    }
640                }
641                Rule::query_router_target => {
642                    if let Some(ident) = inner.into_inner().next() {
643                        target_field = ident.as_str().to_string();
644                    }
645                }
646                Rule::query_router_mode => {
647                    if let Some(mode_val) = inner.into_inner().next() {
648                        mode = match mode_val.as_str() {
649                            "exclusive" => RoutingMode::Exclusive,
650                            "additional" => RoutingMode::Additional,
651                            _ => RoutingMode::Additional,
652                        };
653                    }
654                }
655                _ => {}
656            }
657        }
658    }
659
660    if pattern.is_empty() {
661        return Err(Error::Schema("query_router missing 'pattern'".to_string()));
662    }
663    if substitution.is_empty() {
664        return Err(Error::Schema(
665            "query_router missing 'substitution'".to_string(),
666        ));
667    }
668    if target_field.is_empty() {
669        return Err(Error::Schema(
670            "query_router missing 'target_field'".to_string(),
671        ));
672    }
673
674    Ok(QueryRouterRule {
675        pattern,
676        substitution,
677        target_field,
678        mode,
679    })
680}
681
682/// Parse a string value from quoted_string, raw_string, or regex_string
683fn parse_string_value(pair: pest::iterators::Pair<Rule>) -> String {
684    let s = pair.as_str();
685    match pair.as_rule() {
686        Rule::regex_string => {
687            // regex_string contains either raw_string or quoted_string
688            if let Some(inner) = pair.into_inner().next() {
689                parse_string_value(inner)
690            } else {
691                s.to_string()
692            }
693        }
694        Rule::raw_string => {
695            // r"..." - strip r" prefix and " suffix
696            s[2..s.len() - 1].to_string()
697        }
698        Rule::quoted_string => {
699            // "..." - strip quotes and handle escapes
700            let inner = &s[1..s.len() - 1];
701            // Simple escape handling
702            inner
703                .replace("\\n", "\n")
704                .replace("\\t", "\t")
705                .replace("\\\"", "\"")
706                .replace("\\\\", "\\")
707        }
708        _ => s.to_string(),
709    }
710}
711
712/// Parse an index definition from pest pair
713fn parse_index_def(pair: pest::iterators::Pair<Rule>) -> Result<IndexDef> {
714    let mut inner = pair.into_inner();
715
716    let name = inner
717        .next()
718        .ok_or_else(|| Error::Schema("Missing index name".to_string()))?
719        .as_str()
720        .to_string();
721
722    let mut fields = Vec::new();
723    let mut default_fields = Vec::new();
724    let mut query_routers = Vec::new();
725
726    for item in inner {
727        match item.as_rule() {
728            Rule::field_def => {
729                fields.push(parse_field_def(item)?);
730            }
731            Rule::default_fields_def => {
732                default_fields = parse_default_fields_def(item);
733            }
734            Rule::query_router_def => {
735                query_routers.push(parse_query_router_def(item)?);
736            }
737            _ => {}
738        }
739    }
740
741    Ok(IndexDef {
742        name,
743        fields,
744        default_fields,
745        query_routers,
746    })
747}
748
749/// Parse SDL from a string
750pub fn parse_sdl(input: &str) -> Result<Vec<IndexDef>> {
751    let pairs = SdlParser::parse(Rule::file, input)
752        .map_err(|e| Error::Schema(format!("Parse error: {}", e)))?;
753
754    let mut indexes = Vec::new();
755
756    for pair in pairs {
757        if pair.as_rule() == Rule::file {
758            for inner in pair.into_inner() {
759                if inner.as_rule() == Rule::index_def {
760                    indexes.push(parse_index_def(inner)?);
761                }
762            }
763        }
764    }
765
766    Ok(indexes)
767}
768
769/// Parse SDL and return a single index definition
770pub fn parse_single_index(input: &str) -> Result<IndexDef> {
771    let indexes = parse_sdl(input)?;
772
773    if indexes.is_empty() {
774        return Err(Error::Schema("No index definition found".to_string()));
775    }
776
777    if indexes.len() > 1 {
778        return Err(Error::Schema(
779            "Multiple index definitions found, expected one".to_string(),
780        ));
781    }
782
783    Ok(indexes.into_iter().next().unwrap())
784}
785
786#[cfg(test)]
787mod tests {
788    use super::*;
789
    /// A two-field index parses into one `IndexDef`; a field listing only
    /// `indexed` comes out as indexed but not stored.
    #[test]
    fn test_parse_simple_schema() {
        let sdl = r#"
            index articles {
                field title: text [indexed, stored]
                field body: text [indexed]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        assert_eq!(indexes.len(), 1);

        let index = &indexes[0];
        assert_eq!(index.name, "articles");
        assert_eq!(index.fields.len(), 2);

        assert_eq!(index.fields[0].name, "title");
        assert!(matches!(index.fields[0].field_type, FieldType::Text));
        assert!(index.fields[0].indexed);
        assert!(index.fields[0].stored);

        assert_eq!(index.fields[1].name, "body");
        assert!(matches!(index.fields[1].field_type, FieldType::Text));
        assert!(index.fields[1].indexed);
        assert!(!index.fields[1].stored);
    }
816
    /// The canonical names `text`, `u64`, `i64`, `f64` and `bytes` map to the
    /// matching `FieldType` variants.
    #[test]
    fn test_parse_all_field_types() {
        let sdl = r#"
            index test {
                field text_field: text [indexed, stored]
                field u64_field: u64 [indexed, stored]
                field i64_field: i64 [indexed, stored]
                field f64_field: f64 [indexed, stored]
                field bytes_field: bytes [stored]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let index = &indexes[0];

        assert!(matches!(index.fields[0].field_type, FieldType::Text));
        assert!(matches!(index.fields[1].field_type, FieldType::U64));
        assert!(matches!(index.fields[2].field_type, FieldType::I64));
        assert!(matches!(index.fields[3].field_type, FieldType::F64));
        assert!(matches!(index.fields[4].field_type, FieldType::Bytes));
    }
838
839    #[test]
840    fn test_parse_with_comments() {
841        let sdl = r#"
842            # This is a comment
843            index articles {
844                # Title field
845                field title: text [indexed, stored]
846                field body: text [indexed] # inline comment not supported yet
847            }
848        "#;
849
850        let indexes = parse_sdl(sdl).unwrap();
851        assert_eq!(indexes[0].fields.len(), 2);
852    }
853
854    #[test]
855    fn test_parse_type_aliases() {
856        let sdl = r#"
857            index test {
858                field a: string [indexed]
859                field b: int [indexed]
860                field c: uint [indexed]
861                field d: float [indexed]
862                field e: binary [stored]
863            }
864        "#;
865
866        let indexes = parse_sdl(sdl).unwrap();
867        let index = &indexes[0];
868
869        assert!(matches!(index.fields[0].field_type, FieldType::Text));
870        assert!(matches!(index.fields[1].field_type, FieldType::I64));
871        assert!(matches!(index.fields[2].field_type, FieldType::U64));
872        assert!(matches!(index.fields[3].field_type, FieldType::F64));
873        assert!(matches!(index.fields[4].field_type, FieldType::Bytes));
874    }
875
876    #[test]
877    fn test_to_schema() {
878        let sdl = r#"
879            index articles {
880                field title: text [indexed, stored]
881                field views: u64 [indexed, stored]
882            }
883        "#;
884
885        let indexes = parse_sdl(sdl).unwrap();
886        let schema = indexes[0].to_schema();
887
888        assert!(schema.get_field("title").is_some());
889        assert!(schema.get_field("views").is_some());
890        assert!(schema.get_field("nonexistent").is_none());
891    }
892
893    #[test]
894    fn test_default_attributes() {
895        let sdl = r#"
896            index test {
897                field title: text
898            }
899        "#;
900
901        let indexes = parse_sdl(sdl).unwrap();
902        let field = &indexes[0].fields[0];
903
904        // Default should be indexed and stored
905        assert!(field.indexed);
906        assert!(field.stored);
907    }
908
909    #[test]
910    fn test_multiple_indexes() {
911        let sdl = r#"
912            index articles {
913                field title: text [indexed, stored]
914            }
915
916            index users {
917                field name: text [indexed, stored]
918                field email: text [indexed, stored]
919            }
920        "#;
921
922        let indexes = parse_sdl(sdl).unwrap();
923        assert_eq!(indexes.len(), 2);
924        assert_eq!(indexes[0].name, "articles");
925        assert_eq!(indexes[1].name, "users");
926    }
927
928    #[test]
929    fn test_tokenizer_spec() {
930        let sdl = r#"
931            index articles {
932                field title: text<en_stem> [indexed, stored]
933                field body: text<default> [indexed]
934                field author: text [indexed, stored]
935            }
936        "#;
937
938        let indexes = parse_sdl(sdl).unwrap();
939        let index = &indexes[0];
940
941        assert_eq!(index.fields[0].name, "title");
942        assert_eq!(index.fields[0].tokenizer, Some("en_stem".to_string()));
943
944        assert_eq!(index.fields[1].name, "body");
945        assert_eq!(index.fields[1].tokenizer, Some("default".to_string()));
946
947        assert_eq!(index.fields[2].name, "author");
948        assert_eq!(index.fields[2].tokenizer, None); // No tokenizer specified
949    }
950
951    #[test]
952    fn test_tokenizer_in_schema() {
953        let sdl = r#"
954            index articles {
955                field title: text<german> [indexed, stored]
956                field body: text<en_stem> [indexed]
957            }
958        "#;
959
960        let indexes = parse_sdl(sdl).unwrap();
961        let schema = indexes[0].to_schema();
962
963        let title_field = schema.get_field("title").unwrap();
964        let title_entry = schema.get_field_entry(title_field).unwrap();
965        assert_eq!(title_entry.tokenizer, Some("german".to_string()));
966
967        let body_field = schema.get_field("body").unwrap();
968        let body_entry = schema.get_field_entry(body_field).unwrap();
969        assert_eq!(body_entry.tokenizer, Some("en_stem".to_string()));
970    }
971
972    #[test]
973    fn test_query_router_basic() {
974        let sdl = r#"
975            index documents {
976                field title: text [indexed, stored]
977                field uri: text [indexed, stored]
978
979                query_router {
980                    pattern: "10\\.\\d{4,}/[^\\s]+"
981                    substitution: "doi://{0}"
982                    target_field: uris
983                    mode: exclusive
984                }
985            }
986        "#;
987
988        let indexes = parse_sdl(sdl).unwrap();
989        let index = &indexes[0];
990
991        assert_eq!(index.query_routers.len(), 1);
992        let router = &index.query_routers[0];
993        assert_eq!(router.pattern, r"10\.\d{4,}/[^\s]+");
994        assert_eq!(router.substitution, "doi://{0}");
995        assert_eq!(router.target_field, "uris");
996        assert_eq!(router.mode, RoutingMode::Exclusive);
997    }
998
999    #[test]
1000    fn test_query_router_raw_string() {
1001        let sdl = r#"
1002            index documents {
1003                field uris: text [indexed, stored]
1004
1005                query_router {
1006                    pattern: r"^pmid:(\d+)$"
1007                    substitution: "pubmed://{1}"
1008                    target_field: uris
1009                    mode: additional
1010                }
1011            }
1012        "#;
1013
1014        let indexes = parse_sdl(sdl).unwrap();
1015        let router = &indexes[0].query_routers[0];
1016
1017        assert_eq!(router.pattern, r"^pmid:(\d+)$");
1018        assert_eq!(router.substitution, "pubmed://{1}");
1019        assert_eq!(router.mode, RoutingMode::Additional);
1020    }
1021
1022    #[test]
1023    fn test_multiple_query_routers() {
1024        let sdl = r#"
1025            index documents {
1026                field uris: text [indexed, stored]
1027
1028                query_router {
1029                    pattern: r"^doi:(10\.\d{4,}/[^\s]+)$"
1030                    substitution: "doi://{1}"
1031                    target_field: uris
1032                    mode: exclusive
1033                }
1034
1035                query_router {
1036                    pattern: r"^pmid:(\d+)$"
1037                    substitution: "pubmed://{1}"
1038                    target_field: uris
1039                    mode: exclusive
1040                }
1041
1042                query_router {
1043                    pattern: r"^arxiv:(\d+\.\d+)$"
1044                    substitution: "arxiv://{1}"
1045                    target_field: uris
1046                    mode: additional
1047                }
1048            }
1049        "#;
1050
1051        let indexes = parse_sdl(sdl).unwrap();
1052        assert_eq!(indexes[0].query_routers.len(), 3);
1053    }
1054
1055    #[test]
1056    fn test_query_router_default_mode() {
1057        let sdl = r#"
1058            index documents {
1059                field uris: text [indexed, stored]
1060
1061                query_router {
1062                    pattern: r"test"
1063                    substitution: "{0}"
1064                    target_field: uris
1065                }
1066            }
1067        "#;
1068
1069        let indexes = parse_sdl(sdl).unwrap();
1070        // Default mode should be Additional
1071        assert_eq!(indexes[0].query_routers[0].mode, RoutingMode::Additional);
1072    }
1073
1074    #[test]
1075    fn test_multi_attribute() {
1076        let sdl = r#"
1077            index documents {
1078                field uris: text [indexed, stored<multi>]
1079                field title: text [indexed, stored]
1080            }
1081        "#;
1082
1083        let indexes = parse_sdl(sdl).unwrap();
1084        assert_eq!(indexes.len(), 1);
1085
1086        let fields = &indexes[0].fields;
1087        assert_eq!(fields.len(), 2);
1088
1089        // uris should have multi=true
1090        assert_eq!(fields[0].name, "uris");
1091        assert!(fields[0].multi, "uris field should have multi=true");
1092
1093        // title should have multi=false
1094        assert_eq!(fields[1].name, "title");
1095        assert!(!fields[1].multi, "title field should have multi=false");
1096
1097        // Verify schema conversion preserves multi attribute
1098        let schema = indexes[0].to_schema();
1099        let uris_field = schema.get_field("uris").unwrap();
1100        let title_field = schema.get_field("title").unwrap();
1101
1102        assert!(schema.get_field_entry(uris_field).unwrap().multi);
1103        assert!(!schema.get_field_entry(title_field).unwrap().multi);
1104    }
1105
1106    #[test]
1107    fn test_sparse_vector_field() {
1108        let sdl = r#"
1109            index documents {
1110                field embedding: sparse_vector [indexed, stored]
1111            }
1112        "#;
1113
1114        let indexes = parse_sdl(sdl).unwrap();
1115        assert_eq!(indexes.len(), 1);
1116        assert_eq!(indexes[0].fields.len(), 1);
1117        assert_eq!(indexes[0].fields[0].name, "embedding");
1118        assert_eq!(indexes[0].fields[0].field_type, FieldType::SparseVector);
1119        assert!(indexes[0].fields[0].sparse_vector_config.is_none());
1120    }
1121
1122    #[test]
1123    fn test_sparse_vector_with_config() {
1124        let sdl = r#"
1125            index documents {
1126                field embedding: sparse_vector<u16> [indexed<quantization: uint8>, stored]
1127                field dense: sparse_vector<u32> [indexed<quantization: float32>]
1128            }
1129        "#;
1130
1131        let indexes = parse_sdl(sdl).unwrap();
1132        assert_eq!(indexes[0].fields.len(), 2);
1133
1134        // First field: u16 indices, uint8 quantization
1135        let f1 = &indexes[0].fields[0];
1136        assert_eq!(f1.name, "embedding");
1137        let config1 = f1.sparse_vector_config.as_ref().unwrap();
1138        assert_eq!(config1.index_size, IndexSize::U16);
1139        assert_eq!(config1.weight_quantization, WeightQuantization::UInt8);
1140
1141        // Second field: u32 indices, float32 quantization
1142        let f2 = &indexes[0].fields[1];
1143        assert_eq!(f2.name, "dense");
1144        let config2 = f2.sparse_vector_config.as_ref().unwrap();
1145        assert_eq!(config2.index_size, IndexSize::U32);
1146        assert_eq!(config2.weight_quantization, WeightQuantization::Float32);
1147    }
1148
1149    #[test]
1150    fn test_sparse_vector_with_weight_threshold() {
1151        let sdl = r#"
1152            index documents {
1153                field embedding: sparse_vector<u16> [indexed<quantization: uint8, weight_threshold: 0.1>, stored]
1154                field embedding2: sparse_vector<u32> [indexed<quantization: float16, weight_threshold: 0.05>]
1155            }
1156        "#;
1157
1158        let indexes = parse_sdl(sdl).unwrap();
1159        assert_eq!(indexes[0].fields.len(), 2);
1160
1161        // First field: u16 indices, uint8 quantization, threshold 0.1
1162        let f1 = &indexes[0].fields[0];
1163        assert_eq!(f1.name, "embedding");
1164        let config1 = f1.sparse_vector_config.as_ref().unwrap();
1165        assert_eq!(config1.index_size, IndexSize::U16);
1166        assert_eq!(config1.weight_quantization, WeightQuantization::UInt8);
1167        assert!((config1.weight_threshold - 0.1).abs() < 0.001);
1168
1169        // Second field: u32 indices, float16 quantization, threshold 0.05
1170        let f2 = &indexes[0].fields[1];
1171        assert_eq!(f2.name, "embedding2");
1172        let config2 = f2.sparse_vector_config.as_ref().unwrap();
1173        assert_eq!(config2.index_size, IndexSize::U32);
1174        assert_eq!(config2.weight_quantization, WeightQuantization::Float16);
1175        assert!((config2.weight_threshold - 0.05).abs() < 0.001);
1176    }
1177
1178    #[test]
1179    fn test_dense_vector_field() {
1180        let sdl = r#"
1181            index documents {
1182                field embedding: dense_vector<768> [indexed, stored]
1183            }
1184        "#;
1185
1186        let indexes = parse_sdl(sdl).unwrap();
1187        assert_eq!(indexes.len(), 1);
1188        assert_eq!(indexes[0].fields.len(), 1);
1189
1190        let f = &indexes[0].fields[0];
1191        assert_eq!(f.name, "embedding");
1192        assert_eq!(f.field_type, FieldType::DenseVector);
1193
1194        let config = f.dense_vector_config.as_ref().unwrap();
1195        assert_eq!(config.dim, 768);
1196    }
1197
1198    #[test]
1199    fn test_dense_vector_alias() {
1200        let sdl = r#"
1201            index documents {
1202                field embedding: vector<1536> [indexed]
1203            }
1204        "#;
1205
1206        let indexes = parse_sdl(sdl).unwrap();
1207        assert_eq!(indexes[0].fields[0].field_type, FieldType::DenseVector);
1208        assert_eq!(
1209            indexes[0].fields[0]
1210                .dense_vector_config
1211                .as_ref()
1212                .unwrap()
1213                .dim,
1214            1536
1215        );
1216    }
1217
1218    #[test]
1219    fn test_dense_vector_with_num_clusters() {
1220        let sdl = r#"
1221            index documents {
1222                field embedding: dense_vector<768> [indexed<ivf_rabitq, num_clusters: 256>, stored]
1223            }
1224        "#;
1225
1226        let indexes = parse_sdl(sdl).unwrap();
1227        assert_eq!(indexes.len(), 1);
1228
1229        let f = &indexes[0].fields[0];
1230        assert_eq!(f.name, "embedding");
1231        assert_eq!(f.field_type, FieldType::DenseVector);
1232
1233        let config = f.dense_vector_config.as_ref().unwrap();
1234        assert_eq!(config.dim, 768);
1235        assert_eq!(config.num_clusters, Some(256));
1236        assert_eq!(config.nprobe, 32); // default
1237    }
1238
1239    #[test]
1240    fn test_dense_vector_with_num_clusters_and_nprobe() {
1241        let sdl = r#"
1242            index documents {
1243                field embedding: dense_vector<1536> [indexed<ivf_rabitq, num_clusters: 512, nprobe: 64>]
1244            }
1245        "#;
1246
1247        let indexes = parse_sdl(sdl).unwrap();
1248        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1249
1250        assert_eq!(config.dim, 1536);
1251        assert_eq!(config.num_clusters, Some(512));
1252        assert_eq!(config.nprobe, 64);
1253    }
1254
1255    #[test]
1256    fn test_dense_vector_keyword_syntax() {
1257        let sdl = r#"
1258            index documents {
1259                field embedding: dense_vector<dims: 1536> [indexed, stored]
1260            }
1261        "#;
1262
1263        let indexes = parse_sdl(sdl).unwrap();
1264        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1265
1266        assert_eq!(config.dim, 1536);
1267        assert!(config.num_clusters.is_none());
1268    }
1269
1270    #[test]
1271    fn test_dense_vector_keyword_syntax_full() {
1272        let sdl = r#"
1273            index documents {
1274                field embedding: dense_vector<dims: 1536> [indexed<ivf_rabitq, num_clusters: 256, nprobe: 64>]
1275            }
1276        "#;
1277
1278        let indexes = parse_sdl(sdl).unwrap();
1279        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1280
1281        assert_eq!(config.dim, 1536);
1282        assert_eq!(config.num_clusters, Some(256));
1283        assert_eq!(config.nprobe, 64);
1284    }
1285
1286    #[test]
1287    fn test_dense_vector_keyword_syntax_partial() {
1288        let sdl = r#"
1289            index documents {
1290                field embedding: dense_vector<dims: 768> [indexed<ivf_rabitq, num_clusters: 128>]
1291            }
1292        "#;
1293
1294        let indexes = parse_sdl(sdl).unwrap();
1295        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1296
1297        assert_eq!(config.dim, 768);
1298        assert_eq!(config.num_clusters, Some(128));
1299        assert_eq!(config.nprobe, 32); // default
1300    }
1301
1302    #[test]
1303    fn test_dense_vector_scann_index() {
1304        use crate::dsl::schema::VectorIndexType;
1305
1306        let sdl = r#"
1307            index documents {
1308                field embedding: dense_vector<dims: 768> [indexed<scann, num_clusters: 256, nprobe: 64>]
1309            }
1310        "#;
1311
1312        let indexes = parse_sdl(sdl).unwrap();
1313        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1314
1315        assert_eq!(config.dim, 768);
1316        assert_eq!(config.index_type, VectorIndexType::ScaNN);
1317        assert_eq!(config.num_clusters, Some(256));
1318        assert_eq!(config.nprobe, 64);
1319    }
1320
1321    #[test]
1322    fn test_dense_vector_ivf_rabitq_index() {
1323        use crate::dsl::schema::VectorIndexType;
1324
1325        let sdl = r#"
1326            index documents {
1327                field embedding: dense_vector<dims: 1536> [indexed<ivf_rabitq, num_clusters: 512>]
1328            }
1329        "#;
1330
1331        let indexes = parse_sdl(sdl).unwrap();
1332        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1333
1334        assert_eq!(config.dim, 1536);
1335        assert_eq!(config.index_type, VectorIndexType::IvfRaBitQ);
1336        assert_eq!(config.num_clusters, Some(512));
1337    }
1338
1339    #[test]
1340    fn test_dense_vector_rabitq_no_clusters() {
1341        use crate::dsl::schema::VectorIndexType;
1342
1343        let sdl = r#"
1344            index documents {
1345                field embedding: dense_vector<dims: 768> [indexed<rabitq>]
1346            }
1347        "#;
1348
1349        let indexes = parse_sdl(sdl).unwrap();
1350        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1351
1352        assert_eq!(config.dim, 768);
1353        assert_eq!(config.index_type, VectorIndexType::RaBitQ);
1354        assert!(config.num_clusters.is_none());
1355    }
1356
1357    #[test]
1358    fn test_dense_vector_flat_index() {
1359        use crate::dsl::schema::VectorIndexType;
1360
1361        let sdl = r#"
1362            index documents {
1363                field embedding: dense_vector<dims: 768> [indexed<flat>]
1364            }
1365        "#;
1366
1367        let indexes = parse_sdl(sdl).unwrap();
1368        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1369
1370        assert_eq!(config.dim, 768);
1371        assert_eq!(config.index_type, VectorIndexType::Flat);
1372    }
1373
1374    #[test]
1375    fn test_dense_vector_default_index_type() {
1376        use crate::dsl::schema::VectorIndexType;
1377
1378        // When no index type specified, should default to RaBitQ (basic)
1379        let sdl = r#"
1380            index documents {
1381                field embedding: dense_vector<dims: 768> [indexed]
1382            }
1383        "#;
1384
1385        let indexes = parse_sdl(sdl).unwrap();
1386        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1387
1388        assert_eq!(config.dim, 768);
1389        assert_eq!(config.index_type, VectorIndexType::RaBitQ);
1390    }
1391
1392    #[test]
1393    fn test_dense_vector_mrl_dim() {
1394        // Test matryoshka/MRL dimension trimming (new syntax: mrl_dim in indexed<...>)
1395        let sdl = r#"
1396            index documents {
1397                field embedding: dense_vector<1536> [indexed<mrl_dim: 256>]
1398            }
1399        "#;
1400
1401        let indexes = parse_sdl(sdl).unwrap();
1402        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1403
1404        assert_eq!(config.dim, 1536);
1405        assert_eq!(config.mrl_dim, Some(256));
1406        assert_eq!(config.index_dim(), 256);
1407    }
1408
1409    #[test]
1410    fn test_dense_vector_mrl_dim_with_num_clusters() {
1411        // Test mrl_dim combined with other index options
1412        let sdl = r#"
1413            index documents {
1414                field embedding: dense_vector<768> [indexed<ivf_rabitq, num_clusters: 256, nprobe: 64, mrl_dim: 128>]
1415            }
1416        "#;
1417
1418        let indexes = parse_sdl(sdl).unwrap();
1419        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1420
1421        assert_eq!(config.dim, 768);
1422        assert_eq!(config.mrl_dim, Some(128));
1423        assert_eq!(config.index_dim(), 128);
1424        assert_eq!(config.num_clusters, Some(256));
1425        assert_eq!(config.nprobe, 64);
1426    }
1427
1428    #[test]
1429    fn test_dense_vector_no_mrl_dim() {
1430        // Test that index_dim() returns full dim when mrl_dim is not set
1431        let sdl = r#"
1432            index documents {
1433                field embedding: dense_vector<dims: 768> [indexed]
1434            }
1435        "#;
1436
1437        let indexes = parse_sdl(sdl).unwrap();
1438        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1439
1440        assert_eq!(config.dim, 768);
1441        assert_eq!(config.mrl_dim, None);
1442        assert_eq!(config.index_dim(), 768);
1443    }
1444
1445    #[test]
1446    fn test_json_field_type() {
1447        let sdl = r#"
1448            index documents {
1449                field title: text [indexed, stored]
1450                field metadata: json [stored]
1451                field extra: json
1452            }
1453        "#;
1454
1455        let indexes = parse_sdl(sdl).unwrap();
1456        let index = &indexes[0];
1457
1458        assert_eq!(index.fields.len(), 3);
1459
1460        // Check JSON field
1461        assert_eq!(index.fields[1].name, "metadata");
1462        assert!(matches!(index.fields[1].field_type, FieldType::Json));
1463        assert!(index.fields[1].stored);
1464        // JSON fields should not be indexed (enforced by add_json_field)
1465
1466        // Check default attributes for JSON field
1467        assert_eq!(index.fields[2].name, "extra");
1468        assert!(matches!(index.fields[2].field_type, FieldType::Json));
1469
1470        // Verify schema conversion
1471        let schema = index.to_schema();
1472        let metadata_field = schema.get_field("metadata").unwrap();
1473        let entry = schema.get_field_entry(metadata_field).unwrap();
1474        assert_eq!(entry.field_type, FieldType::Json);
1475        assert!(!entry.indexed); // JSON fields are never indexed
1476        assert!(entry.stored);
1477    }
1478
1479    #[test]
1480    fn test_sparse_vector_query_config() {
1481        use crate::structures::QueryWeighting;
1482
1483        let sdl = r#"
1484            index documents {
1485                field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<tokenizer: "Alibaba-NLP/gte-Qwen2-1.5B-instruct", weighting: idf>>]
1486            }
1487        "#;
1488
1489        let indexes = parse_sdl(sdl).unwrap();
1490        let index = &indexes[0];
1491
1492        assert_eq!(index.fields.len(), 1);
1493        assert_eq!(index.fields[0].name, "embedding");
1494        assert!(matches!(
1495            index.fields[0].field_type,
1496            FieldType::SparseVector
1497        ));
1498
1499        let config = index.fields[0].sparse_vector_config.as_ref().unwrap();
1500        assert_eq!(config.index_size, IndexSize::U16);
1501        assert_eq!(config.weight_quantization, WeightQuantization::UInt8);
1502
1503        // Check query config
1504        let query_config = config.query_config.as_ref().unwrap();
1505        assert_eq!(
1506            query_config.tokenizer.as_deref(),
1507            Some("Alibaba-NLP/gte-Qwen2-1.5B-instruct")
1508        );
1509        assert_eq!(query_config.weighting, QueryWeighting::Idf);
1510
1511        // Verify schema conversion preserves query config
1512        let schema = index.to_schema();
1513        let embedding_field = schema.get_field("embedding").unwrap();
1514        let entry = schema.get_field_entry(embedding_field).unwrap();
1515        let sv_config = entry.sparse_vector_config.as_ref().unwrap();
1516        let qc = sv_config.query_config.as_ref().unwrap();
1517        assert_eq!(
1518            qc.tokenizer.as_deref(),
1519            Some("Alibaba-NLP/gte-Qwen2-1.5B-instruct")
1520        );
1521        assert_eq!(qc.weighting, QueryWeighting::Idf);
1522    }
1523
1524    #[test]
1525    fn test_sparse_vector_query_config_weighting_one() {
1526        use crate::structures::QueryWeighting;
1527
1528        let sdl = r#"
1529            index documents {
1530                field embedding: sparse_vector [indexed<query<weighting: one>>]
1531            }
1532        "#;
1533
1534        let indexes = parse_sdl(sdl).unwrap();
1535        let config = indexes[0].fields[0].sparse_vector_config.as_ref().unwrap();
1536
1537        let query_config = config.query_config.as_ref().unwrap();
1538        assert!(query_config.tokenizer.is_none());
1539        assert_eq!(query_config.weighting, QueryWeighting::One);
1540    }
1541}