Skip to main content

hermes_core/dsl/sdl/
mod.rs

//! Schema Definition Language (SDL) for Hermes
//!
//! A simple, readable format for defining index schemas, parsed with the pest parser.
//!
//! # Example SDL
//!
//! ```text
//! # Article index schema
//! index articles {
//!     # Primary text field for full-text search
//!     field title: text [indexed, stored]
//!
//!     # Body content - indexed but not stored (save space)
//!     field body: text [indexed]
//!
//!     # Author name
//!     field author: text [indexed, stored]
//!
//!     # Publication timestamp
//!     field published_at: i64 [indexed, stored]
//!
//!     # View count
//!     field views: u64 [indexed, stored]
//!
//!     # Rating score
//!     field rating: f64 [indexed, stored]
//!
//!     # Raw content hash (not indexed, just stored)
//!     field content_hash: bytes [stored]
//!
//!     # Dense vector with IVF-RaBitQ index
//!     field embedding: dense_vector<768> [indexed<rabitq, centroids: "centroids.bin", nprobe: 32>]
//!
//!     # Dense vector with ScaNN index and MRL dimension
//!     field embedding2: dense_vector<1536> [indexed<scann, centroids: "c.bin", codebook: "pq.bin", mrl_dim: 256>]
//! }
//! ```
//!
//! # Dense Vector Index Configuration
//!
//! Index-related parameters for dense vectors are specified in `indexed<...>`:
//! - `rabitq` or `scann` - index type
//! - `centroids: "path"` - path to pre-trained centroids file
//! - `codebook: "path"` - path to PQ codebook (ScaNN only)
//! - `nprobe: N` - number of clusters to probe (default: 32)
//! - `mrl_dim: N` - Matryoshka dimension for index (uses truncated vectors)
47
48use pest::Parser;
49use pest_derive::Parser;
50
51use super::query_field_router::{QueryRouterRule, RoutingMode};
52use super::schema::{FieldType, Schema, SchemaBuilder};
53use crate::Result;
54use crate::error::Error;
55
/// Parser for SDL source text.
///
/// The `Parser` derive generates the parsing implementation (and the `Rule`
/// enum used throughout this module) from the grammar file named in the
/// `grammar` attribute.
#[derive(Parser)]
#[grammar = "dsl/sdl/sdl.pest"]
pub struct SdlParser;
59
60use super::schema::DenseVectorConfig;
61use crate::structures::{
62    IndexSize, QueryWeighting, SparseQueryConfig, SparseVectorConfig, WeightQuantization,
63};
64
/// Parsed field definition
///
/// One `field` statement of an index, as produced by `parse_field_def`.
#[derive(Debug, Clone)]
pub struct FieldDef {
    /// Field name as written in the SDL source
    pub name: String,
    /// Field type, with SDL aliases already resolved to a `FieldType`
    pub field_type: FieldType,
    /// Whether the field is indexed
    pub indexed: bool,
    /// Whether the field value is stored
    pub stored: bool,
    /// Tokenizer name for text fields (e.g., "default", "en_stem", "german")
    pub tokenizer: Option<String>,
    /// Whether this field can have multiple values (serialized as array in JSON)
    pub multi: bool,
    /// Position tracking mode for phrase queries and multi-field element tracking
    pub positions: Option<super::schema::PositionMode>,
    /// Configuration for sparse vector fields
    pub sparse_vector_config: Option<SparseVectorConfig>,
    /// Configuration for dense vector fields
    pub dense_vector_config: Option<DenseVectorConfig>,
}
83
/// Parsed index definition
///
/// One `index <name> { ... }` block, as produced by `parse_index_def`.
#[derive(Debug, Clone)]
pub struct IndexDef {
    /// Index name as written in the SDL source
    pub name: String,
    /// Field definitions in declaration order
    pub fields: Vec<FieldDef>,
    /// Names of the default fields; empty when none were declared
    pub default_fields: Vec<String>,
    /// Query router rules for routing queries to specific fields
    pub query_routers: Vec<QueryRouterRule>,
}
93
94impl IndexDef {
95    /// Convert to a Schema
96    pub fn to_schema(&self) -> Schema {
97        let mut builder = SchemaBuilder::default();
98
99        for field in &self.fields {
100            let f = match field.field_type {
101                FieldType::Text => {
102                    let tokenizer = field.tokenizer.as_deref().unwrap_or("default");
103                    builder.add_text_field_with_tokenizer(
104                        &field.name,
105                        field.indexed,
106                        field.stored,
107                        tokenizer,
108                    )
109                }
110                FieldType::U64 => builder.add_u64_field(&field.name, field.indexed, field.stored),
111                FieldType::I64 => builder.add_i64_field(&field.name, field.indexed, field.stored),
112                FieldType::F64 => builder.add_f64_field(&field.name, field.indexed, field.stored),
113                FieldType::Bytes => builder.add_bytes_field(&field.name, field.stored),
114                FieldType::Json => builder.add_json_field(&field.name, field.stored),
115                FieldType::SparseVector => {
116                    if let Some(config) = &field.sparse_vector_config {
117                        builder.add_sparse_vector_field_with_config(
118                            &field.name,
119                            field.indexed,
120                            field.stored,
121                            config.clone(),
122                        )
123                    } else {
124                        builder.add_sparse_vector_field(&field.name, field.indexed, field.stored)
125                    }
126                }
127                FieldType::DenseVector => {
128                    // Dense vector dimension must be specified via config
129                    let config = field
130                        .dense_vector_config
131                        .as_ref()
132                        .expect("DenseVector field requires dimension to be specified");
133                    builder.add_dense_vector_field_with_config(
134                        &field.name,
135                        field.indexed,
136                        field.stored,
137                        config.clone(),
138                    )
139                }
140            };
141            if field.multi {
142                builder.set_multi(f, true);
143            }
144            // Set positions: explicit > auto (ordinal for multi vectors)
145            let positions = field.positions.or({
146                // Auto-set ordinal positions for multi-valued vector fields
147                if field.multi
148                    && matches!(
149                        field.field_type,
150                        FieldType::SparseVector | FieldType::DenseVector
151                    )
152                {
153                    Some(super::schema::PositionMode::Ordinal)
154                } else {
155                    None
156                }
157            });
158            if let Some(mode) = positions {
159                builder.set_positions(f, mode);
160            }
161        }
162
163        // Set default fields if specified
164        if !self.default_fields.is_empty() {
165            builder.set_default_fields(self.default_fields.clone());
166        }
167
168        // Set query routers if specified
169        if !self.query_routers.is_empty() {
170            builder.set_query_routers(self.query_routers.clone());
171        }
172
173        builder.build()
174    }
175
176    /// Create a QueryFieldRouter from the query router rules
177    ///
178    /// Returns None if there are no query router rules defined.
179    /// Returns Err if any regex pattern is invalid.
180    pub fn to_query_router(&self) -> Result<Option<super::query_field_router::QueryFieldRouter>> {
181        if self.query_routers.is_empty() {
182            return Ok(None);
183        }
184
185        super::query_field_router::QueryFieldRouter::from_rules(&self.query_routers)
186            .map(Some)
187            .map_err(Error::Schema)
188    }
189}
190
191/// Parse field type from string
192fn parse_field_type(type_str: &str) -> Result<FieldType> {
193    match type_str {
194        "text" | "string" | "str" => Ok(FieldType::Text),
195        "u64" | "uint" | "unsigned" => Ok(FieldType::U64),
196        "i64" | "int" | "integer" => Ok(FieldType::I64),
197        "f64" | "float" | "double" => Ok(FieldType::F64),
198        "bytes" | "binary" | "blob" => Ok(FieldType::Bytes),
199        "json" => Ok(FieldType::Json),
200        "sparse_vector" => Ok(FieldType::SparseVector),
201        "dense_vector" | "vector" => Ok(FieldType::DenseVector),
202        _ => Err(Error::Schema(format!("Unknown field type: {}", type_str))),
203    }
204}
205
/// Index configuration parsed from indexed<...> attribute
///
/// Every option starts as `None` (via `Default`) and is filled in only when
/// the corresponding parameter is encountered while parsing.
#[derive(Debug, Clone, Default)]
struct IndexConfig {
    // Dense vector index params
    /// Vector index type (`scann` / `rabitq`)
    index_type: Option<super::schema::VectorIndexType>,
    /// Value of the `centroids: "..."` parameter
    centroids_path: Option<String>,
    /// Value of the `codebook: "..."` parameter
    codebook_path: Option<String>,
    /// Value of the `nprobe: N` parameter
    nprobe: Option<usize>,
    /// Value of the `mrl_dim: N` parameter
    mrl_dim: Option<usize>,
    // Sparse vector index params
    /// Value of the `quantization: ...` parameter
    quantization: Option<WeightQuantization>,
    /// Value of the `weight_threshold: ...` parameter
    weight_threshold: Option<f32>,
    // Sparse vector query-time config
    /// Tokenizer given inside a `query<...>` block
    query_tokenizer: Option<String>,
    /// Weighting given inside a `query<...>` block
    query_weighting: Option<QueryWeighting>,
    // Position tracking mode for phrase queries
    /// Mode selected by `positions`, `ordinal` or `token_position`
    positions: Option<super::schema::PositionMode>,
}
223
224/// Parse attributes from pest pair
225/// Returns (indexed, stored, multi, index_config)
226/// positions is now inside index_config (via indexed<positions> or indexed<ordinal> etc.)
227/// multi is now inside stored<multi>
228fn parse_attributes(pair: pest::iterators::Pair<Rule>) -> (bool, bool, bool, Option<IndexConfig>) {
229    let mut indexed = false;
230    let mut stored = false;
231    let mut multi = false;
232    let mut index_config = None;
233
234    for attr in pair.into_inner() {
235        if attr.as_rule() == Rule::attribute {
236            // attribute = { indexed_with_config | "indexed" | stored_with_config | "stored" }
237            let mut found_config = false;
238            for inner in attr.clone().into_inner() {
239                match inner.as_rule() {
240                    Rule::indexed_with_config => {
241                        indexed = true;
242                        index_config = Some(parse_index_config(inner));
243                        found_config = true;
244                        break;
245                    }
246                    Rule::stored_with_config => {
247                        stored = true;
248                        multi = true; // stored<multi>
249                        found_config = true;
250                        break;
251                    }
252                    _ => {}
253                }
254            }
255            if !found_config {
256                // Simple attribute
257                match attr.as_str() {
258                    "indexed" => indexed = true,
259                    "stored" => stored = true,
260                    _ => {}
261                }
262            }
263        }
264    }
265
266    (indexed, stored, multi, index_config)
267}
268
269/// Parse index configuration from indexed<...> attribute
270fn parse_index_config(pair: pest::iterators::Pair<Rule>) -> IndexConfig {
271    let mut config = IndexConfig::default();
272
273    // indexed_with_config = { "indexed" ~ "<" ~ index_config_params ~ ">" }
274    // index_config_params = { index_config_param ~ ("," ~ index_config_param)* }
275    // index_config_param = { index_type_kwarg | centroids_kwarg | codebook_kwarg | nprobe_kwarg | index_type_spec }
276
277    for inner in pair.into_inner() {
278        if inner.as_rule() == Rule::index_config_params {
279            for param in inner.into_inner() {
280                if param.as_rule() == Rule::index_config_param {
281                    for p in param.into_inner() {
282                        parse_single_index_config_param(&mut config, p);
283                    }
284                }
285            }
286        }
287    }
288
289    config
290}
291
292/// Parse a single index config parameter
293fn parse_single_index_config_param(config: &mut IndexConfig, p: pest::iterators::Pair<Rule>) {
294    use super::schema::VectorIndexType;
295
296    match p.as_rule() {
297        Rule::index_type_spec => {
298            config.index_type = Some(match p.as_str() {
299                "scann" => VectorIndexType::ScaNN,
300                "rabitq" => VectorIndexType::IvfRaBitQ,
301                _ => VectorIndexType::IvfRaBitQ,
302            });
303        }
304        Rule::index_type_kwarg => {
305            // index_type_kwarg = { "index" ~ ":" ~ index_type_spec }
306            if let Some(t) = p.into_inner().next() {
307                config.index_type = Some(match t.as_str() {
308                    "scann" => VectorIndexType::ScaNN,
309                    "rabitq" => VectorIndexType::IvfRaBitQ,
310                    _ => VectorIndexType::IvfRaBitQ,
311                });
312            }
313        }
314        Rule::centroids_kwarg => {
315            // centroids_kwarg = { "centroids" ~ ":" ~ centroids_path }
316            // centroids_path = { "\"" ~ path_chars ~ "\"" }
317            if let Some(path) = p.into_inner().next()
318                && let Some(inner_path) = path.into_inner().next()
319            {
320                config.centroids_path = Some(inner_path.as_str().to_string());
321            }
322        }
323        Rule::codebook_kwarg => {
324            // codebook_kwarg = { "codebook" ~ ":" ~ codebook_path }
325            if let Some(path) = p.into_inner().next()
326                && let Some(inner_path) = path.into_inner().next()
327            {
328                config.codebook_path = Some(inner_path.as_str().to_string());
329            }
330        }
331        Rule::nprobe_kwarg => {
332            // nprobe_kwarg = { "nprobe" ~ ":" ~ nprobe_spec }
333            if let Some(n) = p.into_inner().next() {
334                config.nprobe = Some(n.as_str().parse().unwrap_or(32));
335            }
336        }
337        Rule::mrl_dim_kwarg => {
338            // mrl_dim_kwarg = { "mrl_dim" ~ ":" ~ mrl_dim_spec }
339            if let Some(n) = p.into_inner().next() {
340                config.mrl_dim = Some(n.as_str().parse().unwrap_or(0));
341            }
342        }
343        Rule::quantization_kwarg => {
344            // quantization_kwarg = { "quantization" ~ ":" ~ quantization_spec }
345            if let Some(q) = p.into_inner().next() {
346                config.quantization = Some(match q.as_str() {
347                    "float32" | "f32" => WeightQuantization::Float32,
348                    "float16" | "f16" => WeightQuantization::Float16,
349                    "uint8" | "u8" => WeightQuantization::UInt8,
350                    "uint4" | "u4" => WeightQuantization::UInt4,
351                    _ => WeightQuantization::default(),
352                });
353            }
354        }
355        Rule::weight_threshold_kwarg => {
356            // weight_threshold_kwarg = { "weight_threshold" ~ ":" ~ weight_threshold_spec }
357            if let Some(t) = p.into_inner().next() {
358                config.weight_threshold = Some(t.as_str().parse().unwrap_or(0.0));
359            }
360        }
361        Rule::query_config_block => {
362            // query_config_block = { "query" ~ "<" ~ query_config_params ~ ">" }
363            parse_query_config_block(config, p);
364        }
365        Rule::positions_kwarg => {
366            // positions_kwarg = { "positions" | "ordinal" | "token_position" }
367            use super::schema::PositionMode;
368            config.positions = Some(match p.as_str() {
369                "ordinal" => PositionMode::Ordinal,
370                "token_position" => PositionMode::TokenPosition,
371                _ => PositionMode::Full, // "positions" or any other value defaults to Full
372            });
373        }
374        _ => {}
375    }
376}
377
378/// Parse query configuration block: query<tokenizer: "...", weighting: idf>
379fn parse_query_config_block(config: &mut IndexConfig, pair: pest::iterators::Pair<Rule>) {
380    for inner in pair.into_inner() {
381        if inner.as_rule() == Rule::query_config_params {
382            for param in inner.into_inner() {
383                if param.as_rule() == Rule::query_config_param {
384                    for p in param.into_inner() {
385                        match p.as_rule() {
386                            Rule::query_tokenizer_kwarg => {
387                                // query_tokenizer_kwarg = { "tokenizer" ~ ":" ~ tokenizer_path }
388                                if let Some(path) = p.into_inner().next()
389                                    && let Some(inner_path) = path.into_inner().next()
390                                {
391                                    config.query_tokenizer = Some(inner_path.as_str().to_string());
392                                }
393                            }
394                            Rule::query_weighting_kwarg => {
395                                // query_weighting_kwarg = { "weighting" ~ ":" ~ weighting_spec }
396                                if let Some(w) = p.into_inner().next() {
397                                    config.query_weighting = Some(match w.as_str() {
398                                        "one" => QueryWeighting::One,
399                                        "idf" => QueryWeighting::Idf,
400                                        _ => QueryWeighting::One,
401                                    });
402                                }
403                            }
404                            _ => {}
405                        }
406                    }
407                }
408            }
409        }
410    }
411}
412
413/// Parse a field definition from pest pair
414fn parse_field_def(pair: pest::iterators::Pair<Rule>) -> Result<FieldDef> {
415    let mut inner = pair.into_inner();
416
417    let name = inner
418        .next()
419        .ok_or_else(|| Error::Schema("Missing field name".to_string()))?
420        .as_str()
421        .to_string();
422
423    let field_type_str = inner
424        .next()
425        .ok_or_else(|| Error::Schema("Missing field type".to_string()))?
426        .as_str();
427
428    let field_type = parse_field_type(field_type_str)?;
429
430    // Parse optional tokenizer spec, sparse_vector_config, dense_vector_config, and attributes
431    let mut tokenizer = None;
432    let mut sparse_vector_config = None;
433    let mut dense_vector_config = None;
434    let mut indexed = true;
435    let mut stored = true;
436    let mut multi = false;
437    let mut index_config: Option<IndexConfig> = None;
438
439    for item in inner {
440        match item.as_rule() {
441            Rule::tokenizer_spec => {
442                // Extract tokenizer name from <name>
443                if let Some(tok_name) = item.into_inner().next() {
444                    tokenizer = Some(tok_name.as_str().to_string());
445                }
446            }
447            Rule::sparse_vector_config => {
448                // Parse named parameters: <index_size: u16, quantization: uint8, weight_threshold: 0.1>
449                sparse_vector_config = Some(parse_sparse_vector_config(item));
450            }
451            Rule::dense_vector_config => {
452                // Parse dense_vector_params (keyword or positional) - only dims and mrl_dim
453                dense_vector_config = Some(parse_dense_vector_config(item));
454            }
455            Rule::attributes => {
456                let (idx, sto, mul, idx_cfg) = parse_attributes(item);
457                indexed = idx;
458                stored = sto;
459                multi = mul;
460                index_config = idx_cfg;
461            }
462            _ => {}
463        }
464    }
465
466    // Merge index config into vector configs if both exist
467    let mut positions = None;
468    if let Some(idx_cfg) = index_config {
469        positions = idx_cfg.positions;
470        if let Some(ref mut dv_config) = dense_vector_config {
471            apply_index_config_to_dense_vector(dv_config, idx_cfg);
472        } else if field_type == FieldType::SparseVector {
473            // For sparse vectors, create default config if not present and apply index params
474            let sv_config = sparse_vector_config.get_or_insert(SparseVectorConfig::default());
475            apply_index_config_to_sparse_vector(sv_config, idx_cfg);
476        }
477    }
478
479    Ok(FieldDef {
480        name,
481        field_type,
482        indexed,
483        stored,
484        tokenizer,
485        multi,
486        positions,
487        sparse_vector_config,
488        dense_vector_config,
489    })
490}
491
492/// Apply index configuration from indexed<...> to DenseVectorConfig
493fn apply_index_config_to_dense_vector(config: &mut DenseVectorConfig, idx_cfg: IndexConfig) {
494    use super::schema::VectorIndexType;
495
496    let nprobe = idx_cfg.nprobe.unwrap_or(32);
497
498    match idx_cfg.index_type {
499        Some(VectorIndexType::ScaNN) => {
500            config.index_type = VectorIndexType::ScaNN;
501            config.coarse_centroids_path = idx_cfg.centroids_path;
502            config.pq_codebook_path = idx_cfg.codebook_path;
503            config.nprobe = nprobe;
504        }
505        Some(VectorIndexType::IvfRaBitQ) => {
506            config.index_type = VectorIndexType::IvfRaBitQ;
507            config.coarse_centroids_path = idx_cfg.centroids_path;
508            config.nprobe = nprobe;
509        }
510        Some(VectorIndexType::RaBitQ) | None => {
511            // If centroids provided, use IVF-RaBitQ, otherwise plain RaBitQ
512            if idx_cfg.centroids_path.is_some() {
513                config.index_type = VectorIndexType::IvfRaBitQ;
514                config.coarse_centroids_path = idx_cfg.centroids_path;
515                config.nprobe = nprobe;
516            }
517            // else keep default RaBitQ
518        }
519    }
520
521    // Apply mrl_dim if specified
522    if idx_cfg.mrl_dim.is_some() {
523        config.mrl_dim = idx_cfg.mrl_dim;
524    }
525}
526
527/// Parse sparse_vector_config - only index_size (positional)
528/// Example: <u16> or <u32>
529fn parse_sparse_vector_config(pair: pest::iterators::Pair<Rule>) -> SparseVectorConfig {
530    let mut index_size = IndexSize::default();
531
532    // Parse positional index_size_spec
533    for inner in pair.into_inner() {
534        if inner.as_rule() == Rule::index_size_spec {
535            index_size = match inner.as_str() {
536                "u16" => IndexSize::U16,
537                "u32" => IndexSize::U32,
538                _ => IndexSize::default(),
539            };
540        }
541    }
542
543    SparseVectorConfig {
544        index_size,
545        weight_quantization: WeightQuantization::default(),
546        weight_threshold: 0.0,
547        posting_list_pruning: None,
548        query_config: None,
549    }
550}
551
552/// Apply index configuration from indexed<...> to SparseVectorConfig
553fn apply_index_config_to_sparse_vector(config: &mut SparseVectorConfig, idx_cfg: IndexConfig) {
554    if let Some(q) = idx_cfg.quantization {
555        config.weight_quantization = q;
556    }
557    if let Some(t) = idx_cfg.weight_threshold {
558        config.weight_threshold = t;
559    }
560    // Apply query-time configuration if present
561    if idx_cfg.query_tokenizer.is_some() || idx_cfg.query_weighting.is_some() {
562        let query_config = config
563            .query_config
564            .get_or_insert(SparseQueryConfig::default());
565        if let Some(tokenizer) = idx_cfg.query_tokenizer {
566            query_config.tokenizer = Some(tokenizer);
567        }
568        if let Some(weighting) = idx_cfg.query_weighting {
569            query_config.weighting = weighting;
570        }
571    }
572}
573
574/// Parse dense_vector_config - only dims
575/// All index-related params (including mrl_dim) are now in indexed<...> attribute
576fn parse_dense_vector_config(pair: pest::iterators::Pair<Rule>) -> DenseVectorConfig {
577    let mut dim: usize = 0;
578
579    // Navigate to dense_vector_params
580    for params in pair.into_inner() {
581        if params.as_rule() == Rule::dense_vector_params {
582            for inner in params.into_inner() {
583                match inner.as_rule() {
584                    Rule::dense_vector_keyword_params => {
585                        // Parse keyword args: dims: N
586                        for kwarg in inner.into_inner() {
587                            if kwarg.as_rule() == Rule::dims_kwarg
588                                && let Some(d) = kwarg.into_inner().next()
589                            {
590                                dim = d.as_str().parse().unwrap_or(0);
591                            }
592                        }
593                    }
594                    Rule::dense_vector_positional_params => {
595                        // Parse positional: just dimension
596                        if let Some(dim_pair) = inner.into_inner().next() {
597                            dim = dim_pair.as_str().parse().unwrap_or(0);
598                        }
599                    }
600                    _ => {}
601                }
602            }
603        }
604    }
605
606    DenseVectorConfig::new(dim)
607}
608
609/// Parse default_fields definition
610fn parse_default_fields_def(pair: pest::iterators::Pair<Rule>) -> Vec<String> {
611    pair.into_inner().map(|p| p.as_str().to_string()).collect()
612}
613
614/// Parse a query router definition
615fn parse_query_router_def(pair: pest::iterators::Pair<Rule>) -> Result<QueryRouterRule> {
616    let mut pattern = String::new();
617    let mut substitution = String::new();
618    let mut target_field = String::new();
619    let mut mode = RoutingMode::Additional;
620
621    for prop in pair.into_inner() {
622        if prop.as_rule() != Rule::query_router_prop {
623            continue;
624        }
625
626        for inner in prop.into_inner() {
627            match inner.as_rule() {
628                Rule::query_router_pattern => {
629                    if let Some(regex_str) = inner.into_inner().next() {
630                        pattern = parse_string_value(regex_str);
631                    }
632                }
633                Rule::query_router_substitution => {
634                    if let Some(quoted) = inner.into_inner().next() {
635                        substitution = parse_string_value(quoted);
636                    }
637                }
638                Rule::query_router_target => {
639                    if let Some(ident) = inner.into_inner().next() {
640                        target_field = ident.as_str().to_string();
641                    }
642                }
643                Rule::query_router_mode => {
644                    if let Some(mode_val) = inner.into_inner().next() {
645                        mode = match mode_val.as_str() {
646                            "exclusive" => RoutingMode::Exclusive,
647                            "additional" => RoutingMode::Additional,
648                            _ => RoutingMode::Additional,
649                        };
650                    }
651                }
652                _ => {}
653            }
654        }
655    }
656
657    if pattern.is_empty() {
658        return Err(Error::Schema("query_router missing 'pattern'".to_string()));
659    }
660    if substitution.is_empty() {
661        return Err(Error::Schema(
662            "query_router missing 'substitution'".to_string(),
663        ));
664    }
665    if target_field.is_empty() {
666        return Err(Error::Schema(
667            "query_router missing 'target_field'".to_string(),
668        ));
669    }
670
671    Ok(QueryRouterRule {
672        pattern,
673        substitution,
674        target_field,
675        mode,
676    })
677}
678
679/// Parse a string value from quoted_string, raw_string, or regex_string
680fn parse_string_value(pair: pest::iterators::Pair<Rule>) -> String {
681    let s = pair.as_str();
682    match pair.as_rule() {
683        Rule::regex_string => {
684            // regex_string contains either raw_string or quoted_string
685            if let Some(inner) = pair.into_inner().next() {
686                parse_string_value(inner)
687            } else {
688                s.to_string()
689            }
690        }
691        Rule::raw_string => {
692            // r"..." - strip r" prefix and " suffix
693            s[2..s.len() - 1].to_string()
694        }
695        Rule::quoted_string => {
696            // "..." - strip quotes and handle escapes
697            let inner = &s[1..s.len() - 1];
698            // Simple escape handling
699            inner
700                .replace("\\n", "\n")
701                .replace("\\t", "\t")
702                .replace("\\\"", "\"")
703                .replace("\\\\", "\\")
704        }
705        _ => s.to_string(),
706    }
707}
708
709/// Parse an index definition from pest pair
710fn parse_index_def(pair: pest::iterators::Pair<Rule>) -> Result<IndexDef> {
711    let mut inner = pair.into_inner();
712
713    let name = inner
714        .next()
715        .ok_or_else(|| Error::Schema("Missing index name".to_string()))?
716        .as_str()
717        .to_string();
718
719    let mut fields = Vec::new();
720    let mut default_fields = Vec::new();
721    let mut query_routers = Vec::new();
722
723    for item in inner {
724        match item.as_rule() {
725            Rule::field_def => {
726                fields.push(parse_field_def(item)?);
727            }
728            Rule::default_fields_def => {
729                default_fields = parse_default_fields_def(item);
730            }
731            Rule::query_router_def => {
732                query_routers.push(parse_query_router_def(item)?);
733            }
734            _ => {}
735        }
736    }
737
738    Ok(IndexDef {
739        name,
740        fields,
741        default_fields,
742        query_routers,
743    })
744}
745
746/// Parse SDL from a string
747pub fn parse_sdl(input: &str) -> Result<Vec<IndexDef>> {
748    let pairs = SdlParser::parse(Rule::file, input)
749        .map_err(|e| Error::Schema(format!("Parse error: {}", e)))?;
750
751    let mut indexes = Vec::new();
752
753    for pair in pairs {
754        if pair.as_rule() == Rule::file {
755            for inner in pair.into_inner() {
756                if inner.as_rule() == Rule::index_def {
757                    indexes.push(parse_index_def(inner)?);
758                }
759            }
760        }
761    }
762
763    Ok(indexes)
764}
765
766/// Parse SDL and return a single index definition
767pub fn parse_single_index(input: &str) -> Result<IndexDef> {
768    let indexes = parse_sdl(input)?;
769
770    if indexes.is_empty() {
771        return Err(Error::Schema("No index definition found".to_string()));
772    }
773
774    if indexes.len() > 1 {
775        return Err(Error::Schema(
776            "Multiple index definitions found, expected one".to_string(),
777        ));
778    }
779
780    Ok(indexes.into_iter().next().unwrap())
781}
782
783#[cfg(test)]
784mod tests {
785    use super::*;
786
787    #[test]
788    fn test_parse_simple_schema() {
789        let sdl = r#"
790            index articles {
791                field title: text [indexed, stored]
792                field body: text [indexed]
793            }
794        "#;
795
796        let indexes = parse_sdl(sdl).unwrap();
797        assert_eq!(indexes.len(), 1);
798
799        let index = &indexes[0];
800        assert_eq!(index.name, "articles");
801        assert_eq!(index.fields.len(), 2);
802
803        assert_eq!(index.fields[0].name, "title");
804        assert!(matches!(index.fields[0].field_type, FieldType::Text));
805        assert!(index.fields[0].indexed);
806        assert!(index.fields[0].stored);
807
808        assert_eq!(index.fields[1].name, "body");
809        assert!(matches!(index.fields[1].field_type, FieldType::Text));
810        assert!(index.fields[1].indexed);
811        assert!(!index.fields[1].stored);
812    }
813
814    #[test]
815    fn test_parse_all_field_types() {
816        let sdl = r#"
817            index test {
818                field text_field: text [indexed, stored]
819                field u64_field: u64 [indexed, stored]
820                field i64_field: i64 [indexed, stored]
821                field f64_field: f64 [indexed, stored]
822                field bytes_field: bytes [stored]
823            }
824        "#;
825
826        let indexes = parse_sdl(sdl).unwrap();
827        let index = &indexes[0];
828
829        assert!(matches!(index.fields[0].field_type, FieldType::Text));
830        assert!(matches!(index.fields[1].field_type, FieldType::U64));
831        assert!(matches!(index.fields[2].field_type, FieldType::I64));
832        assert!(matches!(index.fields[3].field_type, FieldType::F64));
833        assert!(matches!(index.fields[4].field_type, FieldType::Bytes));
834    }
835
836    #[test]
837    fn test_parse_with_comments() {
838        let sdl = r#"
839            # This is a comment
840            index articles {
841                # Title field
842                field title: text [indexed, stored]
843                field body: text [indexed] # inline comment not supported yet
844            }
845        "#;
846
847        let indexes = parse_sdl(sdl).unwrap();
848        assert_eq!(indexes[0].fields.len(), 2);
849    }
850
851    #[test]
852    fn test_parse_type_aliases() {
853        let sdl = r#"
854            index test {
855                field a: string [indexed]
856                field b: int [indexed]
857                field c: uint [indexed]
858                field d: float [indexed]
859                field e: binary [stored]
860            }
861        "#;
862
863        let indexes = parse_sdl(sdl).unwrap();
864        let index = &indexes[0];
865
866        assert!(matches!(index.fields[0].field_type, FieldType::Text));
867        assert!(matches!(index.fields[1].field_type, FieldType::I64));
868        assert!(matches!(index.fields[2].field_type, FieldType::U64));
869        assert!(matches!(index.fields[3].field_type, FieldType::F64));
870        assert!(matches!(index.fields[4].field_type, FieldType::Bytes));
871    }
872
873    #[test]
874    fn test_to_schema() {
875        let sdl = r#"
876            index articles {
877                field title: text [indexed, stored]
878                field views: u64 [indexed, stored]
879            }
880        "#;
881
882        let indexes = parse_sdl(sdl).unwrap();
883        let schema = indexes[0].to_schema();
884
885        assert!(schema.get_field("title").is_some());
886        assert!(schema.get_field("views").is_some());
887        assert!(schema.get_field("nonexistent").is_none());
888    }
889
890    #[test]
891    fn test_default_attributes() {
892        let sdl = r#"
893            index test {
894                field title: text
895            }
896        "#;
897
898        let indexes = parse_sdl(sdl).unwrap();
899        let field = &indexes[0].fields[0];
900
901        // Default should be indexed and stored
902        assert!(field.indexed);
903        assert!(field.stored);
904    }
905
906    #[test]
907    fn test_multiple_indexes() {
908        let sdl = r#"
909            index articles {
910                field title: text [indexed, stored]
911            }
912
913            index users {
914                field name: text [indexed, stored]
915                field email: text [indexed, stored]
916            }
917        "#;
918
919        let indexes = parse_sdl(sdl).unwrap();
920        assert_eq!(indexes.len(), 2);
921        assert_eq!(indexes[0].name, "articles");
922        assert_eq!(indexes[1].name, "users");
923    }
924
925    #[test]
926    fn test_tokenizer_spec() {
927        let sdl = r#"
928            index articles {
929                field title: text<en_stem> [indexed, stored]
930                field body: text<default> [indexed]
931                field author: text [indexed, stored]
932            }
933        "#;
934
935        let indexes = parse_sdl(sdl).unwrap();
936        let index = &indexes[0];
937
938        assert_eq!(index.fields[0].name, "title");
939        assert_eq!(index.fields[0].tokenizer, Some("en_stem".to_string()));
940
941        assert_eq!(index.fields[1].name, "body");
942        assert_eq!(index.fields[1].tokenizer, Some("default".to_string()));
943
944        assert_eq!(index.fields[2].name, "author");
945        assert_eq!(index.fields[2].tokenizer, None); // No tokenizer specified
946    }
947
948    #[test]
949    fn test_tokenizer_in_schema() {
950        let sdl = r#"
951            index articles {
952                field title: text<german> [indexed, stored]
953                field body: text<en_stem> [indexed]
954            }
955        "#;
956
957        let indexes = parse_sdl(sdl).unwrap();
958        let schema = indexes[0].to_schema();
959
960        let title_field = schema.get_field("title").unwrap();
961        let title_entry = schema.get_field_entry(title_field).unwrap();
962        assert_eq!(title_entry.tokenizer, Some("german".to_string()));
963
964        let body_field = schema.get_field("body").unwrap();
965        let body_entry = schema.get_field_entry(body_field).unwrap();
966        assert_eq!(body_entry.tokenizer, Some("en_stem".to_string()));
967    }
968
969    #[test]
970    fn test_query_router_basic() {
971        let sdl = r#"
972            index documents {
973                field title: text [indexed, stored]
974                field uri: text [indexed, stored]
975
976                query_router {
977                    pattern: "10\\.\\d{4,}/[^\\s]+"
978                    substitution: "doi://{0}"
979                    target_field: uris
980                    mode: exclusive
981                }
982            }
983        "#;
984
985        let indexes = parse_sdl(sdl).unwrap();
986        let index = &indexes[0];
987
988        assert_eq!(index.query_routers.len(), 1);
989        let router = &index.query_routers[0];
990        assert_eq!(router.pattern, r"10\.\d{4,}/[^\s]+");
991        assert_eq!(router.substitution, "doi://{0}");
992        assert_eq!(router.target_field, "uris");
993        assert_eq!(router.mode, RoutingMode::Exclusive);
994    }
995
996    #[test]
997    fn test_query_router_raw_string() {
998        let sdl = r#"
999            index documents {
1000                field uris: text [indexed, stored]
1001
1002                query_router {
1003                    pattern: r"^pmid:(\d+)$"
1004                    substitution: "pubmed://{1}"
1005                    target_field: uris
1006                    mode: additional
1007                }
1008            }
1009        "#;
1010
1011        let indexes = parse_sdl(sdl).unwrap();
1012        let router = &indexes[0].query_routers[0];
1013
1014        assert_eq!(router.pattern, r"^pmid:(\d+)$");
1015        assert_eq!(router.substitution, "pubmed://{1}");
1016        assert_eq!(router.mode, RoutingMode::Additional);
1017    }
1018
1019    #[test]
1020    fn test_multiple_query_routers() {
1021        let sdl = r#"
1022            index documents {
1023                field uris: text [indexed, stored]
1024
1025                query_router {
1026                    pattern: r"^doi:(10\.\d{4,}/[^\s]+)$"
1027                    substitution: "doi://{1}"
1028                    target_field: uris
1029                    mode: exclusive
1030                }
1031
1032                query_router {
1033                    pattern: r"^pmid:(\d+)$"
1034                    substitution: "pubmed://{1}"
1035                    target_field: uris
1036                    mode: exclusive
1037                }
1038
1039                query_router {
1040                    pattern: r"^arxiv:(\d+\.\d+)$"
1041                    substitution: "arxiv://{1}"
1042                    target_field: uris
1043                    mode: additional
1044                }
1045            }
1046        "#;
1047
1048        let indexes = parse_sdl(sdl).unwrap();
1049        assert_eq!(indexes[0].query_routers.len(), 3);
1050    }
1051
1052    #[test]
1053    fn test_query_router_default_mode() {
1054        let sdl = r#"
1055            index documents {
1056                field uris: text [indexed, stored]
1057
1058                query_router {
1059                    pattern: r"test"
1060                    substitution: "{0}"
1061                    target_field: uris
1062                }
1063            }
1064        "#;
1065
1066        let indexes = parse_sdl(sdl).unwrap();
1067        // Default mode should be Additional
1068        assert_eq!(indexes[0].query_routers[0].mode, RoutingMode::Additional);
1069    }
1070
1071    #[test]
1072    fn test_multi_attribute() {
1073        let sdl = r#"
1074            index documents {
1075                field uris: text [indexed, stored<multi>]
1076                field title: text [indexed, stored]
1077            }
1078        "#;
1079
1080        let indexes = parse_sdl(sdl).unwrap();
1081        assert_eq!(indexes.len(), 1);
1082
1083        let fields = &indexes[0].fields;
1084        assert_eq!(fields.len(), 2);
1085
1086        // uris should have multi=true
1087        assert_eq!(fields[0].name, "uris");
1088        assert!(fields[0].multi, "uris field should have multi=true");
1089
1090        // title should have multi=false
1091        assert_eq!(fields[1].name, "title");
1092        assert!(!fields[1].multi, "title field should have multi=false");
1093
1094        // Verify schema conversion preserves multi attribute
1095        let schema = indexes[0].to_schema();
1096        let uris_field = schema.get_field("uris").unwrap();
1097        let title_field = schema.get_field("title").unwrap();
1098
1099        assert!(schema.get_field_entry(uris_field).unwrap().multi);
1100        assert!(!schema.get_field_entry(title_field).unwrap().multi);
1101    }
1102
1103    #[test]
1104    fn test_sparse_vector_field() {
1105        let sdl = r#"
1106            index documents {
1107                field embedding: sparse_vector [indexed, stored]
1108            }
1109        "#;
1110
1111        let indexes = parse_sdl(sdl).unwrap();
1112        assert_eq!(indexes.len(), 1);
1113        assert_eq!(indexes[0].fields.len(), 1);
1114        assert_eq!(indexes[0].fields[0].name, "embedding");
1115        assert_eq!(indexes[0].fields[0].field_type, FieldType::SparseVector);
1116        assert!(indexes[0].fields[0].sparse_vector_config.is_none());
1117    }
1118
1119    #[test]
1120    fn test_sparse_vector_with_config() {
1121        let sdl = r#"
1122            index documents {
1123                field embedding: sparse_vector<u16> [indexed<quantization: uint8>, stored]
1124                field dense: sparse_vector<u32> [indexed<quantization: float32>]
1125            }
1126        "#;
1127
1128        let indexes = parse_sdl(sdl).unwrap();
1129        assert_eq!(indexes[0].fields.len(), 2);
1130
1131        // First field: u16 indices, uint8 quantization
1132        let f1 = &indexes[0].fields[0];
1133        assert_eq!(f1.name, "embedding");
1134        let config1 = f1.sparse_vector_config.as_ref().unwrap();
1135        assert_eq!(config1.index_size, IndexSize::U16);
1136        assert_eq!(config1.weight_quantization, WeightQuantization::UInt8);
1137
1138        // Second field: u32 indices, float32 quantization
1139        let f2 = &indexes[0].fields[1];
1140        assert_eq!(f2.name, "dense");
1141        let config2 = f2.sparse_vector_config.as_ref().unwrap();
1142        assert_eq!(config2.index_size, IndexSize::U32);
1143        assert_eq!(config2.weight_quantization, WeightQuantization::Float32);
1144    }
1145
1146    #[test]
1147    fn test_sparse_vector_with_weight_threshold() {
1148        let sdl = r#"
1149            index documents {
1150                field embedding: sparse_vector<u16> [indexed<quantization: uint8, weight_threshold: 0.1>, stored]
1151                field embedding2: sparse_vector<u32> [indexed<quantization: float16, weight_threshold: 0.05>]
1152            }
1153        "#;
1154
1155        let indexes = parse_sdl(sdl).unwrap();
1156        assert_eq!(indexes[0].fields.len(), 2);
1157
1158        // First field: u16 indices, uint8 quantization, threshold 0.1
1159        let f1 = &indexes[0].fields[0];
1160        assert_eq!(f1.name, "embedding");
1161        let config1 = f1.sparse_vector_config.as_ref().unwrap();
1162        assert_eq!(config1.index_size, IndexSize::U16);
1163        assert_eq!(config1.weight_quantization, WeightQuantization::UInt8);
1164        assert!((config1.weight_threshold - 0.1).abs() < 0.001);
1165
1166        // Second field: u32 indices, float16 quantization, threshold 0.05
1167        let f2 = &indexes[0].fields[1];
1168        assert_eq!(f2.name, "embedding2");
1169        let config2 = f2.sparse_vector_config.as_ref().unwrap();
1170        assert_eq!(config2.index_size, IndexSize::U32);
1171        assert_eq!(config2.weight_quantization, WeightQuantization::Float16);
1172        assert!((config2.weight_threshold - 0.05).abs() < 0.001);
1173    }
1174
1175    #[test]
1176    fn test_dense_vector_field() {
1177        let sdl = r#"
1178            index documents {
1179                field embedding: dense_vector<768> [indexed, stored]
1180            }
1181        "#;
1182
1183        let indexes = parse_sdl(sdl).unwrap();
1184        assert_eq!(indexes.len(), 1);
1185        assert_eq!(indexes[0].fields.len(), 1);
1186
1187        let f = &indexes[0].fields[0];
1188        assert_eq!(f.name, "embedding");
1189        assert_eq!(f.field_type, FieldType::DenseVector);
1190
1191        let config = f.dense_vector_config.as_ref().unwrap();
1192        assert_eq!(config.dim, 768);
1193    }
1194
1195    #[test]
1196    fn test_dense_vector_alias() {
1197        let sdl = r#"
1198            index documents {
1199                field embedding: vector<1536> [indexed]
1200            }
1201        "#;
1202
1203        let indexes = parse_sdl(sdl).unwrap();
1204        assert_eq!(indexes[0].fields[0].field_type, FieldType::DenseVector);
1205        assert_eq!(
1206            indexes[0].fields[0]
1207                .dense_vector_config
1208                .as_ref()
1209                .unwrap()
1210                .dim,
1211            1536
1212        );
1213    }
1214
1215    #[test]
1216    fn test_dense_vector_with_centroids() {
1217        let sdl = r#"
1218            index documents {
1219                field embedding: dense_vector<768> [indexed<centroids: "centroids.bin">, stored]
1220            }
1221        "#;
1222
1223        let indexes = parse_sdl(sdl).unwrap();
1224        assert_eq!(indexes.len(), 1);
1225
1226        let f = &indexes[0].fields[0];
1227        assert_eq!(f.name, "embedding");
1228        assert_eq!(f.field_type, FieldType::DenseVector);
1229
1230        let config = f.dense_vector_config.as_ref().unwrap();
1231        assert_eq!(config.dim, 768);
1232        assert_eq!(
1233            config.coarse_centroids_path.as_deref(),
1234            Some("centroids.bin")
1235        );
1236        assert_eq!(config.nprobe, 32); // default
1237    }
1238
1239    #[test]
1240    fn test_dense_vector_with_centroids_and_nprobe() {
1241        let sdl = r#"
1242            index documents {
1243                field embedding: dense_vector<1536> [indexed<centroids: "/path/to/centroids.bin", nprobe: 64>]
1244            }
1245        "#;
1246
1247        let indexes = parse_sdl(sdl).unwrap();
1248        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1249
1250        assert_eq!(config.dim, 1536);
1251        assert_eq!(
1252            config.coarse_centroids_path.as_deref(),
1253            Some("/path/to/centroids.bin")
1254        );
1255        assert_eq!(config.nprobe, 64);
1256    }
1257
1258    #[test]
1259    fn test_dense_vector_keyword_syntax() {
1260        let sdl = r#"
1261            index documents {
1262                field embedding: dense_vector<dims: 1536> [indexed, stored]
1263            }
1264        "#;
1265
1266        let indexes = parse_sdl(sdl).unwrap();
1267        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1268
1269        assert_eq!(config.dim, 1536);
1270        assert!(config.coarse_centroids_path.is_none());
1271    }
1272
1273    #[test]
1274    fn test_dense_vector_keyword_syntax_full() {
1275        let sdl = r#"
1276            index documents {
1277                field embedding: dense_vector<dims: 1536> [indexed<centroids: "/path/to/centroids.bin", nprobe: 64>]
1278            }
1279        "#;
1280
1281        let indexes = parse_sdl(sdl).unwrap();
1282        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1283
1284        assert_eq!(config.dim, 1536);
1285        assert_eq!(
1286            config.coarse_centroids_path.as_deref(),
1287            Some("/path/to/centroids.bin")
1288        );
1289        assert_eq!(config.nprobe, 64);
1290    }
1291
1292    #[test]
1293    fn test_dense_vector_keyword_syntax_partial() {
1294        let sdl = r#"
1295            index documents {
1296                field embedding: dense_vector<dims: 768> [indexed<centroids: "centroids.bin">]
1297            }
1298        "#;
1299
1300        let indexes = parse_sdl(sdl).unwrap();
1301        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1302
1303        assert_eq!(config.dim, 768);
1304        assert_eq!(
1305            config.coarse_centroids_path.as_deref(),
1306            Some("centroids.bin")
1307        );
1308        assert_eq!(config.nprobe, 32); // default
1309    }
1310
1311    #[test]
1312    fn test_dense_vector_scann_index() {
1313        use crate::dsl::schema::VectorIndexType;
1314
1315        let sdl = r#"
1316            index documents {
1317                field embedding: dense_vector<dims: 768> [indexed<scann, centroids: "centroids.bin", codebook: "pq_codebook.bin", nprobe: 64>]
1318            }
1319        "#;
1320
1321        let indexes = parse_sdl(sdl).unwrap();
1322        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1323
1324        assert_eq!(config.dim, 768);
1325        assert_eq!(config.index_type, VectorIndexType::ScaNN);
1326        assert_eq!(
1327            config.coarse_centroids_path.as_deref(),
1328            Some("centroids.bin")
1329        );
1330        assert_eq!(config.pq_codebook_path.as_deref(), Some("pq_codebook.bin"));
1331        assert_eq!(config.nprobe, 64);
1332    }
1333
1334    #[test]
1335    fn test_dense_vector_rabitq_index() {
1336        use crate::dsl::schema::VectorIndexType;
1337
1338        let sdl = r#"
1339            index documents {
1340                field embedding: dense_vector<dims: 1536> [indexed<rabitq, centroids: "centroids.bin">]
1341            }
1342        "#;
1343
1344        let indexes = parse_sdl(sdl).unwrap();
1345        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1346
1347        assert_eq!(config.dim, 1536);
1348        assert_eq!(config.index_type, VectorIndexType::IvfRaBitQ);
1349        assert_eq!(
1350            config.coarse_centroids_path.as_deref(),
1351            Some("centroids.bin")
1352        );
1353        assert!(config.pq_codebook_path.is_none());
1354    }
1355
1356    #[test]
1357    fn test_dense_vector_rabitq_no_centroids() {
1358        use crate::dsl::schema::VectorIndexType;
1359
1360        let sdl = r#"
1361            index documents {
1362                field embedding: dense_vector<dims: 768> [indexed<rabitq>]
1363            }
1364        "#;
1365
1366        let indexes = parse_sdl(sdl).unwrap();
1367        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1368
1369        assert_eq!(config.dim, 768);
1370        assert_eq!(config.index_type, VectorIndexType::IvfRaBitQ);
1371        assert!(config.coarse_centroids_path.is_none());
1372    }
1373
1374    #[test]
1375    fn test_dense_vector_default_index_type() {
1376        use crate::dsl::schema::VectorIndexType;
1377
1378        // When no index type specified, should default to RaBitQ (basic)
1379        let sdl = r#"
1380            index documents {
1381                field embedding: dense_vector<dims: 768> [indexed]
1382            }
1383        "#;
1384
1385        let indexes = parse_sdl(sdl).unwrap();
1386        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1387
1388        assert_eq!(config.dim, 768);
1389        assert_eq!(config.index_type, VectorIndexType::RaBitQ);
1390    }
1391
1392    #[test]
1393    fn test_dense_vector_mrl_dim() {
1394        // Test matryoshka/MRL dimension trimming (new syntax: mrl_dim in indexed<...>)
1395        let sdl = r#"
1396            index documents {
1397                field embedding: dense_vector<1536> [indexed<mrl_dim: 256>]
1398            }
1399        "#;
1400
1401        let indexes = parse_sdl(sdl).unwrap();
1402        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1403
1404        assert_eq!(config.dim, 1536);
1405        assert_eq!(config.mrl_dim, Some(256));
1406        assert_eq!(config.index_dim(), 256);
1407    }
1408
1409    #[test]
1410    fn test_dense_vector_mrl_dim_with_centroids() {
1411        // Test mrl_dim combined with other index options
1412        let sdl = r#"
1413            index documents {
1414                field embedding: dense_vector<768> [indexed<centroids: "centroids.bin", nprobe: 64, mrl_dim: 128>]
1415            }
1416        "#;
1417
1418        let indexes = parse_sdl(sdl).unwrap();
1419        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1420
1421        assert_eq!(config.dim, 768);
1422        assert_eq!(config.mrl_dim, Some(128));
1423        assert_eq!(config.index_dim(), 128);
1424        assert_eq!(
1425            config.coarse_centroids_path.as_deref(),
1426            Some("centroids.bin")
1427        );
1428        assert_eq!(config.nprobe, 64);
1429    }
1430
1431    #[test]
1432    fn test_dense_vector_no_mrl_dim() {
1433        // Test that index_dim() returns full dim when mrl_dim is not set
1434        let sdl = r#"
1435            index documents {
1436                field embedding: dense_vector<dims: 768> [indexed]
1437            }
1438        "#;
1439
1440        let indexes = parse_sdl(sdl).unwrap();
1441        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1442
1443        assert_eq!(config.dim, 768);
1444        assert_eq!(config.mrl_dim, None);
1445        assert_eq!(config.index_dim(), 768);
1446    }
1447
1448    #[test]
1449    fn test_json_field_type() {
1450        let sdl = r#"
1451            index documents {
1452                field title: text [indexed, stored]
1453                field metadata: json [stored]
1454                field extra: json
1455            }
1456        "#;
1457
1458        let indexes = parse_sdl(sdl).unwrap();
1459        let index = &indexes[0];
1460
1461        assert_eq!(index.fields.len(), 3);
1462
1463        // Check JSON field
1464        assert_eq!(index.fields[1].name, "metadata");
1465        assert!(matches!(index.fields[1].field_type, FieldType::Json));
1466        assert!(index.fields[1].stored);
1467        // JSON fields should not be indexed (enforced by add_json_field)
1468
1469        // Check default attributes for JSON field
1470        assert_eq!(index.fields[2].name, "extra");
1471        assert!(matches!(index.fields[2].field_type, FieldType::Json));
1472
1473        // Verify schema conversion
1474        let schema = index.to_schema();
1475        let metadata_field = schema.get_field("metadata").unwrap();
1476        let entry = schema.get_field_entry(metadata_field).unwrap();
1477        assert_eq!(entry.field_type, FieldType::Json);
1478        assert!(!entry.indexed); // JSON fields are never indexed
1479        assert!(entry.stored);
1480    }
1481
1482    #[test]
1483    fn test_sparse_vector_query_config() {
1484        use crate::structures::QueryWeighting;
1485
1486        let sdl = r#"
1487            index documents {
1488                field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<tokenizer: "Alibaba-NLP/gte-Qwen2-1.5B-instruct", weighting: idf>>]
1489            }
1490        "#;
1491
1492        let indexes = parse_sdl(sdl).unwrap();
1493        let index = &indexes[0];
1494
1495        assert_eq!(index.fields.len(), 1);
1496        assert_eq!(index.fields[0].name, "embedding");
1497        assert!(matches!(
1498            index.fields[0].field_type,
1499            FieldType::SparseVector
1500        ));
1501
1502        let config = index.fields[0].sparse_vector_config.as_ref().unwrap();
1503        assert_eq!(config.index_size, IndexSize::U16);
1504        assert_eq!(config.weight_quantization, WeightQuantization::UInt8);
1505
1506        // Check query config
1507        let query_config = config.query_config.as_ref().unwrap();
1508        assert_eq!(
1509            query_config.tokenizer.as_deref(),
1510            Some("Alibaba-NLP/gte-Qwen2-1.5B-instruct")
1511        );
1512        assert_eq!(query_config.weighting, QueryWeighting::Idf);
1513
1514        // Verify schema conversion preserves query config
1515        let schema = index.to_schema();
1516        let embedding_field = schema.get_field("embedding").unwrap();
1517        let entry = schema.get_field_entry(embedding_field).unwrap();
1518        let sv_config = entry.sparse_vector_config.as_ref().unwrap();
1519        let qc = sv_config.query_config.as_ref().unwrap();
1520        assert_eq!(
1521            qc.tokenizer.as_deref(),
1522            Some("Alibaba-NLP/gte-Qwen2-1.5B-instruct")
1523        );
1524        assert_eq!(qc.weighting, QueryWeighting::Idf);
1525    }
1526
1527    #[test]
1528    fn test_sparse_vector_query_config_weighting_one() {
1529        use crate::structures::QueryWeighting;
1530
1531        let sdl = r#"
1532            index documents {
1533                field embedding: sparse_vector [indexed<query<weighting: one>>]
1534            }
1535        "#;
1536
1537        let indexes = parse_sdl(sdl).unwrap();
1538        let config = indexes[0].fields[0].sparse_vector_config.as_ref().unwrap();
1539
1540        let query_config = config.query_config.as_ref().unwrap();
1541        assert!(query_config.tokenizer.is_none());
1542        assert_eq!(query_config.weighting, QueryWeighting::One);
1543    }
1544}