Skip to main content

hermes_core/dsl/sdl/
mod.rs

1//! Schema Definition Language (SDL) for Hermes
2//!
3//! A simple, readable format for defining index schemas using pest parser.
4//!
5//! # Example SDL
6//!
7//! ```text
8//! # Article index schema
9//! index articles {
10//!     # Primary text field for full-text search
11//!     field title: text [indexed, stored]
12//!
13//!     # Body content - indexed but not stored (save space)
14//!     field body: text [indexed]
15//!
16//!     # Author name
17//!     field author: text [indexed, stored]
18//!
19//!     # Publication timestamp
20//!     field published_at: i64 [indexed, stored]
21//!
22//!     # View count
23//!     field views: u64 [indexed, stored]
24//!
25//!     # Rating score
26//!     field rating: f64 [indexed, stored]
27//!
28//!     # Raw content hash (not indexed, just stored)
29//!     field content_hash: bytes [stored]
30//!
31//!     # Dense vector with IVF-RaBitQ index
32//!     field embedding: dense_vector<768> [indexed<rabitq, centroids: "centroids.bin", nprobe: 32>]
33//!
34//!     # Dense vector with ScaNN index and MRL dimension
35//!     field embedding2: dense_vector<1536> [indexed<scann, centroids: "c.bin", codebook: "pq.bin", mrl_dim: 256>]
36//! }
37//! ```
38//!
39//! # Dense Vector Index Configuration
40//!
41//! Index-related parameters for dense vectors are specified in `indexed<...>`:
42//! - `rabitq` or `scann` - index type
43//! - `centroids: "path"` - path to pre-trained centroids file
44//! - `codebook: "path"` - path to PQ codebook (ScaNN only)
45//! - `nprobe: N` - number of clusters to probe (default: 32)
46//! - `mrl_dim: N` - Matryoshka dimension for index (uses truncated vectors)
47
48use pest::Parser;
49use pest_derive::Parser;
50
51use super::query_field_router::{QueryRouterRule, RoutingMode};
52use super::schema::{FieldType, Schema, SchemaBuilder};
53use crate::Result;
54use crate::error::Error;
55
/// Pest parser for the SDL grammar.
///
/// The `Parser` derive generates the parsing code (and the `Rule` enum used
/// throughout this module) from the grammar file named in the `grammar`
/// attribute.
#[derive(Parser)]
#[grammar = "dsl/sdl/sdl.pest"]
pub struct SdlParser;
59
60use super::schema::DenseVectorConfig;
61use crate::structures::{
62    IndexSize, QueryWeighting, SparseQueryConfig, SparseVectorConfig, WeightQuantization,
63};
64
/// Parsed field definition
///
/// One `field ...` entry of an index, as produced by the parser in this module
/// and consumed by `IndexDef::to_schema`.
#[derive(Debug, Clone)]
pub struct FieldDef {
    /// Field name as written in the SDL
    pub name: String,
    /// Resolved field type (type aliases such as `string` or `int` are already mapped)
    pub field_type: FieldType,
    /// Whether the field is indexed (the parser defaults this to `true` when no attribute list is given)
    pub indexed: bool,
    /// Whether the field is stored (the parser defaults this to `true` when no attribute list is given)
    pub stored: bool,
    /// Tokenizer name for text fields (e.g., "default", "en_stem", "german")
    pub tokenizer: Option<String>,
    /// Whether this field can have multiple values (serialized as array in JSON)
    pub multi: bool,
    /// Position tracking mode for phrase queries and multi-field element tracking
    pub positions: Option<super::schema::PositionMode>,
    /// Configuration for sparse vector fields
    pub sparse_vector_config: Option<SparseVectorConfig>,
    /// Configuration for dense vector fields
    pub dense_vector_config: Option<DenseVectorConfig>,
}
83
/// Parsed index definition
///
/// One `index <name> { ... }` block of an SDL document.
#[derive(Debug, Clone)]
pub struct IndexDef {
    /// Index name as written after the `index` keyword
    pub name: String,
    /// Field definitions, in the order they were parsed
    pub fields: Vec<FieldDef>,
    /// Names listed in the index's default-fields definition (empty when none was given)
    pub default_fields: Vec<String>,
    /// Query router rules for routing queries to specific fields
    pub query_routers: Vec<QueryRouterRule>,
}
93
94impl IndexDef {
95    /// Convert to a Schema
96    pub fn to_schema(&self) -> Schema {
97        let mut builder = SchemaBuilder::default();
98
99        for field in &self.fields {
100            let f = match field.field_type {
101                FieldType::Text => {
102                    let tokenizer = field.tokenizer.as_deref().unwrap_or("default");
103                    builder.add_text_field_with_tokenizer(
104                        &field.name,
105                        field.indexed,
106                        field.stored,
107                        tokenizer,
108                    )
109                }
110                FieldType::U64 => builder.add_u64_field(&field.name, field.indexed, field.stored),
111                FieldType::I64 => builder.add_i64_field(&field.name, field.indexed, field.stored),
112                FieldType::F64 => builder.add_f64_field(&field.name, field.indexed, field.stored),
113                FieldType::Bytes => builder.add_bytes_field(&field.name, field.stored),
114                FieldType::Json => builder.add_json_field(&field.name, field.stored),
115                FieldType::SparseVector => {
116                    if let Some(config) = &field.sparse_vector_config {
117                        builder.add_sparse_vector_field_with_config(
118                            &field.name,
119                            field.indexed,
120                            field.stored,
121                            config.clone(),
122                        )
123                    } else {
124                        builder.add_sparse_vector_field(&field.name, field.indexed, field.stored)
125                    }
126                }
127                FieldType::DenseVector => {
128                    // Dense vector dimension must be specified via config
129                    let config = field
130                        .dense_vector_config
131                        .as_ref()
132                        .expect("DenseVector field requires dimension to be specified");
133                    builder.add_dense_vector_field_with_config(
134                        &field.name,
135                        field.indexed,
136                        field.stored,
137                        config.clone(),
138                    )
139                }
140            };
141            if field.multi {
142                builder.set_multi(f, true);
143            }
144            // Set positions: explicit > auto (ordinal for multi vectors)
145            let positions = field.positions.or({
146                // Auto-set ordinal positions for multi-valued vector fields
147                if field.multi
148                    && matches!(
149                        field.field_type,
150                        FieldType::SparseVector | FieldType::DenseVector
151                    )
152                {
153                    Some(super::schema::PositionMode::Ordinal)
154                } else {
155                    None
156                }
157            });
158            if let Some(mode) = positions {
159                builder.set_positions(f, mode);
160            }
161        }
162
163        // Set default fields if specified
164        if !self.default_fields.is_empty() {
165            builder.set_default_fields(self.default_fields.clone());
166        }
167
168        // Set query routers if specified
169        if !self.query_routers.is_empty() {
170            builder.set_query_routers(self.query_routers.clone());
171        }
172
173        builder.build()
174    }
175
176    /// Create a QueryFieldRouter from the query router rules
177    ///
178    /// Returns None if there are no query router rules defined.
179    /// Returns Err if any regex pattern is invalid.
180    pub fn to_query_router(&self) -> Result<Option<super::query_field_router::QueryFieldRouter>> {
181        if self.query_routers.is_empty() {
182            return Ok(None);
183        }
184
185        super::query_field_router::QueryFieldRouter::from_rules(&self.query_routers)
186            .map(Some)
187            .map_err(Error::Schema)
188    }
189}
190
191/// Parse field type from string
192fn parse_field_type(type_str: &str) -> Result<FieldType> {
193    match type_str {
194        "text" | "string" | "str" => Ok(FieldType::Text),
195        "u64" | "uint" | "unsigned" => Ok(FieldType::U64),
196        "i64" | "int" | "integer" => Ok(FieldType::I64),
197        "f64" | "float" | "double" => Ok(FieldType::F64),
198        "bytes" | "binary" | "blob" => Ok(FieldType::Bytes),
199        "json" => Ok(FieldType::Json),
200        "sparse_vector" => Ok(FieldType::SparseVector),
201        "dense_vector" | "vector" => Ok(FieldType::DenseVector),
202        _ => Err(Error::Schema(format!("Unknown field type: {}", type_str))),
203    }
204}
205
/// Index configuration parsed from indexed<...> attribute
///
/// Every member is optional: `None` means the parameter was not given, so the
/// target vector config keeps whatever value it already had when this is
/// applied to it.
#[derive(Debug, Clone, Default)]
struct IndexConfig {
    // Dense vector index params
    /// Vector index type (given bare, e.g. `scann`, or as `index: scann`)
    index_type: Option<super::schema::VectorIndexType>,
    /// Value of `num_clusters: N`
    num_clusters: Option<usize>,
    /// Value of `nprobe: N`
    nprobe: Option<usize>,
    /// Value of `mrl_dim: N`
    mrl_dim: Option<usize>,
    /// Value of `build_threshold: N`
    build_threshold: Option<usize>,
    // Sparse vector index params
    /// Value of `quantization: ...`
    quantization: Option<WeightQuantization>,
    /// Value of `weight_threshold: ...`
    weight_threshold: Option<f32>,
    // Sparse vector query-time config
    /// Tokenizer given inside a `query<...>` block
    query_tokenizer: Option<String>,
    /// Weighting given inside a `query<...>` block
    query_weighting: Option<QueryWeighting>,
    // Position tracking mode for phrase queries
    positions: Option<super::schema::PositionMode>,
}
223
224/// Parse attributes from pest pair
225/// Returns (indexed, stored, multi, index_config)
226/// positions is now inside index_config (via indexed<positions> or indexed<ordinal> etc.)
227/// multi is now inside stored<multi>
228fn parse_attributes(pair: pest::iterators::Pair<Rule>) -> (bool, bool, bool, Option<IndexConfig>) {
229    let mut indexed = false;
230    let mut stored = false;
231    let mut multi = false;
232    let mut index_config = None;
233
234    for attr in pair.into_inner() {
235        if attr.as_rule() == Rule::attribute {
236            // attribute = { indexed_with_config | "indexed" | stored_with_config | "stored" }
237            let mut found_config = false;
238            for inner in attr.clone().into_inner() {
239                match inner.as_rule() {
240                    Rule::indexed_with_config => {
241                        indexed = true;
242                        index_config = Some(parse_index_config(inner));
243                        found_config = true;
244                        break;
245                    }
246                    Rule::stored_with_config => {
247                        stored = true;
248                        multi = true; // stored<multi>
249                        found_config = true;
250                        break;
251                    }
252                    _ => {}
253                }
254            }
255            if !found_config {
256                // Simple attribute
257                match attr.as_str() {
258                    "indexed" => indexed = true,
259                    "stored" => stored = true,
260                    _ => {}
261                }
262            }
263        }
264    }
265
266    (indexed, stored, multi, index_config)
267}
268
269/// Parse index configuration from indexed<...> attribute
270fn parse_index_config(pair: pest::iterators::Pair<Rule>) -> IndexConfig {
271    let mut config = IndexConfig::default();
272
273    // indexed_with_config = { "indexed" ~ "<" ~ index_config_params ~ ">" }
274    // index_config_params = { index_config_param ~ ("," ~ index_config_param)* }
275    // index_config_param = { index_type_kwarg | centroids_kwarg | codebook_kwarg | nprobe_kwarg | index_type_spec }
276
277    for inner in pair.into_inner() {
278        if inner.as_rule() == Rule::index_config_params {
279            for param in inner.into_inner() {
280                if param.as_rule() == Rule::index_config_param {
281                    for p in param.into_inner() {
282                        parse_single_index_config_param(&mut config, p);
283                    }
284                }
285            }
286        }
287    }
288
289    config
290}
291
292/// Parse a single index config parameter
293fn parse_single_index_config_param(config: &mut IndexConfig, p: pest::iterators::Pair<Rule>) {
294    use super::schema::VectorIndexType;
295
296    match p.as_rule() {
297        Rule::index_type_spec => {
298            config.index_type = Some(match p.as_str() {
299                "flat" => VectorIndexType::Flat,
300                "rabitq" => VectorIndexType::RaBitQ,
301                "ivf_rabitq" => VectorIndexType::IvfRaBitQ,
302                "scann" => VectorIndexType::ScaNN,
303                _ => VectorIndexType::RaBitQ,
304            });
305        }
306        Rule::index_type_kwarg => {
307            // index_type_kwarg = { "index" ~ ":" ~ index_type_spec }
308            if let Some(t) = p.into_inner().next() {
309                config.index_type = Some(match t.as_str() {
310                    "flat" => VectorIndexType::Flat,
311                    "rabitq" => VectorIndexType::RaBitQ,
312                    "ivf_rabitq" => VectorIndexType::IvfRaBitQ,
313                    "scann" => VectorIndexType::ScaNN,
314                    _ => VectorIndexType::RaBitQ,
315                });
316            }
317        }
318        Rule::num_clusters_kwarg => {
319            // num_clusters_kwarg = { "num_clusters" ~ ":" ~ num_clusters_spec }
320            if let Some(n) = p.into_inner().next() {
321                config.num_clusters = Some(n.as_str().parse().unwrap_or(256));
322            }
323        }
324        Rule::build_threshold_kwarg => {
325            // build_threshold_kwarg = { "build_threshold" ~ ":" ~ build_threshold_spec }
326            if let Some(n) = p.into_inner().next() {
327                config.build_threshold = Some(n.as_str().parse().unwrap_or(10000));
328            }
329        }
330        Rule::nprobe_kwarg => {
331            // nprobe_kwarg = { "nprobe" ~ ":" ~ nprobe_spec }
332            if let Some(n) = p.into_inner().next() {
333                config.nprobe = Some(n.as_str().parse().unwrap_or(32));
334            }
335        }
336        Rule::mrl_dim_kwarg => {
337            // mrl_dim_kwarg = { "mrl_dim" ~ ":" ~ mrl_dim_spec }
338            if let Some(n) = p.into_inner().next() {
339                config.mrl_dim = Some(n.as_str().parse().unwrap_or(0));
340            }
341        }
342        Rule::quantization_kwarg => {
343            // quantization_kwarg = { "quantization" ~ ":" ~ quantization_spec }
344            if let Some(q) = p.into_inner().next() {
345                config.quantization = Some(match q.as_str() {
346                    "float32" | "f32" => WeightQuantization::Float32,
347                    "float16" | "f16" => WeightQuantization::Float16,
348                    "uint8" | "u8" => WeightQuantization::UInt8,
349                    "uint4" | "u4" => WeightQuantization::UInt4,
350                    _ => WeightQuantization::default(),
351                });
352            }
353        }
354        Rule::weight_threshold_kwarg => {
355            // weight_threshold_kwarg = { "weight_threshold" ~ ":" ~ weight_threshold_spec }
356            if let Some(t) = p.into_inner().next() {
357                config.weight_threshold = Some(t.as_str().parse().unwrap_or(0.0));
358            }
359        }
360        Rule::query_config_block => {
361            // query_config_block = { "query" ~ "<" ~ query_config_params ~ ">" }
362            parse_query_config_block(config, p);
363        }
364        Rule::positions_kwarg => {
365            // positions_kwarg = { "positions" | "ordinal" | "token_position" }
366            use super::schema::PositionMode;
367            config.positions = Some(match p.as_str() {
368                "ordinal" => PositionMode::Ordinal,
369                "token_position" => PositionMode::TokenPosition,
370                _ => PositionMode::Full, // "positions" or any other value defaults to Full
371            });
372        }
373        _ => {}
374    }
375}
376
377/// Parse query configuration block: query<tokenizer: "...", weighting: idf>
378fn parse_query_config_block(config: &mut IndexConfig, pair: pest::iterators::Pair<Rule>) {
379    for inner in pair.into_inner() {
380        if inner.as_rule() == Rule::query_config_params {
381            for param in inner.into_inner() {
382                if param.as_rule() == Rule::query_config_param {
383                    for p in param.into_inner() {
384                        match p.as_rule() {
385                            Rule::query_tokenizer_kwarg => {
386                                // query_tokenizer_kwarg = { "tokenizer" ~ ":" ~ tokenizer_path }
387                                if let Some(path) = p.into_inner().next()
388                                    && let Some(inner_path) = path.into_inner().next()
389                                {
390                                    config.query_tokenizer = Some(inner_path.as_str().to_string());
391                                }
392                            }
393                            Rule::query_weighting_kwarg => {
394                                // query_weighting_kwarg = { "weighting" ~ ":" ~ weighting_spec }
395                                if let Some(w) = p.into_inner().next() {
396                                    config.query_weighting = Some(match w.as_str() {
397                                        "one" => QueryWeighting::One,
398                                        "idf" => QueryWeighting::Idf,
399                                        _ => QueryWeighting::One,
400                                    });
401                                }
402                            }
403                            _ => {}
404                        }
405                    }
406                }
407            }
408        }
409    }
410}
411
412/// Parse a field definition from pest pair
413fn parse_field_def(pair: pest::iterators::Pair<Rule>) -> Result<FieldDef> {
414    let mut inner = pair.into_inner();
415
416    let name = inner
417        .next()
418        .ok_or_else(|| Error::Schema("Missing field name".to_string()))?
419        .as_str()
420        .to_string();
421
422    let field_type_str = inner
423        .next()
424        .ok_or_else(|| Error::Schema("Missing field type".to_string()))?
425        .as_str();
426
427    let field_type = parse_field_type(field_type_str)?;
428
429    // Parse optional tokenizer spec, sparse_vector_config, dense_vector_config, and attributes
430    let mut tokenizer = None;
431    let mut sparse_vector_config = None;
432    let mut dense_vector_config = None;
433    let mut indexed = true;
434    let mut stored = true;
435    let mut multi = false;
436    let mut index_config: Option<IndexConfig> = None;
437
438    for item in inner {
439        match item.as_rule() {
440            Rule::tokenizer_spec => {
441                // Extract tokenizer name from <name>
442                if let Some(tok_name) = item.into_inner().next() {
443                    tokenizer = Some(tok_name.as_str().to_string());
444                }
445            }
446            Rule::sparse_vector_config => {
447                // Parse named parameters: <index_size: u16, quantization: uint8, weight_threshold: 0.1>
448                sparse_vector_config = Some(parse_sparse_vector_config(item));
449            }
450            Rule::dense_vector_config => {
451                // Parse dense_vector_params (keyword or positional) - only dims and mrl_dim
452                dense_vector_config = Some(parse_dense_vector_config(item));
453            }
454            Rule::attributes => {
455                let (idx, sto, mul, idx_cfg) = parse_attributes(item);
456                indexed = idx;
457                stored = sto;
458                multi = mul;
459                index_config = idx_cfg;
460            }
461            _ => {}
462        }
463    }
464
465    // Merge index config into vector configs if both exist
466    let mut positions = None;
467    if let Some(idx_cfg) = index_config {
468        positions = idx_cfg.positions;
469        if let Some(ref mut dv_config) = dense_vector_config {
470            apply_index_config_to_dense_vector(dv_config, idx_cfg);
471        } else if field_type == FieldType::SparseVector {
472            // For sparse vectors, create default config if not present and apply index params
473            let sv_config = sparse_vector_config.get_or_insert(SparseVectorConfig::default());
474            apply_index_config_to_sparse_vector(sv_config, idx_cfg);
475        }
476    }
477
478    Ok(FieldDef {
479        name,
480        field_type,
481        indexed,
482        stored,
483        tokenizer,
484        multi,
485        positions,
486        sparse_vector_config,
487        dense_vector_config,
488    })
489}
490
491/// Apply index configuration from indexed<...> to DenseVectorConfig
492fn apply_index_config_to_dense_vector(config: &mut DenseVectorConfig, idx_cfg: IndexConfig) {
493    // Apply index type if specified
494    if let Some(index_type) = idx_cfg.index_type {
495        config.index_type = index_type;
496    }
497
498    // Apply num_clusters for IVF-based indexes
499    if idx_cfg.num_clusters.is_some() {
500        config.num_clusters = idx_cfg.num_clusters;
501    }
502
503    // Apply nprobe if specified
504    if let Some(nprobe) = idx_cfg.nprobe {
505        config.nprobe = nprobe;
506    }
507
508    // Apply mrl_dim if specified
509    if idx_cfg.mrl_dim.is_some() {
510        config.mrl_dim = idx_cfg.mrl_dim;
511    }
512
513    // Apply build_threshold if specified
514    if idx_cfg.build_threshold.is_some() {
515        config.build_threshold = idx_cfg.build_threshold;
516    }
517}
518
519/// Parse sparse_vector_config - only index_size (positional)
520/// Example: <u16> or <u32>
521fn parse_sparse_vector_config(pair: pest::iterators::Pair<Rule>) -> SparseVectorConfig {
522    let mut index_size = IndexSize::default();
523
524    // Parse positional index_size_spec
525    for inner in pair.into_inner() {
526        if inner.as_rule() == Rule::index_size_spec {
527            index_size = match inner.as_str() {
528                "u16" => IndexSize::U16,
529                "u32" => IndexSize::U32,
530                _ => IndexSize::default(),
531            };
532        }
533    }
534
535    SparseVectorConfig {
536        index_size,
537        weight_quantization: WeightQuantization::default(),
538        weight_threshold: 0.0,
539        posting_list_pruning: None,
540        query_config: None,
541    }
542}
543
544/// Apply index configuration from indexed<...> to SparseVectorConfig
545fn apply_index_config_to_sparse_vector(config: &mut SparseVectorConfig, idx_cfg: IndexConfig) {
546    if let Some(q) = idx_cfg.quantization {
547        config.weight_quantization = q;
548    }
549    if let Some(t) = idx_cfg.weight_threshold {
550        config.weight_threshold = t;
551    }
552    // Apply query-time configuration if present
553    if idx_cfg.query_tokenizer.is_some() || idx_cfg.query_weighting.is_some() {
554        let query_config = config
555            .query_config
556            .get_or_insert(SparseQueryConfig::default());
557        if let Some(tokenizer) = idx_cfg.query_tokenizer {
558            query_config.tokenizer = Some(tokenizer);
559        }
560        if let Some(weighting) = idx_cfg.query_weighting {
561            query_config.weighting = weighting;
562        }
563    }
564}
565
566/// Parse dense_vector_config - only dims
567/// All index-related params (including mrl_dim) are now in indexed<...> attribute
568fn parse_dense_vector_config(pair: pest::iterators::Pair<Rule>) -> DenseVectorConfig {
569    let mut dim: usize = 0;
570
571    // Navigate to dense_vector_params
572    for params in pair.into_inner() {
573        if params.as_rule() == Rule::dense_vector_params {
574            for inner in params.into_inner() {
575                match inner.as_rule() {
576                    Rule::dense_vector_keyword_params => {
577                        // Parse keyword args: dims: N
578                        for kwarg in inner.into_inner() {
579                            if kwarg.as_rule() == Rule::dims_kwarg
580                                && let Some(d) = kwarg.into_inner().next()
581                            {
582                                dim = d.as_str().parse().unwrap_or(0);
583                            }
584                        }
585                    }
586                    Rule::dense_vector_positional_params => {
587                        // Parse positional: just dimension
588                        if let Some(dim_pair) = inner.into_inner().next() {
589                            dim = dim_pair.as_str().parse().unwrap_or(0);
590                        }
591                    }
592                    _ => {}
593                }
594            }
595        }
596    }
597
598    DenseVectorConfig::new(dim)
599}
600
601/// Parse default_fields definition
602fn parse_default_fields_def(pair: pest::iterators::Pair<Rule>) -> Vec<String> {
603    pair.into_inner().map(|p| p.as_str().to_string()).collect()
604}
605
606/// Parse a query router definition
607fn parse_query_router_def(pair: pest::iterators::Pair<Rule>) -> Result<QueryRouterRule> {
608    let mut pattern = String::new();
609    let mut substitution = String::new();
610    let mut target_field = String::new();
611    let mut mode = RoutingMode::Additional;
612
613    for prop in pair.into_inner() {
614        if prop.as_rule() != Rule::query_router_prop {
615            continue;
616        }
617
618        for inner in prop.into_inner() {
619            match inner.as_rule() {
620                Rule::query_router_pattern => {
621                    if let Some(regex_str) = inner.into_inner().next() {
622                        pattern = parse_string_value(regex_str);
623                    }
624                }
625                Rule::query_router_substitution => {
626                    if let Some(quoted) = inner.into_inner().next() {
627                        substitution = parse_string_value(quoted);
628                    }
629                }
630                Rule::query_router_target => {
631                    if let Some(ident) = inner.into_inner().next() {
632                        target_field = ident.as_str().to_string();
633                    }
634                }
635                Rule::query_router_mode => {
636                    if let Some(mode_val) = inner.into_inner().next() {
637                        mode = match mode_val.as_str() {
638                            "exclusive" => RoutingMode::Exclusive,
639                            "additional" => RoutingMode::Additional,
640                            _ => RoutingMode::Additional,
641                        };
642                    }
643                }
644                _ => {}
645            }
646        }
647    }
648
649    if pattern.is_empty() {
650        return Err(Error::Schema("query_router missing 'pattern'".to_string()));
651    }
652    if substitution.is_empty() {
653        return Err(Error::Schema(
654            "query_router missing 'substitution'".to_string(),
655        ));
656    }
657    if target_field.is_empty() {
658        return Err(Error::Schema(
659            "query_router missing 'target_field'".to_string(),
660        ));
661    }
662
663    Ok(QueryRouterRule {
664        pattern,
665        substitution,
666        target_field,
667        mode,
668    })
669}
670
671/// Parse a string value from quoted_string, raw_string, or regex_string
672fn parse_string_value(pair: pest::iterators::Pair<Rule>) -> String {
673    let s = pair.as_str();
674    match pair.as_rule() {
675        Rule::regex_string => {
676            // regex_string contains either raw_string or quoted_string
677            if let Some(inner) = pair.into_inner().next() {
678                parse_string_value(inner)
679            } else {
680                s.to_string()
681            }
682        }
683        Rule::raw_string => {
684            // r"..." - strip r" prefix and " suffix
685            s[2..s.len() - 1].to_string()
686        }
687        Rule::quoted_string => {
688            // "..." - strip quotes and handle escapes
689            let inner = &s[1..s.len() - 1];
690            // Simple escape handling
691            inner
692                .replace("\\n", "\n")
693                .replace("\\t", "\t")
694                .replace("\\\"", "\"")
695                .replace("\\\\", "\\")
696        }
697        _ => s.to_string(),
698    }
699}
700
701/// Parse an index definition from pest pair
702fn parse_index_def(pair: pest::iterators::Pair<Rule>) -> Result<IndexDef> {
703    let mut inner = pair.into_inner();
704
705    let name = inner
706        .next()
707        .ok_or_else(|| Error::Schema("Missing index name".to_string()))?
708        .as_str()
709        .to_string();
710
711    let mut fields = Vec::new();
712    let mut default_fields = Vec::new();
713    let mut query_routers = Vec::new();
714
715    for item in inner {
716        match item.as_rule() {
717            Rule::field_def => {
718                fields.push(parse_field_def(item)?);
719            }
720            Rule::default_fields_def => {
721                default_fields = parse_default_fields_def(item);
722            }
723            Rule::query_router_def => {
724                query_routers.push(parse_query_router_def(item)?);
725            }
726            _ => {}
727        }
728    }
729
730    Ok(IndexDef {
731        name,
732        fields,
733        default_fields,
734        query_routers,
735    })
736}
737
738/// Parse SDL from a string
739pub fn parse_sdl(input: &str) -> Result<Vec<IndexDef>> {
740    let pairs = SdlParser::parse(Rule::file, input)
741        .map_err(|e| Error::Schema(format!("Parse error: {}", e)))?;
742
743    let mut indexes = Vec::new();
744
745    for pair in pairs {
746        if pair.as_rule() == Rule::file {
747            for inner in pair.into_inner() {
748                if inner.as_rule() == Rule::index_def {
749                    indexes.push(parse_index_def(inner)?);
750                }
751            }
752        }
753    }
754
755    Ok(indexes)
756}
757
758/// Parse SDL and return a single index definition
759pub fn parse_single_index(input: &str) -> Result<IndexDef> {
760    let indexes = parse_sdl(input)?;
761
762    if indexes.is_empty() {
763        return Err(Error::Schema("No index definition found".to_string()));
764    }
765
766    if indexes.len() > 1 {
767        return Err(Error::Schema(
768            "Multiple index definitions found, expected one".to_string(),
769        ));
770    }
771
772    Ok(indexes.into_iter().next().unwrap())
773}
774
#[cfg(test)]
mod tests {
    // Parser-level tests for the SDL front end. Each test feeds an SDL snippet
    // to `parse_sdl` and inspects the parsed index definitions; a few also
    // convert the result with `to_schema()` and check the resulting entries.
    use super::*;

    /// Parses a two-field index and checks the index name plus the name, type
    /// and `indexed`/`stored` flags of each field.
    #[test]
    fn test_parse_simple_schema() {
        let sdl = r#"
            index articles {
                field title: text [indexed, stored]
                field body: text [indexed]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        assert_eq!(indexes.len(), 1);

        let index = &indexes[0];
        assert_eq!(index.name, "articles");
        assert_eq!(index.fields.len(), 2);

        assert_eq!(index.fields[0].name, "title");
        assert_eq!(index.fields[0].field_type, FieldType::Text);
        assert!(index.fields[0].indexed);
        assert!(index.fields[0].stored);

        assert_eq!(index.fields[1].name, "body");
        assert_eq!(index.fields[1].field_type, FieldType::Text);
        assert!(index.fields[1].indexed);
        assert!(!index.fields[1].stored);
    }

    /// Checks that each canonical scalar type keyword maps to the matching
    /// `FieldType` variant, in declaration order.
    #[test]
    fn test_parse_all_field_types() {
        let sdl = r#"
            index test {
                field text_field: text [indexed, stored]
                field u64_field: u64 [indexed, stored]
                field i64_field: i64 [indexed, stored]
                field f64_field: f64 [indexed, stored]
                field bytes_field: bytes [stored]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let index = &indexes[0];

        assert_eq!(index.fields[0].field_type, FieldType::Text);
        assert_eq!(index.fields[1].field_type, FieldType::U64);
        assert_eq!(index.fields[2].field_type, FieldType::I64);
        assert_eq!(index.fields[3].field_type, FieldType::F64);
        assert_eq!(index.fields[4].field_type, FieldType::Bytes);
    }

    /// Checks that `#` comments (standalone lines and one trailing a field
    /// declaration) do not affect the number of parsed fields.
    #[test]
    fn test_parse_with_comments() {
        // NOTE(review): the fixture's wording ("not supported yet") looks stale;
        // this test unwraps the parse result and expects both fields, so it only
        // passes if the trailing comment on the `body` line is accepted.
        let sdl = r#"
            # This is a comment
            index articles {
                # Title field
                field title: text [indexed, stored]
                field body: text [indexed] # inline comment not supported yet
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        assert_eq!(indexes[0].fields.len(), 2);
    }

    /// Checks that the alias keywords (`string`, `int`, `uint`, `float`,
    /// `binary`) resolve to the same `FieldType` variants as the canonical names.
    #[test]
    fn test_parse_type_aliases() {
        let sdl = r#"
            index test {
                field a: string [indexed]
                field b: int [indexed]
                field c: uint [indexed]
                field d: float [indexed]
                field e: binary [stored]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let index = &indexes[0];

        assert_eq!(index.fields[0].field_type, FieldType::Text);
        assert_eq!(index.fields[1].field_type, FieldType::I64);
        assert_eq!(index.fields[2].field_type, FieldType::U64);
        assert_eq!(index.fields[3].field_type, FieldType::F64);
        assert_eq!(index.fields[4].field_type, FieldType::Bytes);
    }

    /// Converts a parsed index with `to_schema()` and checks field lookup by
    /// name, including a miss for an undeclared name.
    #[test]
    fn test_to_schema() {
        let sdl = r#"
            index articles {
                field title: text [indexed, stored]
                field views: u64 [indexed, stored]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let schema = indexes[0].to_schema();

        assert!(schema.get_field("title").is_some());
        assert!(schema.get_field("views").is_some());
        assert!(schema.get_field("nonexistent").is_none());
    }

    /// A field declared without an attribute list is expected to be both
    /// indexed and stored.
    #[test]
    fn test_default_attributes() {
        let sdl = r#"
            index test {
                field title: text
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let field = &indexes[0].fields[0];

        // Default should be indexed and stored
        assert!(field.indexed);
        assert!(field.stored);
    }

    /// Checks that `parse_sdl` returns every index definition in the input, in
    /// source order.
    #[test]
    fn test_multiple_indexes() {
        let sdl = r#"
            index articles {
                field title: text [indexed, stored]
            }

            index users {
                field name: text [indexed, stored]
                field email: text [indexed, stored]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        assert_eq!(indexes.len(), 2);
        assert_eq!(indexes[0].name, "articles");
        assert_eq!(indexes[1].name, "users");
    }

    /// Checks that `text<name>` records the tokenizer name on the field and
    /// that a bare `text` leaves it unset.
    #[test]
    fn test_tokenizer_spec() {
        let sdl = r#"
            index articles {
                field title: text<en_stem> [indexed, stored]
                field body: text<default> [indexed]
                field author: text [indexed, stored]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let index = &indexes[0];

        assert_eq!(index.fields[0].name, "title");
        assert_eq!(index.fields[0].tokenizer.as_deref(), Some("en_stem"));

        assert_eq!(index.fields[1].name, "body");
        assert_eq!(index.fields[1].tokenizer.as_deref(), Some("default"));

        assert_eq!(index.fields[2].name, "author");
        assert_eq!(index.fields[2].tokenizer, None); // No tokenizer specified
    }

    /// Checks that the tokenizer name survives conversion to a schema and is
    /// readable from the field entry.
    #[test]
    fn test_tokenizer_in_schema() {
        let sdl = r#"
            index articles {
                field title: text<german> [indexed, stored]
                field body: text<en_stem> [indexed]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let schema = indexes[0].to_schema();

        let title_field = schema.get_field("title").unwrap();
        let title_entry = schema.get_field_entry(title_field).unwrap();
        assert_eq!(title_entry.tokenizer.as_deref(), Some("german"));

        let body_field = schema.get_field("body").unwrap();
        let body_entry = schema.get_field_entry(body_field).unwrap();
        assert_eq!(body_entry.tokenizer.as_deref(), Some("en_stem"));
    }

    /// Parses a single `query_router` block with an escaped (non-raw) pattern
    /// string and checks all four router properties.
    #[test]
    fn test_query_router_basic() {
        // The router targets `uris`, so the fixture declares a field with that
        // name (matching the other router tests in this module).
        let sdl = r#"
            index documents {
                field title: text [indexed, stored]
                field uris: text [indexed, stored]

                query_router {
                    pattern: "10\\.\\d{4,}/[^\\s]+"
                    substitution: "doi://{0}"
                    target_field: uris
                    mode: exclusive
                }
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let index = &indexes[0];

        assert_eq!(index.query_routers.len(), 1);
        let router = &index.query_routers[0];
        // The doubled backslashes in the SDL string are expected to come back
        // as single backslashes.
        assert_eq!(router.pattern, r"10\.\d{4,}/[^\s]+");
        assert_eq!(router.substitution, "doi://{0}");
        assert_eq!(router.target_field, "uris");
        assert_eq!(router.mode, RoutingMode::Exclusive);
    }

    /// Checks that an SDL raw string (`r"..."`) pattern is taken verbatim and
    /// that `mode: additional` is recognised.
    #[test]
    fn test_query_router_raw_string() {
        let sdl = r#"
            index documents {
                field uris: text [indexed, stored]

                query_router {
                    pattern: r"^pmid:(\d+)$"
                    substitution: "pubmed://{1}"
                    target_field: uris
                    mode: additional
                }
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let router = &indexes[0].query_routers[0];

        assert_eq!(router.pattern, r"^pmid:(\d+)$");
        assert_eq!(router.substitution, "pubmed://{1}");
        assert_eq!(router.mode, RoutingMode::Additional);
    }

    /// Checks that several `query_router` blocks in one index are all collected.
    #[test]
    fn test_multiple_query_routers() {
        let sdl = r#"
            index documents {
                field uris: text [indexed, stored]

                query_router {
                    pattern: r"^doi:(10\.\d{4,}/[^\s]+)$"
                    substitution: "doi://{1}"
                    target_field: uris
                    mode: exclusive
                }

                query_router {
                    pattern: r"^pmid:(\d+)$"
                    substitution: "pubmed://{1}"
                    target_field: uris
                    mode: exclusive
                }

                query_router {
                    pattern: r"^arxiv:(\d+\.\d+)$"
                    substitution: "arxiv://{1}"
                    target_field: uris
                    mode: additional
                }
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        assert_eq!(indexes[0].query_routers.len(), 3);
    }

    /// A router without an explicit `mode` is expected to use
    /// `RoutingMode::Additional`.
    #[test]
    fn test_query_router_default_mode() {
        let sdl = r#"
            index documents {
                field uris: text [indexed, stored]

                query_router {
                    pattern: r"test"
                    substitution: "{0}"
                    target_field: uris
                }
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        // Default mode should be Additional
        assert_eq!(indexes[0].query_routers[0].mode, RoutingMode::Additional);
    }

    /// Checks that `stored<multi>` sets the `multi` flag on the parsed field
    /// and that the flag is carried over by `to_schema()`.
    #[test]
    fn test_multi_attribute() {
        let sdl = r#"
            index documents {
                field uris: text [indexed, stored<multi>]
                field title: text [indexed, stored]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        assert_eq!(indexes.len(), 1);

        let fields = &indexes[0].fields;
        assert_eq!(fields.len(), 2);

        // uris should have multi=true
        assert_eq!(fields[0].name, "uris");
        assert!(fields[0].multi, "uris field should have multi=true");

        // title should have multi=false
        assert_eq!(fields[1].name, "title");
        assert!(!fields[1].multi, "title field should have multi=false");

        // Verify schema conversion preserves multi attribute
        let schema = indexes[0].to_schema();
        let uris_field = schema.get_field("uris").unwrap();
        let title_field = schema.get_field("title").unwrap();

        assert!(schema.get_field_entry(uris_field).unwrap().multi);
        assert!(!schema.get_field_entry(title_field).unwrap().multi);
    }

    /// A bare `sparse_vector` field parses to `FieldType::SparseVector` with no
    /// sparse-vector config attached.
    #[test]
    fn test_sparse_vector_field() {
        let sdl = r#"
            index documents {
                field embedding: sparse_vector [indexed, stored]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        assert_eq!(indexes.len(), 1);
        assert_eq!(indexes[0].fields.len(), 1);
        assert_eq!(indexes[0].fields[0].name, "embedding");
        assert_eq!(indexes[0].fields[0].field_type, FieldType::SparseVector);
        assert!(indexes[0].fields[0].sparse_vector_config.is_none());
    }

    /// Checks that the index-size type parameter and the `quantization` option
    /// end up in the sparse-vector config.
    #[test]
    fn test_sparse_vector_with_config() {
        let sdl = r#"
            index documents {
                field embedding: sparse_vector<u16> [indexed<quantization: uint8>, stored]
                field dense: sparse_vector<u32> [indexed<quantization: float32>]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        assert_eq!(indexes[0].fields.len(), 2);

        // First field: u16 indices, uint8 quantization
        let f1 = &indexes[0].fields[0];
        assert_eq!(f1.name, "embedding");
        let config1 = f1.sparse_vector_config.as_ref().unwrap();
        assert_eq!(config1.index_size, IndexSize::U16);
        assert_eq!(config1.weight_quantization, WeightQuantization::UInt8);

        // Second field: u32 indices, float32 quantization
        let f2 = &indexes[0].fields[1];
        assert_eq!(f2.name, "dense");
        let config2 = f2.sparse_vector_config.as_ref().unwrap();
        assert_eq!(config2.index_size, IndexSize::U32);
        assert_eq!(config2.weight_quantization, WeightQuantization::Float32);
    }

    /// Checks that `weight_threshold` is parsed alongside `quantization`.
    /// Thresholds are compared with a tolerance because they are floats.
    #[test]
    fn test_sparse_vector_with_weight_threshold() {
        let sdl = r#"
            index documents {
                field embedding: sparse_vector<u16> [indexed<quantization: uint8, weight_threshold: 0.1>, stored]
                field embedding2: sparse_vector<u32> [indexed<quantization: float16, weight_threshold: 0.05>]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        assert_eq!(indexes[0].fields.len(), 2);

        // First field: u16 indices, uint8 quantization, threshold 0.1
        let f1 = &indexes[0].fields[0];
        assert_eq!(f1.name, "embedding");
        let config1 = f1.sparse_vector_config.as_ref().unwrap();
        assert_eq!(config1.index_size, IndexSize::U16);
        assert_eq!(config1.weight_quantization, WeightQuantization::UInt8);
        assert!((config1.weight_threshold - 0.1).abs() < 0.001);

        // Second field: u32 indices, float16 quantization, threshold 0.05
        let f2 = &indexes[0].fields[1];
        assert_eq!(f2.name, "embedding2");
        let config2 = f2.sparse_vector_config.as_ref().unwrap();
        assert_eq!(config2.index_size, IndexSize::U32);
        assert_eq!(config2.weight_quantization, WeightQuantization::Float16);
        assert!((config2.weight_threshold - 0.05).abs() < 0.001);
    }

    /// Checks that `dense_vector<N>` yields a `DenseVector` field whose config
    /// carries the dimension `N`.
    #[test]
    fn test_dense_vector_field() {
        let sdl = r#"
            index documents {
                field embedding: dense_vector<768> [indexed, stored]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        assert_eq!(indexes.len(), 1);
        assert_eq!(indexes[0].fields.len(), 1);

        let f = &indexes[0].fields[0];
        assert_eq!(f.name, "embedding");
        assert_eq!(f.field_type, FieldType::DenseVector);

        let config = f.dense_vector_config.as_ref().unwrap();
        assert_eq!(config.dim, 768);
    }

    /// Checks that `vector<N>` is accepted as an alias for `dense_vector<N>`.
    #[test]
    fn test_dense_vector_alias() {
        let sdl = r#"
            index documents {
                field embedding: vector<1536> [indexed]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        assert_eq!(indexes[0].fields[0].field_type, FieldType::DenseVector);
        assert_eq!(
            indexes[0].fields[0]
                .dense_vector_config
                .as_ref()
                .unwrap()
                .dim,
            1536
        );
    }

    /// Checks that `num_clusters` is parsed from `indexed<...>` and that
    /// `nprobe` falls back to 32 when omitted.
    #[test]
    fn test_dense_vector_with_num_clusters() {
        let sdl = r#"
            index documents {
                field embedding: dense_vector<768> [indexed<ivf_rabitq, num_clusters: 256>, stored]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        assert_eq!(indexes.len(), 1);

        let f = &indexes[0].fields[0];
        assert_eq!(f.name, "embedding");
        assert_eq!(f.field_type, FieldType::DenseVector);

        let config = f.dense_vector_config.as_ref().unwrap();
        assert_eq!(config.dim, 768);
        assert_eq!(config.num_clusters, Some(256));
        assert_eq!(config.nprobe, 32); // default
    }

    /// Checks that `num_clusters` and `nprobe` can both be set explicitly.
    #[test]
    fn test_dense_vector_with_num_clusters_and_nprobe() {
        let sdl = r#"
            index documents {
                field embedding: dense_vector<1536> [indexed<ivf_rabitq, num_clusters: 512, nprobe: 64>]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();

        assert_eq!(config.dim, 1536);
        assert_eq!(config.num_clusters, Some(512));
        assert_eq!(config.nprobe, 64);
    }

    /// Checks the keyword form `dense_vector<dims: N>`; with a plain `indexed`
    /// attribute no cluster count is recorded.
    #[test]
    fn test_dense_vector_keyword_syntax() {
        let sdl = r#"
            index documents {
                field embedding: dense_vector<dims: 1536> [indexed, stored]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();

        assert_eq!(config.dim, 1536);
        assert!(config.num_clusters.is_none());
    }

    /// Checks the keyword `dims:` form combined with a fully specified
    /// `indexed<...>` option list.
    #[test]
    fn test_dense_vector_keyword_syntax_full() {
        let sdl = r#"
            index documents {
                field embedding: dense_vector<dims: 1536> [indexed<ivf_rabitq, num_clusters: 256, nprobe: 64>]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();

        assert_eq!(config.dim, 1536);
        assert_eq!(config.num_clusters, Some(256));
        assert_eq!(config.nprobe, 64);
    }

    /// Checks the keyword `dims:` form with only `num_clusters` given; `nprobe`
    /// is expected to take its default of 32.
    #[test]
    fn test_dense_vector_keyword_syntax_partial() {
        let sdl = r#"
            index documents {
                field embedding: dense_vector<dims: 768> [indexed<ivf_rabitq, num_clusters: 128>]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();

        assert_eq!(config.dim, 768);
        assert_eq!(config.num_clusters, Some(128));
        assert_eq!(config.nprobe, 32); // default
    }

    /// Checks that `indexed<scann, ...>` selects `VectorIndexType::ScaNN` and
    /// keeps the accompanying options.
    #[test]
    fn test_dense_vector_scann_index() {
        use crate::dsl::schema::VectorIndexType;

        let sdl = r#"
            index documents {
                field embedding: dense_vector<dims: 768> [indexed<scann, num_clusters: 256, nprobe: 64>]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();

        assert_eq!(config.dim, 768);
        assert_eq!(config.index_type, VectorIndexType::ScaNN);
        assert_eq!(config.num_clusters, Some(256));
        assert_eq!(config.nprobe, 64);
    }

    /// Checks that `indexed<ivf_rabitq, ...>` selects
    /// `VectorIndexType::IvfRaBitQ`.
    #[test]
    fn test_dense_vector_ivf_rabitq_index() {
        use crate::dsl::schema::VectorIndexType;

        let sdl = r#"
            index documents {
                field embedding: dense_vector<dims: 1536> [indexed<ivf_rabitq, num_clusters: 512>]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();

        assert_eq!(config.dim, 1536);
        assert_eq!(config.index_type, VectorIndexType::IvfRaBitQ);
        assert_eq!(config.num_clusters, Some(512));
    }

    /// Checks that `indexed<rabitq>` selects `VectorIndexType::RaBitQ` and
    /// leaves `num_clusters` unset.
    #[test]
    fn test_dense_vector_rabitq_no_clusters() {
        use crate::dsl::schema::VectorIndexType;

        let sdl = r#"
            index documents {
                field embedding: dense_vector<dims: 768> [indexed<rabitq>]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();

        assert_eq!(config.dim, 768);
        assert_eq!(config.index_type, VectorIndexType::RaBitQ);
        assert!(config.num_clusters.is_none());
    }

    /// Checks that `indexed<flat>` selects `VectorIndexType::Flat`.
    #[test]
    fn test_dense_vector_flat_index() {
        use crate::dsl::schema::VectorIndexType;

        let sdl = r#"
            index documents {
                field embedding: dense_vector<dims: 768> [indexed<flat>]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();

        assert_eq!(config.dim, 768);
        assert_eq!(config.index_type, VectorIndexType::Flat);
    }

    /// A dense vector with a plain `indexed` attribute is expected to use
    /// `VectorIndexType::RaBitQ`.
    #[test]
    fn test_dense_vector_default_index_type() {
        use crate::dsl::schema::VectorIndexType;

        // When no index type specified, should default to RaBitQ (basic)
        let sdl = r#"
            index documents {
                field embedding: dense_vector<dims: 768> [indexed]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();

        assert_eq!(config.dim, 768);
        assert_eq!(config.index_type, VectorIndexType::RaBitQ);
    }

    /// Checks that `mrl_dim` inside `indexed<...>` is stored separately from
    /// the full dimension and is what `index_dim()` reports.
    #[test]
    fn test_dense_vector_mrl_dim() {
        // Test matryoshka/MRL dimension trimming (new syntax: mrl_dim in indexed<...>)
        let sdl = r#"
            index documents {
                field embedding: dense_vector<1536> [indexed<mrl_dim: 256>]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();

        assert_eq!(config.dim, 1536);
        assert_eq!(config.mrl_dim, Some(256));
        assert_eq!(config.index_dim(), 256);
    }

    /// Checks that `mrl_dim` can be combined with an index type,
    /// `num_clusters` and `nprobe` in the same option list.
    #[test]
    fn test_dense_vector_mrl_dim_with_num_clusters() {
        // Test mrl_dim combined with other index options
        let sdl = r#"
            index documents {
                field embedding: dense_vector<768> [indexed<ivf_rabitq, num_clusters: 256, nprobe: 64, mrl_dim: 128>]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();

        assert_eq!(config.dim, 768);
        assert_eq!(config.mrl_dim, Some(128));
        assert_eq!(config.index_dim(), 128);
        assert_eq!(config.num_clusters, Some(256));
        assert_eq!(config.nprobe, 64);
    }

    /// Without `mrl_dim`, `index_dim()` is expected to report the full
    /// dimension.
    #[test]
    fn test_dense_vector_no_mrl_dim() {
        // Test that index_dim() returns full dim when mrl_dim is not set
        let sdl = r#"
            index documents {
                field embedding: dense_vector<dims: 768> [indexed]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();

        assert_eq!(config.dim, 768);
        assert_eq!(config.mrl_dim, None);
        assert_eq!(config.index_dim(), 768);
    }

    /// Checks that `json` fields parse to `FieldType::Json` and that the
    /// converted schema entry is stored but not indexed.
    #[test]
    fn test_json_field_type() {
        let sdl = r#"
            index documents {
                field title: text [indexed, stored]
                field metadata: json [stored]
                field extra: json
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let index = &indexes[0];

        assert_eq!(index.fields.len(), 3);

        // Check JSON field
        assert_eq!(index.fields[1].name, "metadata");
        assert_eq!(index.fields[1].field_type, FieldType::Json);
        assert!(index.fields[1].stored);
        // JSON fields should not be indexed (enforced by add_json_field)

        // Check default attributes for JSON field
        assert_eq!(index.fields[2].name, "extra");
        assert_eq!(index.fields[2].field_type, FieldType::Json);

        // Verify schema conversion
        let schema = index.to_schema();
        let metadata_field = schema.get_field("metadata").unwrap();
        let entry = schema.get_field_entry(metadata_field).unwrap();
        assert_eq!(entry.field_type, FieldType::Json);
        assert!(!entry.indexed); // JSON fields are never indexed
        assert!(entry.stored);
    }

    /// Checks that a nested `query<...>` option block is parsed into the
    /// sparse-vector query config and preserved by `to_schema()`.
    #[test]
    fn test_sparse_vector_query_config() {
        use crate::structures::QueryWeighting;

        let sdl = r#"
            index documents {
                field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<tokenizer: "Alibaba-NLP/gte-Qwen2-1.5B-instruct", weighting: idf>>]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let index = &indexes[0];

        assert_eq!(index.fields.len(), 1);
        assert_eq!(index.fields[0].name, "embedding");
        assert_eq!(index.fields[0].field_type, FieldType::SparseVector);

        let config = index.fields[0].sparse_vector_config.as_ref().unwrap();
        assert_eq!(config.index_size, IndexSize::U16);
        assert_eq!(config.weight_quantization, WeightQuantization::UInt8);

        // Check query config
        let query_config = config.query_config.as_ref().unwrap();
        assert_eq!(
            query_config.tokenizer.as_deref(),
            Some("Alibaba-NLP/gte-Qwen2-1.5B-instruct")
        );
        assert_eq!(query_config.weighting, QueryWeighting::Idf);

        // Verify schema conversion preserves query config
        let schema = index.to_schema();
        let embedding_field = schema.get_field("embedding").unwrap();
        let entry = schema.get_field_entry(embedding_field).unwrap();
        let sv_config = entry.sparse_vector_config.as_ref().unwrap();
        let qc = sv_config.query_config.as_ref().unwrap();
        assert_eq!(
            qc.tokenizer.as_deref(),
            Some("Alibaba-NLP/gte-Qwen2-1.5B-instruct")
        );
        assert_eq!(qc.weighting, QueryWeighting::Idf);
    }

    /// Checks that `query<weighting: one>` on its own produces a query config
    /// with `QueryWeighting::One` and no tokenizer.
    #[test]
    fn test_sparse_vector_query_config_weighting_one() {
        use crate::structures::QueryWeighting;

        let sdl = r#"
            index documents {
                field embedding: sparse_vector [indexed<query<weighting: one>>]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let config = indexes[0].fields[0].sparse_vector_config.as_ref().unwrap();

        let query_config = config.query_config.as_ref().unwrap();
        assert!(query_config.tokenizer.is_none());
        assert_eq!(query_config.weighting, QueryWeighting::One);
    }
}