Skip to main content

hermes_core/dsl/sdl/
mod.rs

1//! Schema Definition Language (SDL) for Hermes
2//!
3//! A simple, readable format for defining index schemas using pest parser.
4//!
5//! # Example SDL
6//!
7//! ```text
8//! # Article index schema
9//! index articles {
10//!     # Primary text field for full-text search
11//!     field title: text [indexed, stored]
12//!
13//!     # Body content - indexed but not stored (save space)
14//!     field body: text [indexed]
15//!
16//!     # Author name
17//!     field author: text [indexed, stored]
18//!
19//!     # Publication timestamp
20//!     field published_at: i64 [indexed, stored]
21//!
22//!     # View count
23//!     field views: u64 [indexed, stored]
24//!
25//!     # Rating score
26//!     field rating: f64 [indexed, stored]
27//!
28//!     # Raw content hash (not indexed, just stored)
29//!     field content_hash: bytes [stored]
30//!
31//!     # Dense vector with IVF-RaBitQ index
32//!     field embedding: dense_vector<768> [indexed<rabitq, centroids: "centroids.bin", nprobe: 32>]
33//!
34//! }
35//! ```
36//!
37//! # Dense Vector Index Configuration
38//!
39//! Index-related parameters for dense vectors are specified in `indexed<...>`:
40//! - `rabitq` or `scann` - index type
41//! - `centroids: "path"` - path to pre-trained centroids file
42//! - `codebook: "path"` - path to PQ codebook (ScaNN only)
43//! - `nprobe: N` - number of clusters to probe (default: 32)
44
45use pest::Parser;
46use pest_derive::Parser;
47
48use super::query_field_router::{QueryRouterRule, RoutingMode};
49use super::schema::{DenseVectorQuantization, FieldType, Schema, SchemaBuilder};
50use crate::Result;
51use crate::error::Error;
52
/// Parser for the SDL grammar, derived by pest from `dsl/sdl/sdl.pest`.
///
/// The derive also generates the `Rule` enum that the parsing functions in
/// this module match on.
#[derive(Parser)]
#[grammar = "dsl/sdl/sdl.pest"]
pub struct SdlParser;
56
57use super::schema::DenseVectorConfig;
58use crate::structures::{
59    IndexSize, QueryWeighting, SparseQueryConfig, SparseVectorConfig, WeightQuantization,
60};
61
/// Parsed field definition
///
/// One instance is produced per `field` entry of an index definition and is
/// later turned into a schema field by `IndexDef::to_schema`.
#[derive(Debug, Clone)]
pub struct FieldDef {
    /// Field name as written in the SDL source
    pub name: String,
    /// Field type resolved from the type keyword (e.g. `text`, `u64`, `dense_vector`)
    pub field_type: FieldType,
    /// Whether the field is indexed (`indexed` or `indexed<...>` attribute)
    pub indexed: bool,
    /// Whether the field is stored (`stored` or `stored<...>` attribute)
    pub stored: bool,
    /// Tokenizer name for text fields (e.g., "default", "en_stem", "german")
    pub tokenizer: Option<String>,
    /// Whether this field can have multiple values (serialized as array in JSON)
    pub multi: bool,
    /// Position tracking mode for phrase queries and multi-field element tracking
    pub positions: Option<super::schema::PositionMode>,
    /// Configuration for sparse vector fields
    pub sparse_vector_config: Option<SparseVectorConfig>,
    /// Configuration for dense vector fields
    pub dense_vector_config: Option<DenseVectorConfig>,
    /// Whether this field has columnar fast-field storage
    pub fast: bool,
}
82
/// Parsed index definition
///
/// Holds everything read from a single `index <name> { ... }` block.
#[derive(Debug, Clone)]
pub struct IndexDef {
    /// Index name as written in the SDL source
    pub name: String,
    /// Field definitions, in declaration order
    pub fields: Vec<FieldDef>,
    /// Names listed in the index's default-fields definition (empty when absent)
    pub default_fields: Vec<String>,
    /// Query router rules for routing queries to specific fields
    pub query_routers: Vec<QueryRouterRule>,
}
92
93impl IndexDef {
94    /// Convert to a Schema
95    pub fn to_schema(&self) -> Schema {
96        let mut builder = SchemaBuilder::default();
97
98        for field in &self.fields {
99            let f = match field.field_type {
100                FieldType::Text => {
101                    let tokenizer = field.tokenizer.as_deref().unwrap_or("default");
102                    builder.add_text_field_with_tokenizer(
103                        &field.name,
104                        field.indexed,
105                        field.stored,
106                        tokenizer,
107                    )
108                }
109                FieldType::U64 => builder.add_u64_field(&field.name, field.indexed, field.stored),
110                FieldType::I64 => builder.add_i64_field(&field.name, field.indexed, field.stored),
111                FieldType::F64 => builder.add_f64_field(&field.name, field.indexed, field.stored),
112                FieldType::Bytes => builder.add_bytes_field(&field.name, field.stored),
113                FieldType::Json => builder.add_json_field(&field.name, field.stored),
114                FieldType::SparseVector => {
115                    if let Some(config) = &field.sparse_vector_config {
116                        builder.add_sparse_vector_field_with_config(
117                            &field.name,
118                            field.indexed,
119                            field.stored,
120                            config.clone(),
121                        )
122                    } else {
123                        builder.add_sparse_vector_field(&field.name, field.indexed, field.stored)
124                    }
125                }
126                FieldType::DenseVector => {
127                    // Dense vector dimension must be specified via config
128                    let config = field
129                        .dense_vector_config
130                        .as_ref()
131                        .expect("DenseVector field requires dimension to be specified");
132                    builder.add_dense_vector_field_with_config(
133                        &field.name,
134                        field.indexed,
135                        field.stored,
136                        config.clone(),
137                    )
138                }
139            };
140            if field.multi {
141                builder.set_multi(f, true);
142            }
143            if field.fast {
144                builder.set_fast(f, true);
145            }
146            // Set positions: explicit > auto (ordinal for multi vectors)
147            let positions = field.positions.or({
148                // Auto-set ordinal positions for multi-valued vector fields
149                if field.multi
150                    && matches!(
151                        field.field_type,
152                        FieldType::SparseVector | FieldType::DenseVector
153                    )
154                {
155                    Some(super::schema::PositionMode::Ordinal)
156                } else {
157                    None
158                }
159            });
160            if let Some(mode) = positions {
161                builder.set_positions(f, mode);
162            }
163        }
164
165        // Set default fields if specified
166        if !self.default_fields.is_empty() {
167            builder.set_default_fields(self.default_fields.clone());
168        }
169
170        // Set query routers if specified
171        if !self.query_routers.is_empty() {
172            builder.set_query_routers(self.query_routers.clone());
173        }
174
175        builder.build()
176    }
177
178    /// Create a QueryFieldRouter from the query router rules
179    ///
180    /// Returns None if there are no query router rules defined.
181    /// Returns Err if any regex pattern is invalid.
182    pub fn to_query_router(&self) -> Result<Option<super::query_field_router::QueryFieldRouter>> {
183        if self.query_routers.is_empty() {
184            return Ok(None);
185        }
186
187        super::query_field_router::QueryFieldRouter::from_rules(&self.query_routers)
188            .map(Some)
189            .map_err(Error::Schema)
190    }
191}
192
193/// Parse field type from string
194fn parse_field_type(type_str: &str) -> Result<FieldType> {
195    match type_str {
196        "text" | "string" | "str" => Ok(FieldType::Text),
197        "u64" | "uint" | "unsigned" => Ok(FieldType::U64),
198        "i64" | "int" | "integer" => Ok(FieldType::I64),
199        "f64" | "float" | "double" => Ok(FieldType::F64),
200        "bytes" | "binary" | "blob" => Ok(FieldType::Bytes),
201        "json" => Ok(FieldType::Json),
202        "sparse_vector" => Ok(FieldType::SparseVector),
203        "dense_vector" | "vector" => Ok(FieldType::DenseVector),
204        _ => Err(Error::Schema(format!("Unknown field type: {}", type_str))),
205    }
206}
207
/// Index configuration parsed from indexed<...> attribute
///
/// Every field is optional: `None` means the parameter was not given and the
/// target vector config keeps its existing value.
#[derive(Debug, Clone, Default)]
struct IndexConfig {
    // Dense vector index params
    /// Index type keyword (`flat`, `rabitq`, `ivf_rabitq`, `scann`)
    index_type: Option<super::schema::VectorIndexType>,
    /// Value of `num_clusters: N`
    num_clusters: Option<usize>,
    /// Value of `nprobe: N`
    nprobe: Option<usize>,
    /// Value of `build_threshold: N`
    build_threshold: Option<usize>,
    // Sparse vector index params
    /// Value of `quantization: ...`
    quantization: Option<WeightQuantization>,
    /// Value of `weight_threshold: ...`
    weight_threshold: Option<f32>,
    /// Value of `block_size: N` (rounded up to a power of two when applied)
    block_size: Option<usize>,
    /// Value of `pruning: ...` (clamped to [0.0, 1.0] when applied)
    pruning: Option<f32>,
    // Sparse vector query-time config
    /// Value of `tokenizer: ...` inside `query<...>`
    query_tokenizer: Option<String>,
    /// Value of `weighting: ...` inside `query<...>`
    query_weighting: Option<QueryWeighting>,
    /// Query-time weight threshold from `query<...>`
    query_weight_threshold: Option<f32>,
    /// Query-time maximum number of dimensions from `query<...>`
    query_max_dims: Option<usize>,
    /// Query-time pruning value from `query<...>`
    query_pruning: Option<f32>,
    // Position tracking mode for phrase queries
    positions: Option<super::schema::PositionMode>,
}
229
230/// Parse attributes from pest pair
231/// Returns (indexed, stored, multi, fast, index_config)
232/// positions is now inside index_config (via indexed<positions> or indexed<ordinal> etc.)
233/// multi is now inside stored<multi>
234fn parse_attributes(
235    pair: pest::iterators::Pair<Rule>,
236) -> (bool, bool, bool, bool, Option<IndexConfig>) {
237    let mut indexed = false;
238    let mut stored = false;
239    let mut multi = false;
240    let mut fast = false;
241    let mut index_config = None;
242
243    for attr in pair.into_inner() {
244        if attr.as_rule() == Rule::attribute {
245            // attribute = { indexed_with_config | "indexed" | stored_with_config | "stored" | "fast" }
246            let mut found_config = false;
247            for inner in attr.clone().into_inner() {
248                match inner.as_rule() {
249                    Rule::indexed_with_config => {
250                        indexed = true;
251                        index_config = Some(parse_index_config(inner));
252                        found_config = true;
253                        break;
254                    }
255                    Rule::stored_with_config => {
256                        stored = true;
257                        multi = true; // stored<multi>
258                        found_config = true;
259                        break;
260                    }
261                    _ => {}
262                }
263            }
264            if !found_config {
265                // Simple attribute
266                match attr.as_str() {
267                    "indexed" => indexed = true,
268                    "stored" => stored = true,
269                    "fast" => fast = true,
270                    _ => {}
271                }
272            }
273        }
274    }
275
276    (indexed, stored, multi, fast, index_config)
277}
278
279/// Parse index configuration from indexed<...> attribute
280fn parse_index_config(pair: pest::iterators::Pair<Rule>) -> IndexConfig {
281    let mut config = IndexConfig::default();
282
283    // indexed_with_config = { "indexed" ~ "<" ~ index_config_params ~ ">" }
284    // index_config_params = { index_config_param ~ ("," ~ index_config_param)* }
285    // index_config_param = { index_type_kwarg | centroids_kwarg | codebook_kwarg | nprobe_kwarg | index_type_spec }
286
287    for inner in pair.into_inner() {
288        if inner.as_rule() == Rule::index_config_params {
289            for param in inner.into_inner() {
290                if param.as_rule() == Rule::index_config_param {
291                    for p in param.into_inner() {
292                        parse_single_index_config_param(&mut config, p);
293                    }
294                }
295            }
296        }
297    }
298
299    config
300}
301
302/// Parse a single index config parameter
303fn parse_single_index_config_param(config: &mut IndexConfig, p: pest::iterators::Pair<Rule>) {
304    use super::schema::VectorIndexType;
305
306    match p.as_rule() {
307        Rule::index_type_spec => {
308            config.index_type = Some(match p.as_str() {
309                "flat" => VectorIndexType::Flat,
310                "rabitq" => VectorIndexType::RaBitQ,
311                "ivf_rabitq" => VectorIndexType::IvfRaBitQ,
312                "scann" => VectorIndexType::ScaNN,
313                _ => VectorIndexType::RaBitQ,
314            });
315        }
316        Rule::index_type_kwarg => {
317            // index_type_kwarg = { "index" ~ ":" ~ index_type_spec }
318            if let Some(t) = p.into_inner().next() {
319                config.index_type = Some(match t.as_str() {
320                    "flat" => VectorIndexType::Flat,
321                    "rabitq" => VectorIndexType::RaBitQ,
322                    "ivf_rabitq" => VectorIndexType::IvfRaBitQ,
323                    "scann" => VectorIndexType::ScaNN,
324                    _ => VectorIndexType::RaBitQ,
325                });
326            }
327        }
328        Rule::num_clusters_kwarg => {
329            // num_clusters_kwarg = { "num_clusters" ~ ":" ~ num_clusters_spec }
330            if let Some(n) = p.into_inner().next() {
331                config.num_clusters = Some(n.as_str().parse().unwrap_or(256));
332            }
333        }
334        Rule::build_threshold_kwarg => {
335            // build_threshold_kwarg = { "build_threshold" ~ ":" ~ build_threshold_spec }
336            if let Some(n) = p.into_inner().next() {
337                config.build_threshold = Some(n.as_str().parse().unwrap_or(10000));
338            }
339        }
340        Rule::nprobe_kwarg => {
341            // nprobe_kwarg = { "nprobe" ~ ":" ~ nprobe_spec }
342            if let Some(n) = p.into_inner().next() {
343                config.nprobe = Some(n.as_str().parse().unwrap_or(32));
344            }
345        }
346        Rule::quantization_kwarg => {
347            // quantization_kwarg = { "quantization" ~ ":" ~ quantization_spec }
348            if let Some(q) = p.into_inner().next() {
349                config.quantization = Some(match q.as_str() {
350                    "float32" | "f32" => WeightQuantization::Float32,
351                    "float16" | "f16" => WeightQuantization::Float16,
352                    "uint8" | "u8" => WeightQuantization::UInt8,
353                    "uint4" | "u4" => WeightQuantization::UInt4,
354                    _ => WeightQuantization::default(),
355                });
356            }
357        }
358        Rule::weight_threshold_kwarg => {
359            // weight_threshold_kwarg = { "weight_threshold" ~ ":" ~ weight_threshold_spec }
360            if let Some(t) = p.into_inner().next() {
361                config.weight_threshold = Some(t.as_str().parse().unwrap_or(0.0));
362            }
363        }
364        Rule::block_size_kwarg => {
365            // block_size_kwarg = { "block_size" ~ ":" ~ block_size_spec }
366            if let Some(n) = p.into_inner().next() {
367                config.block_size = Some(n.as_str().parse().unwrap_or(128));
368            }
369        }
370        Rule::pruning_kwarg => {
371            // pruning_kwarg = { "pruning" ~ ":" ~ pruning_spec }
372            if let Some(f) = p.into_inner().next() {
373                config.pruning = Some(f.as_str().parse().unwrap_or(1.0));
374            }
375        }
376        Rule::query_config_block => {
377            // query_config_block = { "query" ~ "<" ~ query_config_params ~ ">" }
378            parse_query_config_block(config, p);
379        }
380        Rule::positions_kwarg => {
381            // positions_kwarg = { "positions" | "ordinal" | "token_position" }
382            use super::schema::PositionMode;
383            config.positions = Some(match p.as_str() {
384                "ordinal" => PositionMode::Ordinal,
385                "token_position" => PositionMode::TokenPosition,
386                _ => PositionMode::Full, // "positions" or any other value defaults to Full
387            });
388        }
389        _ => {}
390    }
391}
392
393/// Parse query configuration block: query<tokenizer: "...", weighting: idf>
394fn parse_query_config_block(config: &mut IndexConfig, pair: pest::iterators::Pair<Rule>) {
395    for inner in pair.into_inner() {
396        if inner.as_rule() == Rule::query_config_params {
397            for param in inner.into_inner() {
398                if param.as_rule() == Rule::query_config_param {
399                    for p in param.into_inner() {
400                        match p.as_rule() {
401                            Rule::query_tokenizer_kwarg => {
402                                // query_tokenizer_kwarg = { "tokenizer" ~ ":" ~ tokenizer_path }
403                                if let Some(path) = p.into_inner().next()
404                                    && let Some(inner_path) = path.into_inner().next()
405                                {
406                                    config.query_tokenizer = Some(inner_path.as_str().to_string());
407                                }
408                            }
409                            Rule::query_weighting_kwarg => {
410                                // query_weighting_kwarg = { "weighting" ~ ":" ~ weighting_spec }
411                                if let Some(w) = p.into_inner().next() {
412                                    config.query_weighting = Some(match w.as_str() {
413                                        "one" => QueryWeighting::One,
414                                        "idf" => QueryWeighting::Idf,
415                                        "idf_file" => QueryWeighting::IdfFile,
416                                        _ => QueryWeighting::One,
417                                    });
418                                }
419                            }
420                            Rule::query_weight_threshold_kwarg => {
421                                if let Some(t) = p.into_inner().next() {
422                                    config.query_weight_threshold =
423                                        Some(t.as_str().parse().unwrap_or(0.0));
424                                }
425                            }
426                            Rule::query_max_dims_kwarg => {
427                                if let Some(t) = p.into_inner().next() {
428                                    config.query_max_dims = Some(t.as_str().parse().unwrap_or(0));
429                                }
430                            }
431                            Rule::query_pruning_kwarg => {
432                                if let Some(t) = p.into_inner().next() {
433                                    config.query_pruning = Some(t.as_str().parse().unwrap_or(1.0));
434                                }
435                            }
436                            _ => {}
437                        }
438                    }
439                }
440            }
441        }
442    }
443}
444
445/// Parse a field definition from pest pair
446fn parse_field_def(pair: pest::iterators::Pair<Rule>) -> Result<FieldDef> {
447    let mut inner = pair.into_inner();
448
449    let name = inner
450        .next()
451        .ok_or_else(|| Error::Schema("Missing field name".to_string()))?
452        .as_str()
453        .to_string();
454
455    let field_type_str = inner
456        .next()
457        .ok_or_else(|| Error::Schema("Missing field type".to_string()))?
458        .as_str();
459
460    let field_type = parse_field_type(field_type_str)?;
461
462    // Parse optional tokenizer spec, sparse_vector_config, dense_vector_config, and attributes
463    let mut tokenizer = None;
464    let mut sparse_vector_config = None;
465    let mut dense_vector_config = None;
466    let mut indexed = true;
467    let mut stored = true;
468    let mut multi = false;
469    let mut fast = false;
470    let mut index_config: Option<IndexConfig> = None;
471
472    for item in inner {
473        match item.as_rule() {
474            Rule::tokenizer_spec => {
475                // Extract tokenizer name from <name>
476                if let Some(tok_name) = item.into_inner().next() {
477                    tokenizer = Some(tok_name.as_str().to_string());
478                }
479            }
480            Rule::sparse_vector_config => {
481                // Parse named parameters: <index_size: u16, quantization: uint8, weight_threshold: 0.1>
482                sparse_vector_config = Some(parse_sparse_vector_config(item));
483            }
484            Rule::dense_vector_config => {
485                // Parse dense_vector_params (keyword or positional) - only dims
486                dense_vector_config = Some(parse_dense_vector_config(item));
487            }
488            Rule::attributes => {
489                let (idx, sto, mul, fst, idx_cfg) = parse_attributes(item);
490                indexed = idx;
491                stored = sto;
492                multi = mul;
493                fast = fst;
494                index_config = idx_cfg;
495            }
496            _ => {}
497        }
498    }
499
500    // Merge index config into vector configs if both exist
501    let mut positions = None;
502    if let Some(idx_cfg) = index_config {
503        positions = idx_cfg.positions;
504        if let Some(ref mut dv_config) = dense_vector_config {
505            apply_index_config_to_dense_vector(dv_config, idx_cfg);
506        } else if field_type == FieldType::SparseVector {
507            // For sparse vectors, create default config if not present and apply index params
508            let sv_config = sparse_vector_config.get_or_insert(SparseVectorConfig::default());
509            apply_index_config_to_sparse_vector(sv_config, idx_cfg);
510        }
511    }
512
513    Ok(FieldDef {
514        name,
515        field_type,
516        indexed,
517        stored,
518        tokenizer,
519        multi,
520        positions,
521        sparse_vector_config,
522        dense_vector_config,
523        fast,
524    })
525}
526
527/// Apply index configuration from indexed<...> to DenseVectorConfig
528fn apply_index_config_to_dense_vector(config: &mut DenseVectorConfig, idx_cfg: IndexConfig) {
529    // Apply index type if specified
530    if let Some(index_type) = idx_cfg.index_type {
531        config.index_type = index_type;
532    }
533
534    // Apply num_clusters for IVF-based indexes
535    if idx_cfg.num_clusters.is_some() {
536        config.num_clusters = idx_cfg.num_clusters;
537    }
538
539    // Apply nprobe if specified
540    if let Some(nprobe) = idx_cfg.nprobe {
541        config.nprobe = nprobe;
542    }
543
544    // Apply build_threshold if specified
545    if idx_cfg.build_threshold.is_some() {
546        config.build_threshold = idx_cfg.build_threshold;
547    }
548}
549
550/// Parse sparse_vector_config - only index_size (positional)
551/// Example: <u16> or <u32>
552fn parse_sparse_vector_config(pair: pest::iterators::Pair<Rule>) -> SparseVectorConfig {
553    let mut index_size = IndexSize::default();
554
555    // Parse positional index_size_spec
556    for inner in pair.into_inner() {
557        if inner.as_rule() == Rule::index_size_spec {
558            index_size = match inner.as_str() {
559                "u16" => IndexSize::U16,
560                "u32" => IndexSize::U32,
561                _ => IndexSize::default(),
562            };
563        }
564    }
565
566    SparseVectorConfig {
567        index_size,
568        weight_quantization: WeightQuantization::default(),
569        weight_threshold: 0.0,
570        block_size: 128,
571        pruning: None,
572        query_config: None,
573    }
574}
575
576/// Apply index configuration from indexed<...> to SparseVectorConfig
577fn apply_index_config_to_sparse_vector(config: &mut SparseVectorConfig, idx_cfg: IndexConfig) {
578    if let Some(q) = idx_cfg.quantization {
579        config.weight_quantization = q;
580    }
581    if let Some(t) = idx_cfg.weight_threshold {
582        config.weight_threshold = t;
583    }
584    if let Some(bs) = idx_cfg.block_size {
585        let adjusted = bs.next_power_of_two();
586        if adjusted != bs {
587            log::warn!(
588                "block_size {} adjusted to next power of two: {}",
589                bs,
590                adjusted
591            );
592        }
593        config.block_size = adjusted;
594    }
595    if let Some(p) = idx_cfg.pruning {
596        let clamped = p.clamp(0.0, 1.0);
597        if (clamped - p).abs() > f32::EPSILON {
598            log::warn!(
599                "pruning {} clamped to valid range [0.0, 1.0]: {}",
600                p,
601                clamped
602            );
603        }
604        config.pruning = Some(clamped);
605    }
606    // Apply query-time configuration if present
607    if idx_cfg.query_tokenizer.is_some()
608        || idx_cfg.query_weighting.is_some()
609        || idx_cfg.query_weight_threshold.is_some()
610        || idx_cfg.query_max_dims.is_some()
611        || idx_cfg.query_pruning.is_some()
612    {
613        let query_config = config
614            .query_config
615            .get_or_insert(SparseQueryConfig::default());
616        if let Some(tokenizer) = idx_cfg.query_tokenizer {
617            query_config.tokenizer = Some(tokenizer);
618        }
619        if let Some(weighting) = idx_cfg.query_weighting {
620            query_config.weighting = weighting;
621        }
622        if let Some(t) = idx_cfg.query_weight_threshold {
623            query_config.weight_threshold = t;
624        }
625        if let Some(d) = idx_cfg.query_max_dims {
626            query_config.max_query_dims = Some(d);
627        }
628        if let Some(p) = idx_cfg.query_pruning {
629            query_config.pruning = Some(p);
630        }
631    }
632}
633
634/// Parse dense_vector_config - dims and optional quantization type
635/// All index-related params are in indexed<...> attribute
636fn parse_dense_vector_config(pair: pest::iterators::Pair<Rule>) -> DenseVectorConfig {
637    let mut dim: usize = 0;
638    let mut quantization = DenseVectorQuantization::F32;
639
640    // Navigate to dense_vector_params
641    for params in pair.into_inner() {
642        if params.as_rule() == Rule::dense_vector_params {
643            for inner in params.into_inner() {
644                match inner.as_rule() {
645                    Rule::dense_vector_keyword_params => {
646                        for kwarg in inner.into_inner() {
647                            match kwarg.as_rule() {
648                                Rule::dims_kwarg => {
649                                    if let Some(d) = kwarg.into_inner().next() {
650                                        dim = d.as_str().parse().unwrap_or(0);
651                                    }
652                                }
653                                Rule::quant_type_spec => {
654                                    quantization = parse_quant_type(kwarg.as_str());
655                                }
656                                _ => {}
657                            }
658                        }
659                    }
660                    Rule::dense_vector_positional_params => {
661                        for item in inner.into_inner() {
662                            match item.as_rule() {
663                                Rule::dimension_spec => {
664                                    dim = item.as_str().parse().unwrap_or(0);
665                                }
666                                Rule::quant_type_spec => {
667                                    quantization = parse_quant_type(item.as_str());
668                                }
669                                _ => {}
670                            }
671                        }
672                    }
673                    _ => {}
674                }
675            }
676        }
677    }
678
679    DenseVectorConfig::new(dim).with_quantization(quantization)
680}
681
682fn parse_quant_type(s: &str) -> DenseVectorQuantization {
683    match s.trim() {
684        "f16" => DenseVectorQuantization::F16,
685        "uint8" | "u8" => DenseVectorQuantization::UInt8,
686        _ => DenseVectorQuantization::F32,
687    }
688}
689
690/// Parse default_fields definition
691fn parse_default_fields_def(pair: pest::iterators::Pair<Rule>) -> Vec<String> {
692    pair.into_inner().map(|p| p.as_str().to_string()).collect()
693}
694
695/// Parse a query router definition
696fn parse_query_router_def(pair: pest::iterators::Pair<Rule>) -> Result<QueryRouterRule> {
697    let mut pattern = String::new();
698    let mut substitution = String::new();
699    let mut target_field = String::new();
700    let mut mode = RoutingMode::Additional;
701
702    for prop in pair.into_inner() {
703        if prop.as_rule() != Rule::query_router_prop {
704            continue;
705        }
706
707        for inner in prop.into_inner() {
708            match inner.as_rule() {
709                Rule::query_router_pattern => {
710                    if let Some(regex_str) = inner.into_inner().next() {
711                        pattern = parse_string_value(regex_str);
712                    }
713                }
714                Rule::query_router_substitution => {
715                    if let Some(quoted) = inner.into_inner().next() {
716                        substitution = parse_string_value(quoted);
717                    }
718                }
719                Rule::query_router_target => {
720                    if let Some(ident) = inner.into_inner().next() {
721                        target_field = ident.as_str().to_string();
722                    }
723                }
724                Rule::query_router_mode => {
725                    if let Some(mode_val) = inner.into_inner().next() {
726                        mode = match mode_val.as_str() {
727                            "exclusive" => RoutingMode::Exclusive,
728                            "additional" => RoutingMode::Additional,
729                            _ => RoutingMode::Additional,
730                        };
731                    }
732                }
733                _ => {}
734            }
735        }
736    }
737
738    if pattern.is_empty() {
739        return Err(Error::Schema("query_router missing 'pattern'".to_string()));
740    }
741    if substitution.is_empty() {
742        return Err(Error::Schema(
743            "query_router missing 'substitution'".to_string(),
744        ));
745    }
746    if target_field.is_empty() {
747        return Err(Error::Schema(
748            "query_router missing 'target_field'".to_string(),
749        ));
750    }
751
752    Ok(QueryRouterRule {
753        pattern,
754        substitution,
755        target_field,
756        mode,
757    })
758}
759
760/// Parse a string value from quoted_string, raw_string, or regex_string
761fn parse_string_value(pair: pest::iterators::Pair<Rule>) -> String {
762    let s = pair.as_str();
763    match pair.as_rule() {
764        Rule::regex_string => {
765            // regex_string contains either raw_string or quoted_string
766            if let Some(inner) = pair.into_inner().next() {
767                parse_string_value(inner)
768            } else {
769                s.to_string()
770            }
771        }
772        Rule::raw_string => {
773            // r"..." - strip r" prefix and " suffix
774            s[2..s.len() - 1].to_string()
775        }
776        Rule::quoted_string => {
777            // "..." - strip quotes and handle escapes
778            let inner = &s[1..s.len() - 1];
779            // Simple escape handling
780            inner
781                .replace("\\n", "\n")
782                .replace("\\t", "\t")
783                .replace("\\\"", "\"")
784                .replace("\\\\", "\\")
785        }
786        _ => s.to_string(),
787    }
788}
789
790/// Parse an index definition from pest pair
791fn parse_index_def(pair: pest::iterators::Pair<Rule>) -> Result<IndexDef> {
792    let mut inner = pair.into_inner();
793
794    let name = inner
795        .next()
796        .ok_or_else(|| Error::Schema("Missing index name".to_string()))?
797        .as_str()
798        .to_string();
799
800    let mut fields = Vec::new();
801    let mut default_fields = Vec::new();
802    let mut query_routers = Vec::new();
803
804    for item in inner {
805        match item.as_rule() {
806            Rule::field_def => {
807                fields.push(parse_field_def(item)?);
808            }
809            Rule::default_fields_def => {
810                default_fields = parse_default_fields_def(item);
811            }
812            Rule::query_router_def => {
813                query_routers.push(parse_query_router_def(item)?);
814            }
815            _ => {}
816        }
817    }
818
819    Ok(IndexDef {
820        name,
821        fields,
822        default_fields,
823        query_routers,
824    })
825}
826
827/// Parse SDL from a string
828pub fn parse_sdl(input: &str) -> Result<Vec<IndexDef>> {
829    let pairs = SdlParser::parse(Rule::file, input)
830        .map_err(|e| Error::Schema(format!("Parse error: {}", e)))?;
831
832    let mut indexes = Vec::new();
833
834    for pair in pairs {
835        if pair.as_rule() == Rule::file {
836            for inner in pair.into_inner() {
837                if inner.as_rule() == Rule::index_def {
838                    indexes.push(parse_index_def(inner)?);
839                }
840            }
841        }
842    }
843
844    Ok(indexes)
845}
846
847/// Parse SDL and return a single index definition
848pub fn parse_single_index(input: &str) -> Result<IndexDef> {
849    let indexes = parse_sdl(input)?;
850
851    if indexes.is_empty() {
852        return Err(Error::Schema("No index definition found".to_string()));
853    }
854
855    if indexes.len() > 1 {
856        return Err(Error::Schema(
857            "Multiple index definitions found, expected one".to_string(),
858        ));
859    }
860
861    Ok(indexes.into_iter().next().unwrap())
862}
863
864#[cfg(test)]
865mod tests {
866    use super::*;
867
868    #[test]
869    fn test_parse_simple_schema() {
870        let sdl = r#"
871            index articles {
872                field title: text [indexed, stored]
873                field body: text [indexed]
874            }
875        "#;
876
877        let indexes = parse_sdl(sdl).unwrap();
878        assert_eq!(indexes.len(), 1);
879
880        let index = &indexes[0];
881        assert_eq!(index.name, "articles");
882        assert_eq!(index.fields.len(), 2);
883
884        assert_eq!(index.fields[0].name, "title");
885        assert!(matches!(index.fields[0].field_type, FieldType::Text));
886        assert!(index.fields[0].indexed);
887        assert!(index.fields[0].stored);
888
889        assert_eq!(index.fields[1].name, "body");
890        assert!(matches!(index.fields[1].field_type, FieldType::Text));
891        assert!(index.fields[1].indexed);
892        assert!(!index.fields[1].stored);
893    }
894
895    #[test]
896    fn test_parse_all_field_types() {
897        let sdl = r#"
898            index test {
899                field text_field: text [indexed, stored]
900                field u64_field: u64 [indexed, stored]
901                field i64_field: i64 [indexed, stored]
902                field f64_field: f64 [indexed, stored]
903                field bytes_field: bytes [stored]
904            }
905        "#;
906
907        let indexes = parse_sdl(sdl).unwrap();
908        let index = &indexes[0];
909
910        assert!(matches!(index.fields[0].field_type, FieldType::Text));
911        assert!(matches!(index.fields[1].field_type, FieldType::U64));
912        assert!(matches!(index.fields[2].field_type, FieldType::I64));
913        assert!(matches!(index.fields[3].field_type, FieldType::F64));
914        assert!(matches!(index.fields[4].field_type, FieldType::Bytes));
915    }
916
917    #[test]
918    fn test_parse_with_comments() {
919        let sdl = r#"
920            # This is a comment
921            index articles {
922                # Title field
923                field title: text [indexed, stored]
924                field body: text [indexed] # inline comment not supported yet
925            }
926        "#;
927
928        let indexes = parse_sdl(sdl).unwrap();
929        assert_eq!(indexes[0].fields.len(), 2);
930    }
931
932    #[test]
933    fn test_parse_type_aliases() {
934        let sdl = r#"
935            index test {
936                field a: string [indexed]
937                field b: int [indexed]
938                field c: uint [indexed]
939                field d: float [indexed]
940                field e: binary [stored]
941            }
942        "#;
943
944        let indexes = parse_sdl(sdl).unwrap();
945        let index = &indexes[0];
946
947        assert!(matches!(index.fields[0].field_type, FieldType::Text));
948        assert!(matches!(index.fields[1].field_type, FieldType::I64));
949        assert!(matches!(index.fields[2].field_type, FieldType::U64));
950        assert!(matches!(index.fields[3].field_type, FieldType::F64));
951        assert!(matches!(index.fields[4].field_type, FieldType::Bytes));
952    }
953
954    #[test]
955    fn test_to_schema() {
956        let sdl = r#"
957            index articles {
958                field title: text [indexed, stored]
959                field views: u64 [indexed, stored]
960            }
961        "#;
962
963        let indexes = parse_sdl(sdl).unwrap();
964        let schema = indexes[0].to_schema();
965
966        assert!(schema.get_field("title").is_some());
967        assert!(schema.get_field("views").is_some());
968        assert!(schema.get_field("nonexistent").is_none());
969    }
970
971    #[test]
972    fn test_default_attributes() {
973        let sdl = r#"
974            index test {
975                field title: text
976            }
977        "#;
978
979        let indexes = parse_sdl(sdl).unwrap();
980        let field = &indexes[0].fields[0];
981
982        // Default should be indexed and stored
983        assert!(field.indexed);
984        assert!(field.stored);
985    }
986
987    #[test]
988    fn test_multiple_indexes() {
989        let sdl = r#"
990            index articles {
991                field title: text [indexed, stored]
992            }
993
994            index users {
995                field name: text [indexed, stored]
996                field email: text [indexed, stored]
997            }
998        "#;
999
1000        let indexes = parse_sdl(sdl).unwrap();
1001        assert_eq!(indexes.len(), 2);
1002        assert_eq!(indexes[0].name, "articles");
1003        assert_eq!(indexes[1].name, "users");
1004    }
1005
1006    #[test]
1007    fn test_tokenizer_spec() {
1008        let sdl = r#"
1009            index articles {
1010                field title: text<en_stem> [indexed, stored]
1011                field body: text<default> [indexed]
1012                field author: text [indexed, stored]
1013            }
1014        "#;
1015
1016        let indexes = parse_sdl(sdl).unwrap();
1017        let index = &indexes[0];
1018
1019        assert_eq!(index.fields[0].name, "title");
1020        assert_eq!(index.fields[0].tokenizer, Some("en_stem".to_string()));
1021
1022        assert_eq!(index.fields[1].name, "body");
1023        assert_eq!(index.fields[1].tokenizer, Some("default".to_string()));
1024
1025        assert_eq!(index.fields[2].name, "author");
1026        assert_eq!(index.fields[2].tokenizer, None); // No tokenizer specified
1027    }
1028
1029    #[test]
1030    fn test_tokenizer_in_schema() {
1031        let sdl = r#"
1032            index articles {
1033                field title: text<german> [indexed, stored]
1034                field body: text<en_stem> [indexed]
1035            }
1036        "#;
1037
1038        let indexes = parse_sdl(sdl).unwrap();
1039        let schema = indexes[0].to_schema();
1040
1041        let title_field = schema.get_field("title").unwrap();
1042        let title_entry = schema.get_field_entry(title_field).unwrap();
1043        assert_eq!(title_entry.tokenizer, Some("german".to_string()));
1044
1045        let body_field = schema.get_field("body").unwrap();
1046        let body_entry = schema.get_field_entry(body_field).unwrap();
1047        assert_eq!(body_entry.tokenizer, Some("en_stem".to_string()));
1048    }
1049
1050    #[test]
1051    fn test_query_router_basic() {
1052        let sdl = r#"
1053            index documents {
1054                field title: text [indexed, stored]
1055                field uri: text [indexed, stored]
1056
1057                query_router {
1058                    pattern: "10\\.\\d{4,}/[^\\s]+"
1059                    substitution: "doi://{0}"
1060                    target_field: uris
1061                    mode: exclusive
1062                }
1063            }
1064        "#;
1065
1066        let indexes = parse_sdl(sdl).unwrap();
1067        let index = &indexes[0];
1068
1069        assert_eq!(index.query_routers.len(), 1);
1070        let router = &index.query_routers[0];
1071        assert_eq!(router.pattern, r"10\.\d{4,}/[^\s]+");
1072        assert_eq!(router.substitution, "doi://{0}");
1073        assert_eq!(router.target_field, "uris");
1074        assert_eq!(router.mode, RoutingMode::Exclusive);
1075    }
1076
1077    #[test]
1078    fn test_query_router_raw_string() {
1079        let sdl = r#"
1080            index documents {
1081                field uris: text [indexed, stored]
1082
1083                query_router {
1084                    pattern: r"^pmid:(\d+)$"
1085                    substitution: "pubmed://{1}"
1086                    target_field: uris
1087                    mode: additional
1088                }
1089            }
1090        "#;
1091
1092        let indexes = parse_sdl(sdl).unwrap();
1093        let router = &indexes[0].query_routers[0];
1094
1095        assert_eq!(router.pattern, r"^pmid:(\d+)$");
1096        assert_eq!(router.substitution, "pubmed://{1}");
1097        assert_eq!(router.mode, RoutingMode::Additional);
1098    }
1099
1100    #[test]
1101    fn test_multiple_query_routers() {
1102        let sdl = r#"
1103            index documents {
1104                field uris: text [indexed, stored]
1105
1106                query_router {
1107                    pattern: r"^doi:(10\.\d{4,}/[^\s]+)$"
1108                    substitution: "doi://{1}"
1109                    target_field: uris
1110                    mode: exclusive
1111                }
1112
1113                query_router {
1114                    pattern: r"^pmid:(\d+)$"
1115                    substitution: "pubmed://{1}"
1116                    target_field: uris
1117                    mode: exclusive
1118                }
1119
1120                query_router {
1121                    pattern: r"^arxiv:(\d+\.\d+)$"
1122                    substitution: "arxiv://{1}"
1123                    target_field: uris
1124                    mode: additional
1125                }
1126            }
1127        "#;
1128
1129        let indexes = parse_sdl(sdl).unwrap();
1130        assert_eq!(indexes[0].query_routers.len(), 3);
1131    }
1132
1133    #[test]
1134    fn test_query_router_default_mode() {
1135        let sdl = r#"
1136            index documents {
1137                field uris: text [indexed, stored]
1138
1139                query_router {
1140                    pattern: r"test"
1141                    substitution: "{0}"
1142                    target_field: uris
1143                }
1144            }
1145        "#;
1146
1147        let indexes = parse_sdl(sdl).unwrap();
1148        // Default mode should be Additional
1149        assert_eq!(indexes[0].query_routers[0].mode, RoutingMode::Additional);
1150    }
1151
1152    #[test]
1153    fn test_multi_attribute() {
1154        let sdl = r#"
1155            index documents {
1156                field uris: text [indexed, stored<multi>]
1157                field title: text [indexed, stored]
1158            }
1159        "#;
1160
1161        let indexes = parse_sdl(sdl).unwrap();
1162        assert_eq!(indexes.len(), 1);
1163
1164        let fields = &indexes[0].fields;
1165        assert_eq!(fields.len(), 2);
1166
1167        // uris should have multi=true
1168        assert_eq!(fields[0].name, "uris");
1169        assert!(fields[0].multi, "uris field should have multi=true");
1170
1171        // title should have multi=false
1172        assert_eq!(fields[1].name, "title");
1173        assert!(!fields[1].multi, "title field should have multi=false");
1174
1175        // Verify schema conversion preserves multi attribute
1176        let schema = indexes[0].to_schema();
1177        let uris_field = schema.get_field("uris").unwrap();
1178        let title_field = schema.get_field("title").unwrap();
1179
1180        assert!(schema.get_field_entry(uris_field).unwrap().multi);
1181        assert!(!schema.get_field_entry(title_field).unwrap().multi);
1182    }
1183
1184    #[test]
1185    fn test_sparse_vector_field() {
1186        let sdl = r#"
1187            index documents {
1188                field embedding: sparse_vector [indexed, stored]
1189            }
1190        "#;
1191
1192        let indexes = parse_sdl(sdl).unwrap();
1193        assert_eq!(indexes.len(), 1);
1194        assert_eq!(indexes[0].fields.len(), 1);
1195        assert_eq!(indexes[0].fields[0].name, "embedding");
1196        assert_eq!(indexes[0].fields[0].field_type, FieldType::SparseVector);
1197        assert!(indexes[0].fields[0].sparse_vector_config.is_none());
1198    }
1199
1200    #[test]
1201    fn test_sparse_vector_with_config() {
1202        let sdl = r#"
1203            index documents {
1204                field embedding: sparse_vector<u16> [indexed<quantization: uint8>, stored]
1205                field dense: sparse_vector<u32> [indexed<quantization: float32>]
1206            }
1207        "#;
1208
1209        let indexes = parse_sdl(sdl).unwrap();
1210        assert_eq!(indexes[0].fields.len(), 2);
1211
1212        // First field: u16 indices, uint8 quantization
1213        let f1 = &indexes[0].fields[0];
1214        assert_eq!(f1.name, "embedding");
1215        let config1 = f1.sparse_vector_config.as_ref().unwrap();
1216        assert_eq!(config1.index_size, IndexSize::U16);
1217        assert_eq!(config1.weight_quantization, WeightQuantization::UInt8);
1218
1219        // Second field: u32 indices, float32 quantization
1220        let f2 = &indexes[0].fields[1];
1221        assert_eq!(f2.name, "dense");
1222        let config2 = f2.sparse_vector_config.as_ref().unwrap();
1223        assert_eq!(config2.index_size, IndexSize::U32);
1224        assert_eq!(config2.weight_quantization, WeightQuantization::Float32);
1225    }
1226
1227    #[test]
1228    fn test_sparse_vector_with_weight_threshold() {
1229        let sdl = r#"
1230            index documents {
1231                field embedding: sparse_vector<u16> [indexed<quantization: uint8, weight_threshold: 0.1>, stored]
1232                field embedding2: sparse_vector<u32> [indexed<quantization: float16, weight_threshold: 0.05>]
1233            }
1234        "#;
1235
1236        let indexes = parse_sdl(sdl).unwrap();
1237        assert_eq!(indexes[0].fields.len(), 2);
1238
1239        // First field: u16 indices, uint8 quantization, threshold 0.1
1240        let f1 = &indexes[0].fields[0];
1241        assert_eq!(f1.name, "embedding");
1242        let config1 = f1.sparse_vector_config.as_ref().unwrap();
1243        assert_eq!(config1.index_size, IndexSize::U16);
1244        assert_eq!(config1.weight_quantization, WeightQuantization::UInt8);
1245        assert!((config1.weight_threshold - 0.1).abs() < 0.001);
1246
1247        // Second field: u32 indices, float16 quantization, threshold 0.05
1248        let f2 = &indexes[0].fields[1];
1249        assert_eq!(f2.name, "embedding2");
1250        let config2 = f2.sparse_vector_config.as_ref().unwrap();
1251        assert_eq!(config2.index_size, IndexSize::U32);
1252        assert_eq!(config2.weight_quantization, WeightQuantization::Float16);
1253        assert!((config2.weight_threshold - 0.05).abs() < 0.001);
1254    }
1255
1256    #[test]
1257    fn test_sparse_vector_with_pruning() {
1258        let sdl = r#"
1259            index documents {
1260                field embedding: sparse_vector [indexed<quantization: uint8, pruning: 0.1>, stored]
1261            }
1262        "#;
1263
1264        let indexes = parse_sdl(sdl).unwrap();
1265        let f = &indexes[0].fields[0];
1266        assert_eq!(f.name, "embedding");
1267        let config = f.sparse_vector_config.as_ref().unwrap();
1268        assert_eq!(config.weight_quantization, WeightQuantization::UInt8);
1269        assert_eq!(config.pruning, Some(0.1));
1270    }
1271
1272    #[test]
1273    fn test_dense_vector_field() {
1274        let sdl = r#"
1275            index documents {
1276                field embedding: dense_vector<768> [indexed, stored]
1277            }
1278        "#;
1279
1280        let indexes = parse_sdl(sdl).unwrap();
1281        assert_eq!(indexes.len(), 1);
1282        assert_eq!(indexes[0].fields.len(), 1);
1283
1284        let f = &indexes[0].fields[0];
1285        assert_eq!(f.name, "embedding");
1286        assert_eq!(f.field_type, FieldType::DenseVector);
1287
1288        let config = f.dense_vector_config.as_ref().unwrap();
1289        assert_eq!(config.dim, 768);
1290    }
1291
1292    #[test]
1293    fn test_dense_vector_alias() {
1294        let sdl = r#"
1295            index documents {
1296                field embedding: vector<1536> [indexed]
1297            }
1298        "#;
1299
1300        let indexes = parse_sdl(sdl).unwrap();
1301        assert_eq!(indexes[0].fields[0].field_type, FieldType::DenseVector);
1302        assert_eq!(
1303            indexes[0].fields[0]
1304                .dense_vector_config
1305                .as_ref()
1306                .unwrap()
1307                .dim,
1308            1536
1309        );
1310    }
1311
1312    #[test]
1313    fn test_dense_vector_with_num_clusters() {
1314        let sdl = r#"
1315            index documents {
1316                field embedding: dense_vector<768> [indexed<ivf_rabitq, num_clusters: 256>, stored]
1317            }
1318        "#;
1319
1320        let indexes = parse_sdl(sdl).unwrap();
1321        assert_eq!(indexes.len(), 1);
1322
1323        let f = &indexes[0].fields[0];
1324        assert_eq!(f.name, "embedding");
1325        assert_eq!(f.field_type, FieldType::DenseVector);
1326
1327        let config = f.dense_vector_config.as_ref().unwrap();
1328        assert_eq!(config.dim, 768);
1329        assert_eq!(config.num_clusters, Some(256));
1330        assert_eq!(config.nprobe, 32); // default
1331    }
1332
1333    #[test]
1334    fn test_dense_vector_with_num_clusters_and_nprobe() {
1335        let sdl = r#"
1336            index documents {
1337                field embedding: dense_vector<1536> [indexed<ivf_rabitq, num_clusters: 512, nprobe: 64>]
1338            }
1339        "#;
1340
1341        let indexes = parse_sdl(sdl).unwrap();
1342        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1343
1344        assert_eq!(config.dim, 1536);
1345        assert_eq!(config.num_clusters, Some(512));
1346        assert_eq!(config.nprobe, 64);
1347    }
1348
1349    #[test]
1350    fn test_dense_vector_keyword_syntax() {
1351        let sdl = r#"
1352            index documents {
1353                field embedding: dense_vector<dims: 1536> [indexed, stored]
1354            }
1355        "#;
1356
1357        let indexes = parse_sdl(sdl).unwrap();
1358        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1359
1360        assert_eq!(config.dim, 1536);
1361        assert!(config.num_clusters.is_none());
1362    }
1363
1364    #[test]
1365    fn test_dense_vector_keyword_syntax_full() {
1366        let sdl = r#"
1367            index documents {
1368                field embedding: dense_vector<dims: 1536> [indexed<ivf_rabitq, num_clusters: 256, nprobe: 64>]
1369            }
1370        "#;
1371
1372        let indexes = parse_sdl(sdl).unwrap();
1373        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1374
1375        assert_eq!(config.dim, 1536);
1376        assert_eq!(config.num_clusters, Some(256));
1377        assert_eq!(config.nprobe, 64);
1378    }
1379
1380    #[test]
1381    fn test_dense_vector_keyword_syntax_partial() {
1382        let sdl = r#"
1383            index documents {
1384                field embedding: dense_vector<dims: 768> [indexed<ivf_rabitq, num_clusters: 128>]
1385            }
1386        "#;
1387
1388        let indexes = parse_sdl(sdl).unwrap();
1389        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1390
1391        assert_eq!(config.dim, 768);
1392        assert_eq!(config.num_clusters, Some(128));
1393        assert_eq!(config.nprobe, 32); // default
1394    }
1395
1396    #[test]
1397    fn test_dense_vector_scann_index() {
1398        use crate::dsl::schema::VectorIndexType;
1399
1400        let sdl = r#"
1401            index documents {
1402                field embedding: dense_vector<dims: 768> [indexed<scann, num_clusters: 256, nprobe: 64>]
1403            }
1404        "#;
1405
1406        let indexes = parse_sdl(sdl).unwrap();
1407        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1408
1409        assert_eq!(config.dim, 768);
1410        assert_eq!(config.index_type, VectorIndexType::ScaNN);
1411        assert_eq!(config.num_clusters, Some(256));
1412        assert_eq!(config.nprobe, 64);
1413    }
1414
1415    #[test]
1416    fn test_dense_vector_ivf_rabitq_index() {
1417        use crate::dsl::schema::VectorIndexType;
1418
1419        let sdl = r#"
1420            index documents {
1421                field embedding: dense_vector<dims: 1536> [indexed<ivf_rabitq, num_clusters: 512>]
1422            }
1423        "#;
1424
1425        let indexes = parse_sdl(sdl).unwrap();
1426        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1427
1428        assert_eq!(config.dim, 1536);
1429        assert_eq!(config.index_type, VectorIndexType::IvfRaBitQ);
1430        assert_eq!(config.num_clusters, Some(512));
1431    }
1432
1433    #[test]
1434    fn test_dense_vector_rabitq_no_clusters() {
1435        use crate::dsl::schema::VectorIndexType;
1436
1437        let sdl = r#"
1438            index documents {
1439                field embedding: dense_vector<dims: 768> [indexed<rabitq>]
1440            }
1441        "#;
1442
1443        let indexes = parse_sdl(sdl).unwrap();
1444        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1445
1446        assert_eq!(config.dim, 768);
1447        assert_eq!(config.index_type, VectorIndexType::RaBitQ);
1448        assert!(config.num_clusters.is_none());
1449    }
1450
1451    #[test]
1452    fn test_dense_vector_flat_index() {
1453        use crate::dsl::schema::VectorIndexType;
1454
1455        let sdl = r#"
1456            index documents {
1457                field embedding: dense_vector<dims: 768> [indexed<flat>]
1458            }
1459        "#;
1460
1461        let indexes = parse_sdl(sdl).unwrap();
1462        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1463
1464        assert_eq!(config.dim, 768);
1465        assert_eq!(config.index_type, VectorIndexType::Flat);
1466    }
1467
1468    #[test]
1469    fn test_dense_vector_default_index_type() {
1470        use crate::dsl::schema::VectorIndexType;
1471
1472        // When no index type specified, should default to RaBitQ (basic)
1473        let sdl = r#"
1474            index documents {
1475                field embedding: dense_vector<dims: 768> [indexed]
1476            }
1477        "#;
1478
1479        let indexes = parse_sdl(sdl).unwrap();
1480        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1481
1482        assert_eq!(config.dim, 768);
1483        assert_eq!(config.index_type, VectorIndexType::RaBitQ);
1484    }
1485
1486    #[test]
1487    fn test_dense_vector_f16_quantization() {
1488        use crate::dsl::schema::{DenseVectorQuantization, VectorIndexType};
1489
1490        let sdl = r#"
1491            index documents {
1492                field embedding: dense_vector<768, f16> [indexed]
1493            }
1494        "#;
1495
1496        let indexes = parse_sdl(sdl).unwrap();
1497        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1498
1499        assert_eq!(config.dim, 768);
1500        assert_eq!(config.quantization, DenseVectorQuantization::F16);
1501        assert_eq!(config.index_type, VectorIndexType::RaBitQ);
1502    }
1503
1504    #[test]
1505    fn test_dense_vector_uint8_quantization() {
1506        use crate::dsl::schema::DenseVectorQuantization;
1507
1508        let sdl = r#"
1509            index documents {
1510                field embedding: dense_vector<1024, uint8> [indexed<ivf_rabitq>]
1511            }
1512        "#;
1513
1514        let indexes = parse_sdl(sdl).unwrap();
1515        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1516
1517        assert_eq!(config.dim, 1024);
1518        assert_eq!(config.quantization, DenseVectorQuantization::UInt8);
1519    }
1520
1521    #[test]
1522    fn test_dense_vector_u8_alias() {
1523        use crate::dsl::schema::DenseVectorQuantization;
1524
1525        let sdl = r#"
1526            index documents {
1527                field embedding: dense_vector<512, u8> [indexed]
1528            }
1529        "#;
1530
1531        let indexes = parse_sdl(sdl).unwrap();
1532        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1533
1534        assert_eq!(config.dim, 512);
1535        assert_eq!(config.quantization, DenseVectorQuantization::UInt8);
1536    }
1537
1538    #[test]
1539    fn test_dense_vector_default_f32_quantization() {
1540        use crate::dsl::schema::DenseVectorQuantization;
1541
1542        // No quantization type → default f32
1543        let sdl = r#"
1544            index documents {
1545                field embedding: dense_vector<768> [indexed]
1546            }
1547        "#;
1548
1549        let indexes = parse_sdl(sdl).unwrap();
1550        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1551
1552        assert_eq!(config.dim, 768);
1553        assert_eq!(config.quantization, DenseVectorQuantization::F32);
1554    }
1555
1556    #[test]
1557    fn test_dense_vector_keyword_with_quantization() {
1558        use crate::dsl::schema::DenseVectorQuantization;
1559
1560        let sdl = r#"
1561            index documents {
1562                field embedding: dense_vector<dims: 768, f16> [indexed]
1563            }
1564        "#;
1565
1566        let indexes = parse_sdl(sdl).unwrap();
1567        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1568
1569        assert_eq!(config.dim, 768);
1570        assert_eq!(config.quantization, DenseVectorQuantization::F16);
1571    }
1572
1573    #[test]
1574    fn test_json_field_type() {
1575        let sdl = r#"
1576            index documents {
1577                field title: text [indexed, stored]
1578                field metadata: json [stored]
1579                field extra: json
1580            }
1581        "#;
1582
1583        let indexes = parse_sdl(sdl).unwrap();
1584        let index = &indexes[0];
1585
1586        assert_eq!(index.fields.len(), 3);
1587
1588        // Check JSON field
1589        assert_eq!(index.fields[1].name, "metadata");
1590        assert!(matches!(index.fields[1].field_type, FieldType::Json));
1591        assert!(index.fields[1].stored);
1592        // JSON fields should not be indexed (enforced by add_json_field)
1593
1594        // Check default attributes for JSON field
1595        assert_eq!(index.fields[2].name, "extra");
1596        assert!(matches!(index.fields[2].field_type, FieldType::Json));
1597
1598        // Verify schema conversion
1599        let schema = index.to_schema();
1600        let metadata_field = schema.get_field("metadata").unwrap();
1601        let entry = schema.get_field_entry(metadata_field).unwrap();
1602        assert_eq!(entry.field_type, FieldType::Json);
1603        assert!(!entry.indexed); // JSON fields are never indexed
1604        assert!(entry.stored);
1605    }
1606
1607    #[test]
1608    fn test_sparse_vector_query_config() {
1609        use crate::structures::QueryWeighting;
1610
1611        let sdl = r#"
1612            index documents {
1613                field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<tokenizer: "Alibaba-NLP/gte-Qwen2-1.5B-instruct", weighting: idf>>]
1614            }
1615        "#;
1616
1617        let indexes = parse_sdl(sdl).unwrap();
1618        let index = &indexes[0];
1619
1620        assert_eq!(index.fields.len(), 1);
1621        assert_eq!(index.fields[0].name, "embedding");
1622        assert!(matches!(
1623            index.fields[0].field_type,
1624            FieldType::SparseVector
1625        ));
1626
1627        let config = index.fields[0].sparse_vector_config.as_ref().unwrap();
1628        assert_eq!(config.index_size, IndexSize::U16);
1629        assert_eq!(config.weight_quantization, WeightQuantization::UInt8);
1630
1631        // Check query config
1632        let query_config = config.query_config.as_ref().unwrap();
1633        assert_eq!(
1634            query_config.tokenizer.as_deref(),
1635            Some("Alibaba-NLP/gte-Qwen2-1.5B-instruct")
1636        );
1637        assert_eq!(query_config.weighting, QueryWeighting::Idf);
1638
1639        // Verify schema conversion preserves query config
1640        let schema = index.to_schema();
1641        let embedding_field = schema.get_field("embedding").unwrap();
1642        let entry = schema.get_field_entry(embedding_field).unwrap();
1643        let sv_config = entry.sparse_vector_config.as_ref().unwrap();
1644        let qc = sv_config.query_config.as_ref().unwrap();
1645        assert_eq!(
1646            qc.tokenizer.as_deref(),
1647            Some("Alibaba-NLP/gte-Qwen2-1.5B-instruct")
1648        );
1649        assert_eq!(qc.weighting, QueryWeighting::Idf);
1650    }
1651
1652    #[test]
1653    fn test_sparse_vector_query_config_weighting_one() {
1654        use crate::structures::QueryWeighting;
1655
1656        let sdl = r#"
1657            index documents {
1658                field embedding: sparse_vector [indexed<query<weighting: one>>]
1659            }
1660        "#;
1661
1662        let indexes = parse_sdl(sdl).unwrap();
1663        let config = indexes[0].fields[0].sparse_vector_config.as_ref().unwrap();
1664
1665        let query_config = config.query_config.as_ref().unwrap();
1666        assert!(query_config.tokenizer.is_none());
1667        assert_eq!(query_config.weighting, QueryWeighting::One);
1668    }
1669
1670    #[test]
1671    fn test_sparse_vector_query_config_weighting_idf_file() {
1672        use crate::structures::QueryWeighting;
1673
1674        let sdl = r#"
1675            index documents {
1676                field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<tokenizer: "opensearch-neural-sparse-encoding-v1", weighting: idf_file>>]
1677            }
1678        "#;
1679
1680        let indexes = parse_sdl(sdl).unwrap();
1681        let config = indexes[0].fields[0].sparse_vector_config.as_ref().unwrap();
1682
1683        let query_config = config.query_config.as_ref().unwrap();
1684        assert_eq!(
1685            query_config.tokenizer.as_deref(),
1686            Some("opensearch-neural-sparse-encoding-v1")
1687        );
1688        assert_eq!(query_config.weighting, QueryWeighting::IdfFile);
1689
1690        // Verify schema conversion preserves idf_file
1691        let schema = indexes[0].to_schema();
1692        let field = schema.get_field("embedding").unwrap();
1693        let entry = schema.get_field_entry(field).unwrap();
1694        let sc = entry.sparse_vector_config.as_ref().unwrap();
1695        let qc = sc.query_config.as_ref().unwrap();
1696        assert_eq!(qc.weighting, QueryWeighting::IdfFile);
1697    }
1698
1699    #[test]
1700    fn test_sparse_vector_query_config_pruning_params() {
1701        let sdl = r#"
1702            index documents {
1703                field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<weighting: idf, weight_threshold: 0.03, max_dims: 25, pruning: 0.2>>]
1704            }
1705        "#;
1706
1707        let indexes = parse_sdl(sdl).unwrap();
1708        let config = indexes[0].fields[0].sparse_vector_config.as_ref().unwrap();
1709
1710        let qc = config.query_config.as_ref().unwrap();
1711        assert_eq!(qc.weighting, QueryWeighting::Idf);
1712        assert!((qc.weight_threshold - 0.03).abs() < 0.001);
1713        assert_eq!(qc.max_query_dims, Some(25));
1714        assert!((qc.pruning.unwrap() - 0.2).abs() < 0.001);
1715
1716        // Verify schema roundtrip
1717        let schema = indexes[0].to_schema();
1718        let field = schema.get_field("embedding").unwrap();
1719        let entry = schema.get_field_entry(field).unwrap();
1720        let sc = entry.sparse_vector_config.as_ref().unwrap();
1721        let rqc = sc.query_config.as_ref().unwrap();
1722        assert!((rqc.weight_threshold - 0.03).abs() < 0.001);
1723        assert_eq!(rqc.max_query_dims, Some(25));
1724        assert!((rqc.pruning.unwrap() - 0.2).abs() < 0.001);
1725    }
1726
1727    #[test]
1728    fn test_fast_attribute() {
1729        let sdl = r#"
1730            index products {
1731                field name: text [indexed, stored]
1732                field price: f64 [indexed, fast]
1733                field category: text [indexed, stored, fast]
1734                field count: u64 [fast]
1735                field score: i64 [indexed, stored, fast]
1736            }
1737        "#;
1738
1739        let indexes = parse_sdl(sdl).unwrap();
1740        assert_eq!(indexes.len(), 1);
1741        let index = &indexes[0];
1742        assert_eq!(index.fields.len(), 5);
1743
1744        // name: no fast
1745        assert!(!index.fields[0].fast);
1746        // price: fast
1747        assert!(index.fields[1].fast);
1748        assert!(matches!(index.fields[1].field_type, FieldType::F64));
1749        // category: fast text
1750        assert!(index.fields[2].fast);
1751        assert!(matches!(index.fields[2].field_type, FieldType::Text));
1752        // count: fast only
1753        assert!(index.fields[3].fast);
1754        assert!(matches!(index.fields[3].field_type, FieldType::U64));
1755        // score: fast i64
1756        assert!(index.fields[4].fast);
1757        assert!(matches!(index.fields[4].field_type, FieldType::I64));
1758
1759        // Verify schema roundtrip preserves fast flag
1760        let schema = index.to_schema();
1761        let price_field = schema.get_field("price").unwrap();
1762        assert!(schema.get_field_entry(price_field).unwrap().fast);
1763
1764        let category_field = schema.get_field("category").unwrap();
1765        assert!(schema.get_field_entry(category_field).unwrap().fast);
1766
1767        let name_field = schema.get_field("name").unwrap();
1768        assert!(!schema.get_field_entry(name_field).unwrap().fast);
1769    }
1770}