Skip to main content

hermes_core/dsl/sdl/
mod.rs

//! Schema Definition Language (SDL) for Hermes
//!
//! A simple, readable format for defining index schemas, parsed with the pest parser.
//!
//! # Example SDL
//!
//! ```text
//! # Article index schema
//! index articles {
//!     # Primary text field for full-text search
//!     field title: text [indexed, stored]
//!
//!     # Body content - indexed but not stored (save space)
//!     field body: text [indexed]
//!
//!     # Author name
//!     field author: text [indexed, stored]
//!
//!     # Publication timestamp
//!     field published_at: i64 [indexed, stored]
//!
//!     # View count
//!     field views: u64 [indexed, stored]
//!
//!     # Rating score
//!     field rating: f64 [indexed, stored]
//!
//!     # Raw content hash (not indexed, just stored)
//!     field content_hash: bytes [stored]
//!
//!     # Dense vector with IVF-RaBitQ index
//!     field embedding: dense_vector<768> [indexed<rabitq, centroids: "centroids.bin", nprobe: 32>]
//!
//! }
//! ```
//!
//! # Dense Vector Index Configuration
//!
//! Index-related parameters for dense vectors are specified in `indexed<...>`:
//! - `flat`, `rabitq`, `ivf_rabitq`, or `scann` - index type
//! - `centroids: "path"` - path to pre-trained centroids file
//! - `codebook: "path"` - path to PQ codebook (ScaNN only)
//! - `nprobe: N` - number of clusters to probe (default: 32)
44
45use pest::Parser;
46use pest_derive::Parser;
47
48use super::query_field_router::{QueryRouterRule, RoutingMode};
49use super::schema::{DenseVectorQuantization, FieldType, Schema, SchemaBuilder};
50use crate::Result;
51use crate::error::Error;
52
53#[derive(Parser)]
54#[grammar = "dsl/sdl/sdl.pest"]
55pub struct SdlParser;
56
57use super::schema::DenseVectorConfig;
58use crate::structures::{
59    IndexSize, QueryWeighting, SparseQueryConfig, SparseVectorConfig, WeightQuantization,
60};
61
62/// Parsed field definition
63#[derive(Debug, Clone)]
64pub struct FieldDef {
65    pub name: String,
66    pub field_type: FieldType,
67    pub indexed: bool,
68    pub stored: bool,
69    /// Tokenizer name for text fields (e.g., "default", "en_stem", "german")
70    pub tokenizer: Option<String>,
71    /// Whether this field can have multiple values (serialized as array in JSON)
72    pub multi: bool,
73    /// Position tracking mode for phrase queries and multi-field element tracking
74    pub positions: Option<super::schema::PositionMode>,
75    /// Configuration for sparse vector fields
76    pub sparse_vector_config: Option<SparseVectorConfig>,
77    /// Configuration for dense vector fields
78    pub dense_vector_config: Option<DenseVectorConfig>,
79}
80
81/// Parsed index definition
82#[derive(Debug, Clone)]
83pub struct IndexDef {
84    pub name: String,
85    pub fields: Vec<FieldDef>,
86    pub default_fields: Vec<String>,
87    /// Query router rules for routing queries to specific fields
88    pub query_routers: Vec<QueryRouterRule>,
89}
90
91impl IndexDef {
92    /// Convert to a Schema
93    pub fn to_schema(&self) -> Schema {
94        let mut builder = SchemaBuilder::default();
95
96        for field in &self.fields {
97            let f = match field.field_type {
98                FieldType::Text => {
99                    let tokenizer = field.tokenizer.as_deref().unwrap_or("default");
100                    builder.add_text_field_with_tokenizer(
101                        &field.name,
102                        field.indexed,
103                        field.stored,
104                        tokenizer,
105                    )
106                }
107                FieldType::U64 => builder.add_u64_field(&field.name, field.indexed, field.stored),
108                FieldType::I64 => builder.add_i64_field(&field.name, field.indexed, field.stored),
109                FieldType::F64 => builder.add_f64_field(&field.name, field.indexed, field.stored),
110                FieldType::Bytes => builder.add_bytes_field(&field.name, field.stored),
111                FieldType::Json => builder.add_json_field(&field.name, field.stored),
112                FieldType::SparseVector => {
113                    if let Some(config) = &field.sparse_vector_config {
114                        builder.add_sparse_vector_field_with_config(
115                            &field.name,
116                            field.indexed,
117                            field.stored,
118                            config.clone(),
119                        )
120                    } else {
121                        builder.add_sparse_vector_field(&field.name, field.indexed, field.stored)
122                    }
123                }
124                FieldType::DenseVector => {
125                    // Dense vector dimension must be specified via config
126                    let config = field
127                        .dense_vector_config
128                        .as_ref()
129                        .expect("DenseVector field requires dimension to be specified");
130                    builder.add_dense_vector_field_with_config(
131                        &field.name,
132                        field.indexed,
133                        field.stored,
134                        config.clone(),
135                    )
136                }
137            };
138            if field.multi {
139                builder.set_multi(f, true);
140            }
141            // Set positions: explicit > auto (ordinal for multi vectors)
142            let positions = field.positions.or({
143                // Auto-set ordinal positions for multi-valued vector fields
144                if field.multi
145                    && matches!(
146                        field.field_type,
147                        FieldType::SparseVector | FieldType::DenseVector
148                    )
149                {
150                    Some(super::schema::PositionMode::Ordinal)
151                } else {
152                    None
153                }
154            });
155            if let Some(mode) = positions {
156                builder.set_positions(f, mode);
157            }
158        }
159
160        // Set default fields if specified
161        if !self.default_fields.is_empty() {
162            builder.set_default_fields(self.default_fields.clone());
163        }
164
165        // Set query routers if specified
166        if !self.query_routers.is_empty() {
167            builder.set_query_routers(self.query_routers.clone());
168        }
169
170        builder.build()
171    }
172
173    /// Create a QueryFieldRouter from the query router rules
174    ///
175    /// Returns None if there are no query router rules defined.
176    /// Returns Err if any regex pattern is invalid.
177    pub fn to_query_router(&self) -> Result<Option<super::query_field_router::QueryFieldRouter>> {
178        if self.query_routers.is_empty() {
179            return Ok(None);
180        }
181
182        super::query_field_router::QueryFieldRouter::from_rules(&self.query_routers)
183            .map(Some)
184            .map_err(Error::Schema)
185    }
186}
187
188/// Parse field type from string
189fn parse_field_type(type_str: &str) -> Result<FieldType> {
190    match type_str {
191        "text" | "string" | "str" => Ok(FieldType::Text),
192        "u64" | "uint" | "unsigned" => Ok(FieldType::U64),
193        "i64" | "int" | "integer" => Ok(FieldType::I64),
194        "f64" | "float" | "double" => Ok(FieldType::F64),
195        "bytes" | "binary" | "blob" => Ok(FieldType::Bytes),
196        "json" => Ok(FieldType::Json),
197        "sparse_vector" => Ok(FieldType::SparseVector),
198        "dense_vector" | "vector" => Ok(FieldType::DenseVector),
199        _ => Err(Error::Schema(format!("Unknown field type: {}", type_str))),
200    }
201}
202
203/// Index configuration parsed from indexed<...> attribute
204#[derive(Debug, Clone, Default)]
205struct IndexConfig {
206    index_type: Option<super::schema::VectorIndexType>,
207    num_clusters: Option<usize>,
208    nprobe: Option<usize>,
209    build_threshold: Option<usize>,
210    // Sparse vector index params
211    quantization: Option<WeightQuantization>,
212    weight_threshold: Option<f32>,
213    block_size: Option<usize>,
214    posting_list_pruning: Option<f32>,
215    // Sparse vector query-time config
216    query_tokenizer: Option<String>,
217    query_weighting: Option<QueryWeighting>,
218    // Position tracking mode for phrase queries
219    positions: Option<super::schema::PositionMode>,
220}
221
222/// Parse attributes from pest pair
223/// Returns (indexed, stored, multi, index_config)
224/// positions is now inside index_config (via indexed<positions> or indexed<ordinal> etc.)
225/// multi is now inside stored<multi>
226fn parse_attributes(pair: pest::iterators::Pair<Rule>) -> (bool, bool, bool, Option<IndexConfig>) {
227    let mut indexed = false;
228    let mut stored = false;
229    let mut multi = false;
230    let mut index_config = None;
231
232    for attr in pair.into_inner() {
233        if attr.as_rule() == Rule::attribute {
234            // attribute = { indexed_with_config | "indexed" | stored_with_config | "stored" }
235            let mut found_config = false;
236            for inner in attr.clone().into_inner() {
237                match inner.as_rule() {
238                    Rule::indexed_with_config => {
239                        indexed = true;
240                        index_config = Some(parse_index_config(inner));
241                        found_config = true;
242                        break;
243                    }
244                    Rule::stored_with_config => {
245                        stored = true;
246                        multi = true; // stored<multi>
247                        found_config = true;
248                        break;
249                    }
250                    _ => {}
251                }
252            }
253            if !found_config {
254                // Simple attribute
255                match attr.as_str() {
256                    "indexed" => indexed = true,
257                    "stored" => stored = true,
258                    _ => {}
259                }
260            }
261        }
262    }
263
264    (indexed, stored, multi, index_config)
265}
266
267/// Parse index configuration from indexed<...> attribute
268fn parse_index_config(pair: pest::iterators::Pair<Rule>) -> IndexConfig {
269    let mut config = IndexConfig::default();
270
271    // indexed_with_config = { "indexed" ~ "<" ~ index_config_params ~ ">" }
272    // index_config_params = { index_config_param ~ ("," ~ index_config_param)* }
273    // index_config_param = { index_type_kwarg | centroids_kwarg | codebook_kwarg | nprobe_kwarg | index_type_spec }
274
275    for inner in pair.into_inner() {
276        if inner.as_rule() == Rule::index_config_params {
277            for param in inner.into_inner() {
278                if param.as_rule() == Rule::index_config_param {
279                    for p in param.into_inner() {
280                        parse_single_index_config_param(&mut config, p);
281                    }
282                }
283            }
284        }
285    }
286
287    config
288}
289
290/// Parse a single index config parameter
291fn parse_single_index_config_param(config: &mut IndexConfig, p: pest::iterators::Pair<Rule>) {
292    use super::schema::VectorIndexType;
293
294    match p.as_rule() {
295        Rule::index_type_spec => {
296            config.index_type = Some(match p.as_str() {
297                "flat" => VectorIndexType::Flat,
298                "rabitq" => VectorIndexType::RaBitQ,
299                "ivf_rabitq" => VectorIndexType::IvfRaBitQ,
300                "scann" => VectorIndexType::ScaNN,
301                _ => VectorIndexType::RaBitQ,
302            });
303        }
304        Rule::index_type_kwarg => {
305            // index_type_kwarg = { "index" ~ ":" ~ index_type_spec }
306            if let Some(t) = p.into_inner().next() {
307                config.index_type = Some(match t.as_str() {
308                    "flat" => VectorIndexType::Flat,
309                    "rabitq" => VectorIndexType::RaBitQ,
310                    "ivf_rabitq" => VectorIndexType::IvfRaBitQ,
311                    "scann" => VectorIndexType::ScaNN,
312                    _ => VectorIndexType::RaBitQ,
313                });
314            }
315        }
316        Rule::num_clusters_kwarg => {
317            // num_clusters_kwarg = { "num_clusters" ~ ":" ~ num_clusters_spec }
318            if let Some(n) = p.into_inner().next() {
319                config.num_clusters = Some(n.as_str().parse().unwrap_or(256));
320            }
321        }
322        Rule::build_threshold_kwarg => {
323            // build_threshold_kwarg = { "build_threshold" ~ ":" ~ build_threshold_spec }
324            if let Some(n) = p.into_inner().next() {
325                config.build_threshold = Some(n.as_str().parse().unwrap_or(10000));
326            }
327        }
328        Rule::nprobe_kwarg => {
329            // nprobe_kwarg = { "nprobe" ~ ":" ~ nprobe_spec }
330            if let Some(n) = p.into_inner().next() {
331                config.nprobe = Some(n.as_str().parse().unwrap_or(32));
332            }
333        }
334        Rule::quantization_kwarg => {
335            // quantization_kwarg = { "quantization" ~ ":" ~ quantization_spec }
336            if let Some(q) = p.into_inner().next() {
337                config.quantization = Some(match q.as_str() {
338                    "float32" | "f32" => WeightQuantization::Float32,
339                    "float16" | "f16" => WeightQuantization::Float16,
340                    "uint8" | "u8" => WeightQuantization::UInt8,
341                    "uint4" | "u4" => WeightQuantization::UInt4,
342                    _ => WeightQuantization::default(),
343                });
344            }
345        }
346        Rule::weight_threshold_kwarg => {
347            // weight_threshold_kwarg = { "weight_threshold" ~ ":" ~ weight_threshold_spec }
348            if let Some(t) = p.into_inner().next() {
349                config.weight_threshold = Some(t.as_str().parse().unwrap_or(0.0));
350            }
351        }
352        Rule::block_size_kwarg => {
353            // block_size_kwarg = { "block_size" ~ ":" ~ block_size_spec }
354            if let Some(n) = p.into_inner().next() {
355                config.block_size = Some(n.as_str().parse().unwrap_or(128));
356            }
357        }
358        Rule::pruning_kwarg => {
359            // pruning_kwarg = { "pruning" ~ ":" ~ pruning_spec }
360            if let Some(f) = p.into_inner().next() {
361                config.posting_list_pruning = Some(f.as_str().parse().unwrap_or(1.0));
362            }
363        }
364        Rule::query_config_block => {
365            // query_config_block = { "query" ~ "<" ~ query_config_params ~ ">" }
366            parse_query_config_block(config, p);
367        }
368        Rule::positions_kwarg => {
369            // positions_kwarg = { "positions" | "ordinal" | "token_position" }
370            use super::schema::PositionMode;
371            config.positions = Some(match p.as_str() {
372                "ordinal" => PositionMode::Ordinal,
373                "token_position" => PositionMode::TokenPosition,
374                _ => PositionMode::Full, // "positions" or any other value defaults to Full
375            });
376        }
377        _ => {}
378    }
379}
380
381/// Parse query configuration block: query<tokenizer: "...", weighting: idf>
382fn parse_query_config_block(config: &mut IndexConfig, pair: pest::iterators::Pair<Rule>) {
383    for inner in pair.into_inner() {
384        if inner.as_rule() == Rule::query_config_params {
385            for param in inner.into_inner() {
386                if param.as_rule() == Rule::query_config_param {
387                    for p in param.into_inner() {
388                        match p.as_rule() {
389                            Rule::query_tokenizer_kwarg => {
390                                // query_tokenizer_kwarg = { "tokenizer" ~ ":" ~ tokenizer_path }
391                                if let Some(path) = p.into_inner().next()
392                                    && let Some(inner_path) = path.into_inner().next()
393                                {
394                                    config.query_tokenizer = Some(inner_path.as_str().to_string());
395                                }
396                            }
397                            Rule::query_weighting_kwarg => {
398                                // query_weighting_kwarg = { "weighting" ~ ":" ~ weighting_spec }
399                                if let Some(w) = p.into_inner().next() {
400                                    config.query_weighting = Some(match w.as_str() {
401                                        "one" => QueryWeighting::One,
402                                        "idf" => QueryWeighting::Idf,
403                                        "idf_file" => QueryWeighting::IdfFile,
404                                        _ => QueryWeighting::One,
405                                    });
406                                }
407                            }
408                            _ => {}
409                        }
410                    }
411                }
412            }
413        }
414    }
415}
416
417/// Parse a field definition from pest pair
418fn parse_field_def(pair: pest::iterators::Pair<Rule>) -> Result<FieldDef> {
419    let mut inner = pair.into_inner();
420
421    let name = inner
422        .next()
423        .ok_or_else(|| Error::Schema("Missing field name".to_string()))?
424        .as_str()
425        .to_string();
426
427    let field_type_str = inner
428        .next()
429        .ok_or_else(|| Error::Schema("Missing field type".to_string()))?
430        .as_str();
431
432    let field_type = parse_field_type(field_type_str)?;
433
434    // Parse optional tokenizer spec, sparse_vector_config, dense_vector_config, and attributes
435    let mut tokenizer = None;
436    let mut sparse_vector_config = None;
437    let mut dense_vector_config = None;
438    let mut indexed = true;
439    let mut stored = true;
440    let mut multi = false;
441    let mut index_config: Option<IndexConfig> = None;
442
443    for item in inner {
444        match item.as_rule() {
445            Rule::tokenizer_spec => {
446                // Extract tokenizer name from <name>
447                if let Some(tok_name) = item.into_inner().next() {
448                    tokenizer = Some(tok_name.as_str().to_string());
449                }
450            }
451            Rule::sparse_vector_config => {
452                // Parse named parameters: <index_size: u16, quantization: uint8, weight_threshold: 0.1>
453                sparse_vector_config = Some(parse_sparse_vector_config(item));
454            }
455            Rule::dense_vector_config => {
456                // Parse dense_vector_params (keyword or positional) - only dims
457                dense_vector_config = Some(parse_dense_vector_config(item));
458            }
459            Rule::attributes => {
460                let (idx, sto, mul, idx_cfg) = parse_attributes(item);
461                indexed = idx;
462                stored = sto;
463                multi = mul;
464                index_config = idx_cfg;
465            }
466            _ => {}
467        }
468    }
469
470    // Merge index config into vector configs if both exist
471    let mut positions = None;
472    if let Some(idx_cfg) = index_config {
473        positions = idx_cfg.positions;
474        if let Some(ref mut dv_config) = dense_vector_config {
475            apply_index_config_to_dense_vector(dv_config, idx_cfg);
476        } else if field_type == FieldType::SparseVector {
477            // For sparse vectors, create default config if not present and apply index params
478            let sv_config = sparse_vector_config.get_or_insert(SparseVectorConfig::default());
479            apply_index_config_to_sparse_vector(sv_config, idx_cfg);
480        }
481    }
482
483    Ok(FieldDef {
484        name,
485        field_type,
486        indexed,
487        stored,
488        tokenizer,
489        multi,
490        positions,
491        sparse_vector_config,
492        dense_vector_config,
493    })
494}
495
496/// Apply index configuration from indexed<...> to DenseVectorConfig
497fn apply_index_config_to_dense_vector(config: &mut DenseVectorConfig, idx_cfg: IndexConfig) {
498    // Apply index type if specified
499    if let Some(index_type) = idx_cfg.index_type {
500        config.index_type = index_type;
501    }
502
503    // Apply num_clusters for IVF-based indexes
504    if idx_cfg.num_clusters.is_some() {
505        config.num_clusters = idx_cfg.num_clusters;
506    }
507
508    // Apply nprobe if specified
509    if let Some(nprobe) = idx_cfg.nprobe {
510        config.nprobe = nprobe;
511    }
512
513    // Apply build_threshold if specified
514    if idx_cfg.build_threshold.is_some() {
515        config.build_threshold = idx_cfg.build_threshold;
516    }
517}
518
519/// Parse sparse_vector_config - only index_size (positional)
520/// Example: <u16> or <u32>
521fn parse_sparse_vector_config(pair: pest::iterators::Pair<Rule>) -> SparseVectorConfig {
522    let mut index_size = IndexSize::default();
523
524    // Parse positional index_size_spec
525    for inner in pair.into_inner() {
526        if inner.as_rule() == Rule::index_size_spec {
527            index_size = match inner.as_str() {
528                "u16" => IndexSize::U16,
529                "u32" => IndexSize::U32,
530                _ => IndexSize::default(),
531            };
532        }
533    }
534
535    SparseVectorConfig {
536        index_size,
537        weight_quantization: WeightQuantization::default(),
538        weight_threshold: 0.0,
539        block_size: 128,
540        posting_list_pruning: None,
541        query_config: None,
542    }
543}
544
545/// Apply index configuration from indexed<...> to SparseVectorConfig
546fn apply_index_config_to_sparse_vector(config: &mut SparseVectorConfig, idx_cfg: IndexConfig) {
547    if let Some(q) = idx_cfg.quantization {
548        config.weight_quantization = q;
549    }
550    if let Some(t) = idx_cfg.weight_threshold {
551        config.weight_threshold = t;
552    }
553    if let Some(bs) = idx_cfg.block_size {
554        let adjusted = bs.next_power_of_two();
555        if adjusted != bs {
556            log::warn!(
557                "block_size {} adjusted to next power of two: {}",
558                bs,
559                adjusted
560            );
561        }
562        config.block_size = adjusted;
563    }
564    if let Some(p) = idx_cfg.posting_list_pruning {
565        let clamped = p.clamp(0.0, 1.0);
566        if (clamped - p).abs() > f32::EPSILON {
567            log::warn!(
568                "pruning {} clamped to valid range [0.0, 1.0]: {}",
569                p,
570                clamped
571            );
572        }
573        config.posting_list_pruning = Some(clamped);
574    }
575    // Apply query-time configuration if present
576    if idx_cfg.query_tokenizer.is_some() || idx_cfg.query_weighting.is_some() {
577        let query_config = config
578            .query_config
579            .get_or_insert(SparseQueryConfig::default());
580        if let Some(tokenizer) = idx_cfg.query_tokenizer {
581            query_config.tokenizer = Some(tokenizer);
582        }
583        if let Some(weighting) = idx_cfg.query_weighting {
584            query_config.weighting = weighting;
585        }
586    }
587}
588
589/// Parse dense_vector_config - dims and optional quantization type
590/// All index-related params are in indexed<...> attribute
591fn parse_dense_vector_config(pair: pest::iterators::Pair<Rule>) -> DenseVectorConfig {
592    let mut dim: usize = 0;
593    let mut quantization = DenseVectorQuantization::F32;
594
595    // Navigate to dense_vector_params
596    for params in pair.into_inner() {
597        if params.as_rule() == Rule::dense_vector_params {
598            for inner in params.into_inner() {
599                match inner.as_rule() {
600                    Rule::dense_vector_keyword_params => {
601                        for kwarg in inner.into_inner() {
602                            match kwarg.as_rule() {
603                                Rule::dims_kwarg => {
604                                    if let Some(d) = kwarg.into_inner().next() {
605                                        dim = d.as_str().parse().unwrap_or(0);
606                                    }
607                                }
608                                Rule::quant_type_spec => {
609                                    quantization = parse_quant_type(kwarg.as_str());
610                                }
611                                _ => {}
612                            }
613                        }
614                    }
615                    Rule::dense_vector_positional_params => {
616                        for item in inner.into_inner() {
617                            match item.as_rule() {
618                                Rule::dimension_spec => {
619                                    dim = item.as_str().parse().unwrap_or(0);
620                                }
621                                Rule::quant_type_spec => {
622                                    quantization = parse_quant_type(item.as_str());
623                                }
624                                _ => {}
625                            }
626                        }
627                    }
628                    _ => {}
629                }
630            }
631        }
632    }
633
634    DenseVectorConfig::new(dim).with_quantization(quantization)
635}
636
637fn parse_quant_type(s: &str) -> DenseVectorQuantization {
638    match s.trim() {
639        "f16" => DenseVectorQuantization::F16,
640        "uint8" | "u8" => DenseVectorQuantization::UInt8,
641        _ => DenseVectorQuantization::F32,
642    }
643}
644
645/// Parse default_fields definition
646fn parse_default_fields_def(pair: pest::iterators::Pair<Rule>) -> Vec<String> {
647    pair.into_inner().map(|p| p.as_str().to_string()).collect()
648}
649
650/// Parse a query router definition
651fn parse_query_router_def(pair: pest::iterators::Pair<Rule>) -> Result<QueryRouterRule> {
652    let mut pattern = String::new();
653    let mut substitution = String::new();
654    let mut target_field = String::new();
655    let mut mode = RoutingMode::Additional;
656
657    for prop in pair.into_inner() {
658        if prop.as_rule() != Rule::query_router_prop {
659            continue;
660        }
661
662        for inner in prop.into_inner() {
663            match inner.as_rule() {
664                Rule::query_router_pattern => {
665                    if let Some(regex_str) = inner.into_inner().next() {
666                        pattern = parse_string_value(regex_str);
667                    }
668                }
669                Rule::query_router_substitution => {
670                    if let Some(quoted) = inner.into_inner().next() {
671                        substitution = parse_string_value(quoted);
672                    }
673                }
674                Rule::query_router_target => {
675                    if let Some(ident) = inner.into_inner().next() {
676                        target_field = ident.as_str().to_string();
677                    }
678                }
679                Rule::query_router_mode => {
680                    if let Some(mode_val) = inner.into_inner().next() {
681                        mode = match mode_val.as_str() {
682                            "exclusive" => RoutingMode::Exclusive,
683                            "additional" => RoutingMode::Additional,
684                            _ => RoutingMode::Additional,
685                        };
686                    }
687                }
688                _ => {}
689            }
690        }
691    }
692
693    if pattern.is_empty() {
694        return Err(Error::Schema("query_router missing 'pattern'".to_string()));
695    }
696    if substitution.is_empty() {
697        return Err(Error::Schema(
698            "query_router missing 'substitution'".to_string(),
699        ));
700    }
701    if target_field.is_empty() {
702        return Err(Error::Schema(
703            "query_router missing 'target_field'".to_string(),
704        ));
705    }
706
707    Ok(QueryRouterRule {
708        pattern,
709        substitution,
710        target_field,
711        mode,
712    })
713}
714
715/// Parse a string value from quoted_string, raw_string, or regex_string
716fn parse_string_value(pair: pest::iterators::Pair<Rule>) -> String {
717    let s = pair.as_str();
718    match pair.as_rule() {
719        Rule::regex_string => {
720            // regex_string contains either raw_string or quoted_string
721            if let Some(inner) = pair.into_inner().next() {
722                parse_string_value(inner)
723            } else {
724                s.to_string()
725            }
726        }
727        Rule::raw_string => {
728            // r"..." - strip r" prefix and " suffix
729            s[2..s.len() - 1].to_string()
730        }
731        Rule::quoted_string => {
732            // "..." - strip quotes and handle escapes
733            let inner = &s[1..s.len() - 1];
734            // Simple escape handling
735            inner
736                .replace("\\n", "\n")
737                .replace("\\t", "\t")
738                .replace("\\\"", "\"")
739                .replace("\\\\", "\\")
740        }
741        _ => s.to_string(),
742    }
743}
744
745/// Parse an index definition from pest pair
746fn parse_index_def(pair: pest::iterators::Pair<Rule>) -> Result<IndexDef> {
747    let mut inner = pair.into_inner();
748
749    let name = inner
750        .next()
751        .ok_or_else(|| Error::Schema("Missing index name".to_string()))?
752        .as_str()
753        .to_string();
754
755    let mut fields = Vec::new();
756    let mut default_fields = Vec::new();
757    let mut query_routers = Vec::new();
758
759    for item in inner {
760        match item.as_rule() {
761            Rule::field_def => {
762                fields.push(parse_field_def(item)?);
763            }
764            Rule::default_fields_def => {
765                default_fields = parse_default_fields_def(item);
766            }
767            Rule::query_router_def => {
768                query_routers.push(parse_query_router_def(item)?);
769            }
770            _ => {}
771        }
772    }
773
774    Ok(IndexDef {
775        name,
776        fields,
777        default_fields,
778        query_routers,
779    })
780}
781
782/// Parse SDL from a string
783pub fn parse_sdl(input: &str) -> Result<Vec<IndexDef>> {
784    let pairs = SdlParser::parse(Rule::file, input)
785        .map_err(|e| Error::Schema(format!("Parse error: {}", e)))?;
786
787    let mut indexes = Vec::new();
788
789    for pair in pairs {
790        if pair.as_rule() == Rule::file {
791            for inner in pair.into_inner() {
792                if inner.as_rule() == Rule::index_def {
793                    indexes.push(parse_index_def(inner)?);
794                }
795            }
796        }
797    }
798
799    Ok(indexes)
800}
801
802/// Parse SDL and return a single index definition
803pub fn parse_single_index(input: &str) -> Result<IndexDef> {
804    let indexes = parse_sdl(input)?;
805
806    if indexes.is_empty() {
807        return Err(Error::Schema("No index definition found".to_string()));
808    }
809
810    if indexes.len() > 1 {
811        return Err(Error::Schema(
812            "Multiple index definitions found, expected one".to_string(),
813        ));
814    }
815
816    Ok(indexes.into_iter().next().unwrap())
817}
818
#[cfg(test)]
mod tests {
    use super::*;
    use crate::dsl::schema::VectorIndexType;

    /// Parses `source` and returns every index it defines, panicking on invalid SDL.
    fn parse_ok(source: &str) -> Vec<IndexDef> {
        parse_sdl(source).unwrap()
    }

    /// Parses `source` and returns only the first index definition it contains.
    fn first_index(source: &str) -> IndexDef {
        parse_ok(source).into_iter().next().unwrap()
    }

    #[test]
    fn test_parse_simple_schema() {
        let source = r#"
            index articles {
                field title: text [indexed, stored]
                field body: text [indexed]
            }
        "#;

        let indexes = parse_ok(source);
        assert_eq!(indexes.len(), 1);

        let articles = &indexes[0];
        assert_eq!(articles.name, "articles");
        assert_eq!(articles.fields.len(), 2);

        // (name, indexed, stored) for each field, in declaration order.
        let expected = [("title", true, true), ("body", true, false)];
        for (position, (name, indexed, stored)) in expected.iter().enumerate() {
            let field = &articles.fields[position];
            assert_eq!(field.name, *name);
            assert!(matches!(field.field_type, FieldType::Text));
            assert_eq!(field.indexed, *indexed);
            assert_eq!(field.stored, *stored);
        }
    }

    #[test]
    fn test_parse_all_field_types() {
        let source = r#"
            index test {
                field text_field: text [indexed, stored]
                field u64_field: u64 [indexed, stored]
                field i64_field: i64 [indexed, stored]
                field f64_field: f64 [indexed, stored]
                field bytes_field: bytes [stored]
            }
        "#;

        let index = first_index(source);

        // Expected field types, in declaration order.
        let expected = [
            FieldType::Text,
            FieldType::U64,
            FieldType::I64,
            FieldType::F64,
            FieldType::Bytes,
        ];
        for (position, wanted) in expected.iter().enumerate() {
            assert_eq!(&index.fields[position].field_type, wanted);
        }
    }

    #[test]
    fn test_parse_with_comments() {
        let source = r#"
            # This is a comment
            index articles {
                # Title field
                field title: text [indexed, stored]
                field body: text [indexed] # inline comment not supported yet
            }
        "#;

        assert_eq!(first_index(source).fields.len(), 2);
    }

    #[test]
    fn test_parse_type_aliases() {
        let source = r#"
            index test {
                field a: string [indexed]
                field b: int [indexed]
                field c: uint [indexed]
                field d: float [indexed]
                field e: binary [stored]
            }
        "#;

        let index = first_index(source);

        // Each alias must resolve to the canonical field type listed here.
        let expected = [
            FieldType::Text,
            FieldType::I64,
            FieldType::U64,
            FieldType::F64,
            FieldType::Bytes,
        ];
        for (position, wanted) in expected.iter().enumerate() {
            assert_eq!(&index.fields[position].field_type, wanted);
        }
    }

    #[test]
    fn test_to_schema() {
        let source = r#"
            index articles {
                field title: text [indexed, stored]
                field views: u64 [indexed, stored]
            }
        "#;

        let index = first_index(source);
        let schema = index.to_schema();

        for declared in ["title", "views"] {
            assert!(schema.get_field(declared).is_some());
        }
        assert!(schema.get_field("nonexistent").is_none());
    }

    #[test]
    fn test_default_attributes() {
        let source = r#"
            index test {
                field title: text
            }
        "#;

        let index = first_index(source);
        let title = &index.fields[0];

        // With no attribute list, a field is both indexed and stored.
        assert_eq!((title.indexed, title.stored), (true, true));
    }

    #[test]
    fn test_multiple_indexes() {
        let source = r#"
            index articles {
                field title: text [indexed, stored]
            }

            index users {
                field name: text [indexed, stored]
                field email: text [indexed, stored]
            }
        "#;

        let indexes = parse_ok(source);
        let names: Vec<&str> = indexes.iter().map(|index| index.name.as_str()).collect();

        // Checks both the number of indexes and their order.
        assert_eq!(names, ["articles", "users"]);
    }

    #[test]
    fn test_tokenizer_spec() {
        let source = r#"
            index articles {
                field title: text<en_stem> [indexed, stored]
                field body: text<default> [indexed]
                field author: text [indexed, stored]
            }
        "#;

        let index = first_index(source);

        // (name, tokenizer); `None` means no tokenizer was specified.
        let expected = [
            ("title", Some("en_stem")),
            ("body", Some("default")),
            ("author", None),
        ];
        for (position, (name, tokenizer)) in expected.iter().enumerate() {
            let field = &index.fields[position];
            assert_eq!(field.name, *name);
            assert_eq!(field.tokenizer.as_deref(), *tokenizer);
        }
    }

    #[test]
    fn test_tokenizer_in_schema() {
        let source = r#"
            index articles {
                field title: text<german> [indexed, stored]
                field body: text<en_stem> [indexed]
            }
        "#;

        let index = first_index(source);
        let schema = index.to_schema();

        for (name, tokenizer) in [("title", "german"), ("body", "en_stem")] {
            let handle = schema.get_field(name).unwrap();
            let entry = schema.get_field_entry(handle).unwrap();
            assert_eq!(entry.tokenizer, Some(tokenizer.to_string()));
        }
    }

    #[test]
    fn test_query_router_basic() {
        let source = r#"
            index documents {
                field title: text [indexed, stored]
                field uri: text [indexed, stored]

                query_router {
                    pattern: "10\\.\\d{4,}/[^\\s]+"
                    substitution: "doi://{0}"
                    target_field: uris
                    mode: exclusive
                }
            }
        "#;

        let index = first_index(source);
        assert_eq!(index.query_routers.len(), 1);

        // The escaped pattern in the quoted string must come out unescaped.
        let rule = &index.query_routers[0];
        assert_eq!(rule.pattern, r"10\.\d{4,}/[^\s]+");
        assert_eq!(rule.substitution, "doi://{0}");
        assert_eq!(rule.target_field, "uris");
        assert_eq!(rule.mode, RoutingMode::Exclusive);
    }

    #[test]
    fn test_query_router_raw_string() {
        let source = r#"
            index documents {
                field uris: text [indexed, stored]

                query_router {
                    pattern: r"^pmid:(\d+)$"
                    substitution: "pubmed://{1}"
                    target_field: uris
                    mode: additional
                }
            }
        "#;

        let index = first_index(source);
        let rule = &index.query_routers[0];

        assert_eq!(rule.pattern, r"^pmid:(\d+)$");
        assert_eq!(rule.substitution, "pubmed://{1}");
        assert_eq!(rule.mode, RoutingMode::Additional);
    }

    #[test]
    fn test_multiple_query_routers() {
        let source = r#"
            index documents {
                field uris: text [indexed, stored]

                query_router {
                    pattern: r"^doi:(10\.\d{4,}/[^\s]+)$"
                    substitution: "doi://{1}"
                    target_field: uris
                    mode: exclusive
                }

                query_router {
                    pattern: r"^pmid:(\d+)$"
                    substitution: "pubmed://{1}"
                    target_field: uris
                    mode: exclusive
                }

                query_router {
                    pattern: r"^arxiv:(\d+\.\d+)$"
                    substitution: "arxiv://{1}"
                    target_field: uris
                    mode: additional
                }
            }
        "#;

        assert_eq!(first_index(source).query_routers.len(), 3);
    }

    #[test]
    fn test_query_router_default_mode() {
        let source = r#"
            index documents {
                field uris: text [indexed, stored]

                query_router {
                    pattern: r"test"
                    substitution: "{0}"
                    target_field: uris
                }
            }
        "#;

        let index = first_index(source);

        // Omitting `mode` must yield the additional routing mode.
        assert_eq!(index.query_routers[0].mode, RoutingMode::Additional);
    }

    #[test]
    fn test_multi_attribute() {
        let source = r#"
            index documents {
                field uris: text [indexed, stored<multi>]
                field title: text [indexed, stored]
            }
        "#;

        let indexes = parse_ok(source);
        assert_eq!(indexes.len(), 1);

        let index = &indexes[0];
        assert_eq!(index.fields.len(), 2);

        // Only the field declared with `stored<multi>` carries the flag.
        let (uris, title) = (&index.fields[0], &index.fields[1]);
        assert_eq!(uris.name, "uris");
        assert!(uris.multi, "uris field should have multi=true");
        assert_eq!(title.name, "title");
        assert!(!title.multi, "title field should have multi=false");

        // The flag must survive conversion into a schema.
        let schema = index.to_schema();
        let multi_of = |name: &str| {
            let handle = schema.get_field(name).unwrap();
            schema.get_field_entry(handle).unwrap().multi
        };
        assert!(multi_of("uris"));
        assert!(!multi_of("title"));
    }

    #[test]
    fn test_sparse_vector_field() {
        let source = r#"
            index documents {
                field embedding: sparse_vector [indexed, stored]
            }
        "#;

        let indexes = parse_ok(source);
        assert_eq!(indexes.len(), 1);
        assert_eq!(indexes[0].fields.len(), 1);

        let embedding = &indexes[0].fields[0];
        assert_eq!(embedding.name, "embedding");
        assert_eq!(embedding.field_type, FieldType::SparseVector);
        assert!(embedding.sparse_vector_config.is_none());
    }

    #[test]
    fn test_sparse_vector_with_config() {
        let source = r#"
            index documents {
                field embedding: sparse_vector<u16> [indexed<quantization: uint8>, stored]
                field dense: sparse_vector<u32> [indexed<quantization: float32>]
            }
        "#;

        let index = first_index(source);
        assert_eq!(index.fields.len(), 2);

        // (name, index size, weight quantization) per field.
        let expected = [
            ("embedding", IndexSize::U16, WeightQuantization::UInt8),
            ("dense", IndexSize::U32, WeightQuantization::Float32),
        ];
        for (position, (name, index_size, quantization)) in expected.iter().enumerate() {
            let field = &index.fields[position];
            assert_eq!(field.name, *name);

            let config = field.sparse_vector_config.as_ref().unwrap();
            assert_eq!(&config.index_size, index_size);
            assert_eq!(&config.weight_quantization, quantization);
        }
    }

    #[test]
    fn test_sparse_vector_with_weight_threshold() {
        let source = r#"
            index documents {
                field embedding: sparse_vector<u16> [indexed<quantization: uint8, weight_threshold: 0.1>, stored]
                field embedding2: sparse_vector<u32> [indexed<quantization: float16, weight_threshold: 0.05>]
            }
        "#;

        let index = first_index(source);
        assert_eq!(index.fields.len(), 2);

        // u16 indices, uint8 quantization, threshold 0.1
        let first = &index.fields[0];
        assert_eq!(first.name, "embedding");
        let first_config = first.sparse_vector_config.as_ref().unwrap();
        assert_eq!(first_config.index_size, IndexSize::U16);
        assert_eq!(first_config.weight_quantization, WeightQuantization::UInt8);
        assert!((first_config.weight_threshold - 0.1).abs() < 0.001);

        // u32 indices, float16 quantization, threshold 0.05
        let second = &index.fields[1];
        assert_eq!(second.name, "embedding2");
        let second_config = second.sparse_vector_config.as_ref().unwrap();
        assert_eq!(second_config.index_size, IndexSize::U32);
        assert_eq!(second_config.weight_quantization, WeightQuantization::Float16);
        assert!((second_config.weight_threshold - 0.05).abs() < 0.001);
    }

    #[test]
    fn test_sparse_vector_with_pruning() {
        let source = r#"
            index documents {
                field embedding: sparse_vector [indexed<quantization: uint8, pruning: 0.1>, stored]
            }
        "#;

        let index = first_index(source);
        let embedding = &index.fields[0];
        assert_eq!(embedding.name, "embedding");

        let config = embedding.sparse_vector_config.as_ref().unwrap();
        assert_eq!(config.weight_quantization, WeightQuantization::UInt8);
        assert_eq!(config.posting_list_pruning, Some(0.1));
    }

    #[test]
    fn test_dense_vector_field() {
        let source = r#"
            index documents {
                field embedding: dense_vector<768> [indexed, stored]
            }
        "#;

        let indexes = parse_ok(source);
        assert_eq!(indexes.len(), 1);
        assert_eq!(indexes[0].fields.len(), 1);

        let embedding = &indexes[0].fields[0];
        assert_eq!(embedding.name, "embedding");
        assert_eq!(embedding.field_type, FieldType::DenseVector);
        assert_eq!(embedding.dense_vector_config.as_ref().unwrap().dim, 768);
    }

    #[test]
    fn test_dense_vector_alias() {
        let source = r#"
            index documents {
                field embedding: vector<1536> [indexed]
            }
        "#;

        let index = first_index(source);
        let embedding = &index.fields[0];

        // `vector` is accepted as an alias for `dense_vector`.
        assert_eq!(embedding.field_type, FieldType::DenseVector);
        assert_eq!(embedding.dense_vector_config.as_ref().unwrap().dim, 1536);
    }

    #[test]
    fn test_dense_vector_with_num_clusters() {
        let source = r#"
            index documents {
                field embedding: dense_vector<768> [indexed<ivf_rabitq, num_clusters: 256>, stored]
            }
        "#;

        let indexes = parse_ok(source);
        assert_eq!(indexes.len(), 1);

        let embedding = &indexes[0].fields[0];
        assert_eq!(embedding.name, "embedding");
        assert_eq!(embedding.field_type, FieldType::DenseVector);

        let config = embedding.dense_vector_config.as_ref().unwrap();
        // nprobe is not given in the SDL, so 32 is the default.
        assert_eq!(
            (config.dim, config.num_clusters, config.nprobe),
            (768, Some(256), 32)
        );
    }

    #[test]
    fn test_dense_vector_with_num_clusters_and_nprobe() {
        let source = r#"
            index documents {
                field embedding: dense_vector<1536> [indexed<ivf_rabitq, num_clusters: 512, nprobe: 64>]
            }
        "#;

        let index = first_index(source);
        let config = index.fields[0].dense_vector_config.as_ref().unwrap();

        assert_eq!(
            (config.dim, config.num_clusters, config.nprobe),
            (1536, Some(512), 64)
        );
    }

    #[test]
    fn test_dense_vector_keyword_syntax() {
        let source = r#"
            index documents {
                field embedding: dense_vector<dims: 1536> [indexed, stored]
            }
        "#;

        let index = first_index(source);
        let config = index.fields[0].dense_vector_config.as_ref().unwrap();

        assert_eq!(config.dim, 1536);
        assert!(config.num_clusters.is_none());
    }

    #[test]
    fn test_dense_vector_keyword_syntax_full() {
        let source = r#"
            index documents {
                field embedding: dense_vector<dims: 1536> [indexed<ivf_rabitq, num_clusters: 256, nprobe: 64>]
            }
        "#;

        let index = first_index(source);
        let config = index.fields[0].dense_vector_config.as_ref().unwrap();

        assert_eq!(
            (config.dim, config.num_clusters, config.nprobe),
            (1536, Some(256), 64)
        );
    }

    #[test]
    fn test_dense_vector_keyword_syntax_partial() {
        let source = r#"
            index documents {
                field embedding: dense_vector<dims: 768> [indexed<ivf_rabitq, num_clusters: 128>]
            }
        "#;

        let index = first_index(source);
        let config = index.fields[0].dense_vector_config.as_ref().unwrap();

        // nprobe is not given in the SDL, so 32 is the default.
        assert_eq!(
            (config.dim, config.num_clusters, config.nprobe),
            (768, Some(128), 32)
        );
    }

    #[test]
    fn test_dense_vector_scann_index() {
        let source = r#"
            index documents {
                field embedding: dense_vector<dims: 768> [indexed<scann, num_clusters: 256, nprobe: 64>]
            }
        "#;

        let index = first_index(source);
        let config = index.fields[0].dense_vector_config.as_ref().unwrap();

        assert_eq!(config.index_type, VectorIndexType::ScaNN);
        assert_eq!(
            (config.dim, config.num_clusters, config.nprobe),
            (768, Some(256), 64)
        );
    }

    #[test]
    fn test_dense_vector_ivf_rabitq_index() {
        let source = r#"
            index documents {
                field embedding: dense_vector<dims: 1536> [indexed<ivf_rabitq, num_clusters: 512>]
            }
        "#;

        let index = first_index(source);
        let config = index.fields[0].dense_vector_config.as_ref().unwrap();

        assert_eq!(config.index_type, VectorIndexType::IvfRaBitQ);
        assert_eq!((config.dim, config.num_clusters), (1536, Some(512)));
    }

    #[test]
    fn test_dense_vector_rabitq_no_clusters() {
        let source = r#"
            index documents {
                field embedding: dense_vector<dims: 768> [indexed<rabitq>]
            }
        "#;

        let index = first_index(source);
        let config = index.fields[0].dense_vector_config.as_ref().unwrap();

        assert_eq!(config.dim, 768);
        assert_eq!(config.index_type, VectorIndexType::RaBitQ);
        assert!(config.num_clusters.is_none());
    }

    #[test]
    fn test_dense_vector_flat_index() {
        let source = r#"
            index documents {
                field embedding: dense_vector<dims: 768> [indexed<flat>]
            }
        "#;

        let index = first_index(source);
        let config = index.fields[0].dense_vector_config.as_ref().unwrap();

        assert_eq!(config.dim, 768);
        assert_eq!(config.index_type, VectorIndexType::Flat);
    }

    #[test]
    fn test_dense_vector_default_index_type() {
        // A bare `indexed` with no index type must fall back to RaBitQ.
        let source = r#"
            index documents {
                field embedding: dense_vector<dims: 768> [indexed]
            }
        "#;

        let index = first_index(source);
        let config = index.fields[0].dense_vector_config.as_ref().unwrap();

        assert_eq!(config.dim, 768);
        assert_eq!(config.index_type, VectorIndexType::RaBitQ);
    }

    #[test]
    fn test_dense_vector_f16_quantization() {
        let source = r#"
            index documents {
                field embedding: dense_vector<768, f16> [indexed]
            }
        "#;

        let index = first_index(source);
        let config = index.fields[0].dense_vector_config.as_ref().unwrap();

        assert_eq!(config.dim, 768);
        assert_eq!(config.quantization, DenseVectorQuantization::F16);
        assert_eq!(config.index_type, VectorIndexType::RaBitQ);
    }

    #[test]
    fn test_dense_vector_uint8_quantization() {
        let source = r#"
            index documents {
                field embedding: dense_vector<1024, uint8> [indexed<ivf_rabitq>]
            }
        "#;

        let index = first_index(source);
        let config = index.fields[0].dense_vector_config.as_ref().unwrap();

        assert_eq!(config.dim, 1024);
        assert_eq!(config.quantization, DenseVectorQuantization::UInt8);
    }

    #[test]
    fn test_dense_vector_u8_alias() {
        let source = r#"
            index documents {
                field embedding: dense_vector<512, u8> [indexed]
            }
        "#;

        let index = first_index(source);
        let config = index.fields[0].dense_vector_config.as_ref().unwrap();

        // `u8` is accepted as an alias for `uint8`.
        assert_eq!(config.dim, 512);
        assert_eq!(config.quantization, DenseVectorQuantization::UInt8);
    }

    #[test]
    fn test_dense_vector_default_f32_quantization() {
        // Without an explicit quantization type the default is f32.
        let source = r#"
            index documents {
                field embedding: dense_vector<768> [indexed]
            }
        "#;

        let index = first_index(source);
        let config = index.fields[0].dense_vector_config.as_ref().unwrap();

        assert_eq!(config.dim, 768);
        assert_eq!(config.quantization, DenseVectorQuantization::F32);
    }

    #[test]
    fn test_dense_vector_keyword_with_quantization() {
        let source = r#"
            index documents {
                field embedding: dense_vector<dims: 768, f16> [indexed]
            }
        "#;

        let index = first_index(source);
        let config = index.fields[0].dense_vector_config.as_ref().unwrap();

        assert_eq!(config.dim, 768);
        assert_eq!(config.quantization, DenseVectorQuantization::F16);
    }

    #[test]
    fn test_json_field_type() {
        let source = r#"
            index documents {
                field title: text [indexed, stored]
                field metadata: json [stored]
                field extra: json
            }
        "#;

        let index = first_index(source);
        assert_eq!(index.fields.len(), 3);

        // JSON field with an explicit attribute list.
        let metadata = &index.fields[1];
        assert_eq!(metadata.name, "metadata");
        assert!(matches!(metadata.field_type, FieldType::Json));
        assert!(metadata.stored);

        // JSON field relying on default attributes.
        let extra = &index.fields[2];
        assert_eq!(extra.name, "extra");
        assert!(matches!(extra.field_type, FieldType::Json));

        // After schema conversion the JSON field is stored but never indexed.
        let schema = index.to_schema();
        let handle = schema.get_field("metadata").unwrap();
        let entry = schema.get_field_entry(handle).unwrap();
        assert_eq!(entry.field_type, FieldType::Json);
        assert!(!entry.indexed);
        assert!(entry.stored);
    }

    #[test]
    fn test_sparse_vector_query_config() {
        let source = r#"
            index documents {
                field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<tokenizer: "Alibaba-NLP/gte-Qwen2-1.5B-instruct", weighting: idf>>]
            }
        "#;

        let index = first_index(source);
        assert_eq!(index.fields.len(), 1);

        let embedding = &index.fields[0];
        assert_eq!(embedding.name, "embedding");
        assert!(matches!(embedding.field_type, FieldType::SparseVector));

        let config = embedding.sparse_vector_config.as_ref().unwrap();
        assert_eq!(config.index_size, IndexSize::U16);
        assert_eq!(config.weight_quantization, WeightQuantization::UInt8);

        // Query-time settings as parsed from the nested `query<...>` block.
        let parsed_query = config.query_config.as_ref().unwrap();
        assert_eq!(
            parsed_query.tokenizer.as_deref(),
            Some("Alibaba-NLP/gte-Qwen2-1.5B-instruct")
        );
        assert_eq!(parsed_query.weighting, QueryWeighting::Idf);

        // The same settings must be present after schema conversion.
        let schema = index.to_schema();
        let handle = schema.get_field("embedding").unwrap();
        let entry = schema.get_field_entry(handle).unwrap();
        let schema_query = entry
            .sparse_vector_config
            .as_ref()
            .unwrap()
            .query_config
            .as_ref()
            .unwrap();
        assert_eq!(
            schema_query.tokenizer.as_deref(),
            Some("Alibaba-NLP/gte-Qwen2-1.5B-instruct")
        );
        assert_eq!(schema_query.weighting, QueryWeighting::Idf);
    }

    #[test]
    fn test_sparse_vector_query_config_weighting_one() {
        let source = r#"
            index documents {
                field embedding: sparse_vector [indexed<query<weighting: one>>]
            }
        "#;

        let index = first_index(source);
        let config = index.fields[0].sparse_vector_config.as_ref().unwrap();

        let parsed_query = config.query_config.as_ref().unwrap();
        assert!(parsed_query.tokenizer.is_none());
        assert_eq!(parsed_query.weighting, QueryWeighting::One);
    }

    #[test]
    fn test_sparse_vector_query_config_weighting_idf_file() {
        let source = r#"
            index documents {
                field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<tokenizer: "opensearch-neural-sparse-encoding-v1", weighting: idf_file>>]
            }
        "#;

        let index = first_index(source);
        let config = index.fields[0].sparse_vector_config.as_ref().unwrap();

        let parsed_query = config.query_config.as_ref().unwrap();
        assert_eq!(
            parsed_query.tokenizer.as_deref(),
            Some("opensearch-neural-sparse-encoding-v1")
        );
        assert_eq!(parsed_query.weighting, QueryWeighting::IdfFile);

        // `idf_file` weighting must be present after schema conversion.
        let schema = index.to_schema();
        let handle = schema.get_field("embedding").unwrap();
        let entry = schema.get_field_entry(handle).unwrap();
        let schema_query = entry
            .sparse_vector_config
            .as_ref()
            .unwrap()
            .query_config
            .as_ref()
            .unwrap();
        assert_eq!(schema_query.weighting, QueryWeighting::IdfFile);
    }
}