Skip to main content

hermes_core/dsl/sdl/
mod.rs

1//! Schema Definition Language (SDL) for Hermes
2//!
3//! A simple, readable format for defining index schemas using pest parser.
4//!
5//! # Example SDL
6//!
7//! ```text
8//! # Article index schema
9//! index articles {
10//!     # Primary text field for full-text search
11//!     field title: text [indexed, stored]
12//!
13//!     # Body content - indexed but not stored (save space)
14//!     field body: text [indexed]
15//!
16//!     # Author name
17//!     field author: text [indexed, stored]
18//!
19//!     # Publication timestamp
20//!     field published_at: i64 [indexed, stored]
21//!
22//!     # View count
23//!     field views: u64 [indexed, stored]
24//!
25//!     # Rating score
26//!     field rating: f64 [indexed, stored]
27//!
28//!     # Raw content hash (not indexed, just stored)
29//!     field content_hash: bytes [stored]
30//!
31//!     # Dense vector with IVF-RaBitQ index
32//!     field embedding: dense_vector<768> [indexed<rabitq, centroids: "centroids.bin", nprobe: 32>]
33//!
34//! }
35//! ```
36//!
37//! # Dense Vector Index Configuration
38//!
39//! Index-related parameters for dense vectors are specified in `indexed<...>`:
40//! - `flat`, `rabitq`, `ivf_rabitq` or `scann` - index type
41//! - `centroids: "path"` - path to pre-trained centroids file
42//! - `codebook: "path"` - path to PQ codebook (ScaNN only)
43//! - `nprobe: N` - number of clusters to probe (default: 32)
44
45use pest::Parser;
46use pest_derive::Parser;
47
48use super::query_field_router::{QueryRouterRule, RoutingMode};
49use super::schema::{DenseVectorQuantization, FieldType, Schema, SchemaBuilder};
50use crate::Result;
51use crate::error::Error;
52
53#[derive(Parser)]
54#[grammar = "dsl/sdl/sdl.pest"]
55pub struct SdlParser;
56
57use super::schema::DenseVectorConfig;
58use crate::structures::{
59    IndexSize, QueryWeighting, SparseQueryConfig, SparseVectorConfig, WeightQuantization,
60};
61
62/// Parsed field definition
63#[derive(Debug, Clone)]
64pub struct FieldDef {
65    pub name: String,
66    pub field_type: FieldType,
67    pub indexed: bool,
68    pub stored: bool,
69    /// Tokenizer name for text fields (e.g., "simple", "en_stem", "german")
70    pub tokenizer: Option<String>,
71    /// Whether this field can have multiple values (serialized as array in JSON)
72    pub multi: bool,
73    /// Position tracking mode for phrase queries and multi-field element tracking
74    pub positions: Option<super::schema::PositionMode>,
75    /// Configuration for sparse vector fields
76    pub sparse_vector_config: Option<SparseVectorConfig>,
77    /// Configuration for dense vector fields
78    pub dense_vector_config: Option<DenseVectorConfig>,
79    /// Whether this field has columnar fast-field storage
80    pub fast: bool,
81    /// Whether this field is a primary key (unique constraint)
82    pub primary: bool,
83}
84
85/// Parsed index definition
86#[derive(Debug, Clone)]
87pub struct IndexDef {
88    pub name: String,
89    pub fields: Vec<FieldDef>,
90    pub default_fields: Vec<String>,
91    /// Query router rules for routing queries to specific fields
92    pub query_routers: Vec<QueryRouterRule>,
93}
94
95impl IndexDef {
96    /// Convert to a Schema
97    pub fn to_schema(&self) -> Schema {
98        let mut builder = SchemaBuilder::default();
99
100        for field in &self.fields {
101            let f = match field.field_type {
102                FieldType::Text => {
103                    let tokenizer = field.tokenizer.as_deref().unwrap_or("simple");
104                    builder.add_text_field_with_tokenizer(
105                        &field.name,
106                        field.indexed,
107                        field.stored,
108                        tokenizer,
109                    )
110                }
111                FieldType::U64 => builder.add_u64_field(&field.name, field.indexed, field.stored),
112                FieldType::I64 => builder.add_i64_field(&field.name, field.indexed, field.stored),
113                FieldType::F64 => builder.add_f64_field(&field.name, field.indexed, field.stored),
114                FieldType::Bytes => builder.add_bytes_field(&field.name, field.stored),
115                FieldType::Json => builder.add_json_field(&field.name, field.stored),
116                FieldType::SparseVector => {
117                    if let Some(config) = &field.sparse_vector_config {
118                        builder.add_sparse_vector_field_with_config(
119                            &field.name,
120                            field.indexed,
121                            field.stored,
122                            config.clone(),
123                        )
124                    } else {
125                        builder.add_sparse_vector_field(&field.name, field.indexed, field.stored)
126                    }
127                }
128                FieldType::DenseVector => {
129                    // Dense vector dimension must be specified via config
130                    let config = field
131                        .dense_vector_config
132                        .as_ref()
133                        .expect("DenseVector field requires dimension to be specified");
134                    builder.add_dense_vector_field_with_config(
135                        &field.name,
136                        field.indexed,
137                        field.stored,
138                        config.clone(),
139                    )
140                }
141            };
142            if field.multi {
143                builder.set_multi(f, true);
144            }
145            if field.fast {
146                builder.set_fast(f, true);
147            }
148            if field.primary {
149                builder.set_primary_key(f);
150            }
151            // Set positions: explicit > auto (ordinal for multi vectors)
152            let positions = field.positions.or({
153                // Auto-set ordinal positions for multi-valued vector fields
154                if field.multi
155                    && matches!(
156                        field.field_type,
157                        FieldType::SparseVector | FieldType::DenseVector
158                    )
159                {
160                    Some(super::schema::PositionMode::Ordinal)
161                } else {
162                    None
163                }
164            });
165            if let Some(mode) = positions {
166                builder.set_positions(f, mode);
167            }
168        }
169
170        // Set default fields if specified
171        if !self.default_fields.is_empty() {
172            builder.set_default_fields(self.default_fields.clone());
173        }
174
175        // Set query routers if specified
176        if !self.query_routers.is_empty() {
177            builder.set_query_routers(self.query_routers.clone());
178        }
179
180        builder.build()
181    }
182
183    /// Create a QueryFieldRouter from the query router rules
184    ///
185    /// Returns None if there are no query router rules defined.
186    /// Returns Err if any regex pattern is invalid.
187    pub fn to_query_router(&self) -> Result<Option<super::query_field_router::QueryFieldRouter>> {
188        if self.query_routers.is_empty() {
189            return Ok(None);
190        }
191
192        super::query_field_router::QueryFieldRouter::from_rules(&self.query_routers)
193            .map(Some)
194            .map_err(Error::Schema)
195    }
196}
197
198/// Parse field type from string
199fn parse_field_type(type_str: &str) -> Result<FieldType> {
200    match type_str {
201        "text" | "string" | "str" => Ok(FieldType::Text),
202        "u64" | "uint" | "unsigned" => Ok(FieldType::U64),
203        "i64" | "int" | "integer" => Ok(FieldType::I64),
204        "f64" | "float" | "double" => Ok(FieldType::F64),
205        "bytes" | "binary" | "blob" => Ok(FieldType::Bytes),
206        "json" => Ok(FieldType::Json),
207        "sparse_vector" => Ok(FieldType::SparseVector),
208        "dense_vector" | "vector" => Ok(FieldType::DenseVector),
209        _ => Err(Error::Schema(format!("Unknown field type: {}", type_str))),
210    }
211}
212
213/// Index configuration parsed from indexed<...> attribute
214#[derive(Debug, Clone, Default)]
215struct IndexConfig {
216    index_type: Option<super::schema::VectorIndexType>,
217    num_clusters: Option<usize>,
218    nprobe: Option<usize>,
219    build_threshold: Option<usize>,
220    // Sparse vector index params
221    quantization: Option<WeightQuantization>,
222    weight_threshold: Option<f32>,
223    block_size: Option<usize>,
224    pruning: Option<f32>,
225    // Sparse vector query-time config
226    query_tokenizer: Option<String>,
227    query_weighting: Option<QueryWeighting>,
228    query_weight_threshold: Option<f32>,
229    query_max_dims: Option<usize>,
230    query_pruning: Option<f32>,
231    // Position tracking mode for phrase queries
232    positions: Option<super::schema::PositionMode>,
233}
234
235/// Parse attributes from pest pair
236/// Returns (indexed, stored, multi, fast, primary, index_config)
237/// positions is now inside index_config (via indexed<positions> or indexed<ordinal> etc.)
238/// multi is now inside stored<multi>
239fn parse_attributes(
240    pair: pest::iterators::Pair<Rule>,
241) -> (bool, bool, bool, bool, bool, Option<IndexConfig>) {
242    let mut indexed = false;
243    let mut stored = false;
244    let mut multi = false;
245    let mut fast = false;
246    let mut primary = false;
247    let mut index_config = None;
248
249    for attr in pair.into_inner() {
250        if attr.as_rule() == Rule::attribute {
251            // attribute = { indexed_with_config | "indexed" | stored_with_config | "stored" | "fast" | "primary" }
252            let mut found_config = false;
253            for inner in attr.clone().into_inner() {
254                match inner.as_rule() {
255                    Rule::indexed_with_config => {
256                        indexed = true;
257                        index_config = Some(parse_index_config(inner));
258                        found_config = true;
259                        break;
260                    }
261                    Rule::stored_with_config => {
262                        stored = true;
263                        multi = true; // stored<multi>
264                        found_config = true;
265                        break;
266                    }
267                    _ => {}
268                }
269            }
270            if !found_config {
271                // Simple attribute
272                match attr.as_str() {
273                    "indexed" => indexed = true,
274                    "stored" => stored = true,
275                    "fast" => fast = true,
276                    "primary" => primary = true,
277                    _ => {}
278                }
279            }
280        }
281    }
282
283    (indexed, stored, multi, fast, primary, index_config)
284}
285
286/// Parse index configuration from indexed<...> attribute
287fn parse_index_config(pair: pest::iterators::Pair<Rule>) -> IndexConfig {
288    let mut config = IndexConfig::default();
289
290    // indexed_with_config = { "indexed" ~ "<" ~ index_config_params ~ ">" }
291    // index_config_params = { index_config_param ~ ("," ~ index_config_param)* }
292    // index_config_param = { index_type_kwarg | centroids_kwarg | codebook_kwarg | nprobe_kwarg | index_type_spec }
293
294    for inner in pair.into_inner() {
295        if inner.as_rule() == Rule::index_config_params {
296            for param in inner.into_inner() {
297                if param.as_rule() == Rule::index_config_param {
298                    for p in param.into_inner() {
299                        parse_single_index_config_param(&mut config, p);
300                    }
301                }
302            }
303        }
304    }
305
306    config
307}
308
309/// Parse a single index config parameter
310fn parse_single_index_config_param(config: &mut IndexConfig, p: pest::iterators::Pair<Rule>) {
311    use super::schema::VectorIndexType;
312
313    match p.as_rule() {
314        Rule::index_type_spec => {
315            config.index_type = Some(match p.as_str() {
316                "flat" => VectorIndexType::Flat,
317                "rabitq" => VectorIndexType::RaBitQ,
318                "ivf_rabitq" => VectorIndexType::IvfRaBitQ,
319                "scann" => VectorIndexType::ScaNN,
320                _ => VectorIndexType::RaBitQ,
321            });
322        }
323        Rule::index_type_kwarg => {
324            // index_type_kwarg = { "index" ~ ":" ~ index_type_spec }
325            if let Some(t) = p.into_inner().next() {
326                config.index_type = Some(match t.as_str() {
327                    "flat" => VectorIndexType::Flat,
328                    "rabitq" => VectorIndexType::RaBitQ,
329                    "ivf_rabitq" => VectorIndexType::IvfRaBitQ,
330                    "scann" => VectorIndexType::ScaNN,
331                    _ => VectorIndexType::RaBitQ,
332                });
333            }
334        }
335        Rule::num_clusters_kwarg => {
336            // num_clusters_kwarg = { "num_clusters" ~ ":" ~ num_clusters_spec }
337            if let Some(n) = p.into_inner().next() {
338                config.num_clusters = Some(n.as_str().parse().unwrap_or(256));
339            }
340        }
341        Rule::build_threshold_kwarg => {
342            // build_threshold_kwarg = { "build_threshold" ~ ":" ~ build_threshold_spec }
343            if let Some(n) = p.into_inner().next() {
344                config.build_threshold = Some(n.as_str().parse().unwrap_or(10000));
345            }
346        }
347        Rule::nprobe_kwarg => {
348            // nprobe_kwarg = { "nprobe" ~ ":" ~ nprobe_spec }
349            if let Some(n) = p.into_inner().next() {
350                config.nprobe = Some(n.as_str().parse().unwrap_or(32));
351            }
352        }
353        Rule::quantization_kwarg => {
354            // quantization_kwarg = { "quantization" ~ ":" ~ quantization_spec }
355            if let Some(q) = p.into_inner().next() {
356                config.quantization = Some(match q.as_str() {
357                    "float32" | "f32" => WeightQuantization::Float32,
358                    "float16" | "f16" => WeightQuantization::Float16,
359                    "uint8" | "u8" => WeightQuantization::UInt8,
360                    "uint4" | "u4" => WeightQuantization::UInt4,
361                    _ => WeightQuantization::default(),
362                });
363            }
364        }
365        Rule::weight_threshold_kwarg => {
366            // weight_threshold_kwarg = { "weight_threshold" ~ ":" ~ weight_threshold_spec }
367            if let Some(t) = p.into_inner().next() {
368                config.weight_threshold = Some(t.as_str().parse().unwrap_or(0.0));
369            }
370        }
371        Rule::block_size_kwarg => {
372            // block_size_kwarg = { "block_size" ~ ":" ~ block_size_spec }
373            if let Some(n) = p.into_inner().next() {
374                config.block_size = Some(n.as_str().parse().unwrap_or(128));
375            }
376        }
377        Rule::pruning_kwarg => {
378            // pruning_kwarg = { "pruning" ~ ":" ~ pruning_spec }
379            if let Some(f) = p.into_inner().next() {
380                config.pruning = Some(f.as_str().parse().unwrap_or(1.0));
381            }
382        }
383        Rule::query_config_block => {
384            // query_config_block = { "query" ~ "<" ~ query_config_params ~ ">" }
385            parse_query_config_block(config, p);
386        }
387        Rule::positions_kwarg => {
388            // positions_kwarg = { "positions" | "ordinal" | "token_position" }
389            use super::schema::PositionMode;
390            config.positions = Some(match p.as_str() {
391                "ordinal" => PositionMode::Ordinal,
392                "token_position" => PositionMode::TokenPosition,
393                _ => PositionMode::Full, // "positions" or any other value defaults to Full
394            });
395        }
396        _ => {}
397    }
398}
399
400/// Parse query configuration block: query<tokenizer: "...", weighting: idf>
401fn parse_query_config_block(config: &mut IndexConfig, pair: pest::iterators::Pair<Rule>) {
402    for inner in pair.into_inner() {
403        if inner.as_rule() == Rule::query_config_params {
404            for param in inner.into_inner() {
405                if param.as_rule() == Rule::query_config_param {
406                    for p in param.into_inner() {
407                        match p.as_rule() {
408                            Rule::query_tokenizer_kwarg => {
409                                // query_tokenizer_kwarg = { "tokenizer" ~ ":" ~ tokenizer_path }
410                                if let Some(path) = p.into_inner().next()
411                                    && let Some(inner_path) = path.into_inner().next()
412                                {
413                                    config.query_tokenizer = Some(inner_path.as_str().to_string());
414                                }
415                            }
416                            Rule::query_weighting_kwarg => {
417                                // query_weighting_kwarg = { "weighting" ~ ":" ~ weighting_spec }
418                                if let Some(w) = p.into_inner().next() {
419                                    config.query_weighting = Some(match w.as_str() {
420                                        "one" => QueryWeighting::One,
421                                        "idf" => QueryWeighting::Idf,
422                                        "idf_file" => QueryWeighting::IdfFile,
423                                        _ => QueryWeighting::One,
424                                    });
425                                }
426                            }
427                            Rule::query_weight_threshold_kwarg => {
428                                if let Some(t) = p.into_inner().next() {
429                                    config.query_weight_threshold =
430                                        Some(t.as_str().parse().unwrap_or(0.0));
431                                }
432                            }
433                            Rule::query_max_dims_kwarg => {
434                                if let Some(t) = p.into_inner().next() {
435                                    config.query_max_dims = Some(t.as_str().parse().unwrap_or(0));
436                                }
437                            }
438                            Rule::query_pruning_kwarg => {
439                                if let Some(t) = p.into_inner().next() {
440                                    config.query_pruning = Some(t.as_str().parse().unwrap_or(1.0));
441                                }
442                            }
443                            _ => {}
444                        }
445                    }
446                }
447            }
448        }
449    }
450}
451
452/// Parse a field definition from pest pair
453fn parse_field_def(pair: pest::iterators::Pair<Rule>) -> Result<FieldDef> {
454    let mut inner = pair.into_inner();
455
456    let name = inner
457        .next()
458        .ok_or_else(|| Error::Schema("Missing field name".to_string()))?
459        .as_str()
460        .to_string();
461
462    let field_type_str = inner
463        .next()
464        .ok_or_else(|| Error::Schema("Missing field type".to_string()))?
465        .as_str();
466
467    let field_type = parse_field_type(field_type_str)?;
468
469    // Parse optional tokenizer spec, sparse_vector_config, dense_vector_config, and attributes
470    let mut tokenizer = None;
471    let mut sparse_vector_config = None;
472    let mut dense_vector_config = None;
473    let mut indexed = true;
474    let mut stored = true;
475    let mut multi = false;
476    let mut fast = false;
477    let mut primary = false;
478    let mut index_config: Option<IndexConfig> = None;
479
480    for item in inner {
481        match item.as_rule() {
482            Rule::tokenizer_spec => {
483                // Extract tokenizer name from <name>
484                if let Some(tok_name) = item.into_inner().next() {
485                    tokenizer = Some(tok_name.as_str().to_string());
486                }
487            }
488            Rule::sparse_vector_config => {
489                // Parse named parameters: <index_size: u16, quantization: uint8, weight_threshold: 0.1>
490                sparse_vector_config = Some(parse_sparse_vector_config(item));
491            }
492            Rule::dense_vector_config => {
493                // Parse dense_vector_params (keyword or positional) - only dims
494                dense_vector_config = Some(parse_dense_vector_config(item));
495            }
496            Rule::attributes => {
497                let (idx, sto, mul, fst, pri, idx_cfg) = parse_attributes(item);
498                indexed = idx;
499                stored = sto;
500                multi = mul;
501                fast = fst;
502                primary = pri;
503                index_config = idx_cfg;
504            }
505            _ => {}
506        }
507    }
508
509    // Primary key implies fast + indexed (needed for dedup lookups)
510    if primary {
511        fast = true;
512        indexed = true;
513    }
514
515    // Merge index config into vector configs if both exist
516    let mut positions = None;
517    if let Some(idx_cfg) = index_config {
518        positions = idx_cfg.positions;
519        if let Some(ref mut dv_config) = dense_vector_config {
520            apply_index_config_to_dense_vector(dv_config, idx_cfg);
521        } else if field_type == FieldType::SparseVector {
522            // For sparse vectors, create default config if not present and apply index params
523            let sv_config = sparse_vector_config.get_or_insert(SparseVectorConfig::default());
524            apply_index_config_to_sparse_vector(sv_config, idx_cfg);
525        }
526    }
527
528    Ok(FieldDef {
529        name,
530        field_type,
531        indexed,
532        stored,
533        tokenizer,
534        multi,
535        positions,
536        sparse_vector_config,
537        dense_vector_config,
538        fast,
539        primary,
540    })
541}
542
543/// Apply index configuration from indexed<...> to DenseVectorConfig
544fn apply_index_config_to_dense_vector(config: &mut DenseVectorConfig, idx_cfg: IndexConfig) {
545    // Apply index type if specified
546    if let Some(index_type) = idx_cfg.index_type {
547        config.index_type = index_type;
548    }
549
550    // Apply num_clusters for IVF-based indexes
551    if idx_cfg.num_clusters.is_some() {
552        config.num_clusters = idx_cfg.num_clusters;
553    }
554
555    // Apply nprobe if specified
556    if let Some(nprobe) = idx_cfg.nprobe {
557        config.nprobe = nprobe;
558    }
559
560    // Apply build_threshold if specified
561    if idx_cfg.build_threshold.is_some() {
562        config.build_threshold = idx_cfg.build_threshold;
563    }
564}
565
566/// Parse sparse_vector_config - only index_size (positional)
567/// Example: <u16> or <u32>
568fn parse_sparse_vector_config(pair: pest::iterators::Pair<Rule>) -> SparseVectorConfig {
569    let mut index_size = IndexSize::default();
570
571    // Parse positional index_size_spec
572    for inner in pair.into_inner() {
573        if inner.as_rule() == Rule::index_size_spec {
574            index_size = match inner.as_str() {
575                "u16" => IndexSize::U16,
576                "u32" => IndexSize::U32,
577                _ => IndexSize::default(),
578            };
579        }
580    }
581
582    SparseVectorConfig {
583        index_size,
584        weight_quantization: WeightQuantization::default(),
585        weight_threshold: 0.0,
586        block_size: 128,
587        pruning: None,
588        query_config: None,
589    }
590}
591
592/// Apply index configuration from indexed<...> to SparseVectorConfig
593fn apply_index_config_to_sparse_vector(config: &mut SparseVectorConfig, idx_cfg: IndexConfig) {
594    if let Some(q) = idx_cfg.quantization {
595        config.weight_quantization = q;
596    }
597    if let Some(t) = idx_cfg.weight_threshold {
598        config.weight_threshold = t;
599    }
600    if let Some(bs) = idx_cfg.block_size {
601        let adjusted = bs.next_power_of_two();
602        if adjusted != bs {
603            log::warn!(
604                "block_size {} adjusted to next power of two: {}",
605                bs,
606                adjusted
607            );
608        }
609        config.block_size = adjusted;
610    }
611    if let Some(p) = idx_cfg.pruning {
612        let clamped = p.clamp(0.0, 1.0);
613        if (clamped - p).abs() > f32::EPSILON {
614            log::warn!(
615                "pruning {} clamped to valid range [0.0, 1.0]: {}",
616                p,
617                clamped
618            );
619        }
620        config.pruning = Some(clamped);
621    }
622    // Apply query-time configuration if present
623    if idx_cfg.query_tokenizer.is_some()
624        || idx_cfg.query_weighting.is_some()
625        || idx_cfg.query_weight_threshold.is_some()
626        || idx_cfg.query_max_dims.is_some()
627        || idx_cfg.query_pruning.is_some()
628    {
629        let query_config = config
630            .query_config
631            .get_or_insert(SparseQueryConfig::default());
632        if let Some(tokenizer) = idx_cfg.query_tokenizer {
633            query_config.tokenizer = Some(tokenizer);
634        }
635        if let Some(weighting) = idx_cfg.query_weighting {
636            query_config.weighting = weighting;
637        }
638        if let Some(t) = idx_cfg.query_weight_threshold {
639            query_config.weight_threshold = t;
640        }
641        if let Some(d) = idx_cfg.query_max_dims {
642            query_config.max_query_dims = Some(d);
643        }
644        if let Some(p) = idx_cfg.query_pruning {
645            query_config.pruning = Some(p);
646        }
647    }
648}
649
650/// Parse dense_vector_config - dims and optional quantization type
651/// All index-related params are in indexed<...> attribute
652fn parse_dense_vector_config(pair: pest::iterators::Pair<Rule>) -> DenseVectorConfig {
653    let mut dim: usize = 0;
654    let mut quantization = DenseVectorQuantization::F32;
655
656    // Navigate to dense_vector_params
657    for params in pair.into_inner() {
658        if params.as_rule() == Rule::dense_vector_params {
659            for inner in params.into_inner() {
660                match inner.as_rule() {
661                    Rule::dense_vector_keyword_params => {
662                        for kwarg in inner.into_inner() {
663                            match kwarg.as_rule() {
664                                Rule::dims_kwarg => {
665                                    if let Some(d) = kwarg.into_inner().next() {
666                                        dim = d.as_str().parse().unwrap_or(0);
667                                    }
668                                }
669                                Rule::quant_type_spec => {
670                                    quantization = parse_quant_type(kwarg.as_str());
671                                }
672                                _ => {}
673                            }
674                        }
675                    }
676                    Rule::dense_vector_positional_params => {
677                        for item in inner.into_inner() {
678                            match item.as_rule() {
679                                Rule::dimension_spec => {
680                                    dim = item.as_str().parse().unwrap_or(0);
681                                }
682                                Rule::quant_type_spec => {
683                                    quantization = parse_quant_type(item.as_str());
684                                }
685                                _ => {}
686                            }
687                        }
688                    }
689                    _ => {}
690                }
691            }
692        }
693    }
694
695    DenseVectorConfig::new(dim).with_quantization(quantization)
696}
697
698fn parse_quant_type(s: &str) -> DenseVectorQuantization {
699    match s.trim() {
700        "f16" => DenseVectorQuantization::F16,
701        "uint8" | "u8" => DenseVectorQuantization::UInt8,
702        _ => DenseVectorQuantization::F32,
703    }
704}
705
706/// Parse default_fields definition
707fn parse_default_fields_def(pair: pest::iterators::Pair<Rule>) -> Vec<String> {
708    pair.into_inner().map(|p| p.as_str().to_string()).collect()
709}
710
711/// Parse a query router definition
712fn parse_query_router_def(pair: pest::iterators::Pair<Rule>) -> Result<QueryRouterRule> {
713    let mut pattern = String::new();
714    let mut substitution = String::new();
715    let mut target_field = String::new();
716    let mut mode = RoutingMode::Additional;
717
718    for prop in pair.into_inner() {
719        if prop.as_rule() != Rule::query_router_prop {
720            continue;
721        }
722
723        for inner in prop.into_inner() {
724            match inner.as_rule() {
725                Rule::query_router_pattern => {
726                    if let Some(regex_str) = inner.into_inner().next() {
727                        pattern = parse_string_value(regex_str);
728                    }
729                }
730                Rule::query_router_substitution => {
731                    if let Some(quoted) = inner.into_inner().next() {
732                        substitution = parse_string_value(quoted);
733                    }
734                }
735                Rule::query_router_target => {
736                    if let Some(ident) = inner.into_inner().next() {
737                        target_field = ident.as_str().to_string();
738                    }
739                }
740                Rule::query_router_mode => {
741                    if let Some(mode_val) = inner.into_inner().next() {
742                        mode = match mode_val.as_str() {
743                            "exclusive" => RoutingMode::Exclusive,
744                            "additional" => RoutingMode::Additional,
745                            _ => RoutingMode::Additional,
746                        };
747                    }
748                }
749                _ => {}
750            }
751        }
752    }
753
754    if pattern.is_empty() {
755        return Err(Error::Schema("query_router missing 'pattern'".to_string()));
756    }
757    if substitution.is_empty() {
758        return Err(Error::Schema(
759            "query_router missing 'substitution'".to_string(),
760        ));
761    }
762    if target_field.is_empty() {
763        return Err(Error::Schema(
764            "query_router missing 'target_field'".to_string(),
765        ));
766    }
767
768    Ok(QueryRouterRule {
769        pattern,
770        substitution,
771        target_field,
772        mode,
773    })
774}
775
776/// Parse a string value from quoted_string, raw_string, or regex_string
777fn parse_string_value(pair: pest::iterators::Pair<Rule>) -> String {
778    let s = pair.as_str();
779    match pair.as_rule() {
780        Rule::regex_string => {
781            // regex_string contains either raw_string or quoted_string
782            if let Some(inner) = pair.into_inner().next() {
783                parse_string_value(inner)
784            } else {
785                s.to_string()
786            }
787        }
788        Rule::raw_string => {
789            // r"..." - strip r" prefix and " suffix
790            s[2..s.len() - 1].to_string()
791        }
792        Rule::quoted_string => {
793            // "..." - strip quotes and handle escapes
794            let inner = &s[1..s.len() - 1];
795            // Simple escape handling
796            inner
797                .replace("\\n", "\n")
798                .replace("\\t", "\t")
799                .replace("\\\"", "\"")
800                .replace("\\\\", "\\")
801        }
802        _ => s.to_string(),
803    }
804}
805
806/// Parse an index definition from pest pair
807fn parse_index_def(pair: pest::iterators::Pair<Rule>) -> Result<IndexDef> {
808    let mut inner = pair.into_inner();
809
810    let name = inner
811        .next()
812        .ok_or_else(|| Error::Schema("Missing index name".to_string()))?
813        .as_str()
814        .to_string();
815
816    let mut fields = Vec::new();
817    let mut default_fields = Vec::new();
818    let mut query_routers = Vec::new();
819
820    for item in inner {
821        match item.as_rule() {
822            Rule::field_def => {
823                fields.push(parse_field_def(item)?);
824            }
825            Rule::default_fields_def => {
826                default_fields = parse_default_fields_def(item);
827            }
828            Rule::query_router_def => {
829                query_routers.push(parse_query_router_def(item)?);
830            }
831            _ => {}
832        }
833    }
834
835    // Validate primary key constraints
836    let primary_fields: Vec<&FieldDef> = fields.iter().filter(|f| f.primary).collect();
837    if primary_fields.len() > 1 {
838        return Err(Error::Schema(format!(
839            "Index '{}' has {} primary key fields, but at most one is allowed",
840            name,
841            primary_fields.len()
842        )));
843    }
844    if let Some(pk) = primary_fields.first() {
845        if pk.field_type != FieldType::Text {
846            return Err(Error::Schema(format!(
847                "Primary key field '{}' must be of type text, got {:?}",
848                pk.name, pk.field_type
849            )));
850        }
851        if pk.multi {
852            return Err(Error::Schema(format!(
853                "Primary key field '{}' cannot be multi-valued",
854                pk.name
855            )));
856        }
857    }
858
859    Ok(IndexDef {
860        name,
861        fields,
862        default_fields,
863        query_routers,
864    })
865}
866
867/// Parse SDL from a string
868pub fn parse_sdl(input: &str) -> Result<Vec<IndexDef>> {
869    let pairs = SdlParser::parse(Rule::file, input)
870        .map_err(|e| Error::Schema(format!("Parse error: {}", e)))?;
871
872    let mut indexes = Vec::new();
873
874    for pair in pairs {
875        if pair.as_rule() == Rule::file {
876            for inner in pair.into_inner() {
877                if inner.as_rule() == Rule::index_def {
878                    indexes.push(parse_index_def(inner)?);
879                }
880            }
881        }
882    }
883
884    Ok(indexes)
885}
886
887/// Parse SDL and return a single index definition
888pub fn parse_single_index(input: &str) -> Result<IndexDef> {
889    let indexes = parse_sdl(input)?;
890
891    if indexes.is_empty() {
892        return Err(Error::Schema("No index definition found".to_string()));
893    }
894
895    if indexes.len() > 1 {
896        return Err(Error::Schema(
897            "Multiple index definitions found, expected one".to_string(),
898        ));
899    }
900
901    Ok(indexes.into_iter().next().unwrap())
902}
903
904#[cfg(test)]
905mod tests {
906    use super::*;
907
908    #[test]
909    fn test_parse_simple_schema() {
910        let sdl = r#"
911            index articles {
912                field title: text [indexed, stored]
913                field body: text [indexed]
914            }
915        "#;
916
917        let indexes = parse_sdl(sdl).unwrap();
918        assert_eq!(indexes.len(), 1);
919
920        let index = &indexes[0];
921        assert_eq!(index.name, "articles");
922        assert_eq!(index.fields.len(), 2);
923
924        assert_eq!(index.fields[0].name, "title");
925        assert!(matches!(index.fields[0].field_type, FieldType::Text));
926        assert!(index.fields[0].indexed);
927        assert!(index.fields[0].stored);
928
929        assert_eq!(index.fields[1].name, "body");
930        assert!(matches!(index.fields[1].field_type, FieldType::Text));
931        assert!(index.fields[1].indexed);
932        assert!(!index.fields[1].stored);
933    }
934
935    #[test]
936    fn test_parse_all_field_types() {
937        let sdl = r#"
938            index test {
939                field text_field: text [indexed, stored]
940                field u64_field: u64 [indexed, stored]
941                field i64_field: i64 [indexed, stored]
942                field f64_field: f64 [indexed, stored]
943                field bytes_field: bytes [stored]
944            }
945        "#;
946
947        let indexes = parse_sdl(sdl).unwrap();
948        let index = &indexes[0];
949
950        assert!(matches!(index.fields[0].field_type, FieldType::Text));
951        assert!(matches!(index.fields[1].field_type, FieldType::U64));
952        assert!(matches!(index.fields[2].field_type, FieldType::I64));
953        assert!(matches!(index.fields[3].field_type, FieldType::F64));
954        assert!(matches!(index.fields[4].field_type, FieldType::Bytes));
955    }
956
957    #[test]
958    fn test_parse_with_comments() {
959        let sdl = r#"
960            # This is a comment
961            index articles {
962                # Title field
963                field title: text [indexed, stored]
964                field body: text [indexed] # inline comment not supported yet
965            }
966        "#;
967
968        let indexes = parse_sdl(sdl).unwrap();
969        assert_eq!(indexes[0].fields.len(), 2);
970    }
971
972    #[test]
973    fn test_parse_type_aliases() {
974        let sdl = r#"
975            index test {
976                field a: string [indexed]
977                field b: int [indexed]
978                field c: uint [indexed]
979                field d: float [indexed]
980                field e: binary [stored]
981            }
982        "#;
983
984        let indexes = parse_sdl(sdl).unwrap();
985        let index = &indexes[0];
986
987        assert!(matches!(index.fields[0].field_type, FieldType::Text));
988        assert!(matches!(index.fields[1].field_type, FieldType::I64));
989        assert!(matches!(index.fields[2].field_type, FieldType::U64));
990        assert!(matches!(index.fields[3].field_type, FieldType::F64));
991        assert!(matches!(index.fields[4].field_type, FieldType::Bytes));
992    }
993
994    #[test]
995    fn test_to_schema() {
996        let sdl = r#"
997            index articles {
998                field title: text [indexed, stored]
999                field views: u64 [indexed, stored]
1000            }
1001        "#;
1002
1003        let indexes = parse_sdl(sdl).unwrap();
1004        let schema = indexes[0].to_schema();
1005
1006        assert!(schema.get_field("title").is_some());
1007        assert!(schema.get_field("views").is_some());
1008        assert!(schema.get_field("nonexistent").is_none());
1009    }
1010
1011    #[test]
1012    fn test_default_attributes() {
1013        let sdl = r#"
1014            index test {
1015                field title: text
1016            }
1017        "#;
1018
1019        let indexes = parse_sdl(sdl).unwrap();
1020        let field = &indexes[0].fields[0];
1021
1022        // Default should be indexed and stored
1023        assert!(field.indexed);
1024        assert!(field.stored);
1025    }
1026
1027    #[test]
1028    fn test_multiple_indexes() {
1029        let sdl = r#"
1030            index articles {
1031                field title: text [indexed, stored]
1032            }
1033
1034            index users {
1035                field name: text [indexed, stored]
1036                field email: text [indexed, stored]
1037            }
1038        "#;
1039
1040        let indexes = parse_sdl(sdl).unwrap();
1041        assert_eq!(indexes.len(), 2);
1042        assert_eq!(indexes[0].name, "articles");
1043        assert_eq!(indexes[1].name, "users");
1044    }
1045
1046    #[test]
1047    fn test_tokenizer_spec() {
1048        let sdl = r#"
1049            index articles {
1050                field title: text<en_stem> [indexed, stored]
1051                field body: text<simple> [indexed]
1052                field author: text [indexed, stored]
1053            }
1054        "#;
1055
1056        let indexes = parse_sdl(sdl).unwrap();
1057        let index = &indexes[0];
1058
1059        assert_eq!(index.fields[0].name, "title");
1060        assert_eq!(index.fields[0].tokenizer, Some("en_stem".to_string()));
1061
1062        assert_eq!(index.fields[1].name, "body");
1063        assert_eq!(index.fields[1].tokenizer, Some("simple".to_string()));
1064
1065        assert_eq!(index.fields[2].name, "author");
1066        assert_eq!(index.fields[2].tokenizer, None); // No tokenizer specified
1067    }
1068
1069    #[test]
1070    fn test_tokenizer_in_schema() {
1071        let sdl = r#"
1072            index articles {
1073                field title: text<german> [indexed, stored]
1074                field body: text<en_stem> [indexed]
1075            }
1076        "#;
1077
1078        let indexes = parse_sdl(sdl).unwrap();
1079        let schema = indexes[0].to_schema();
1080
1081        let title_field = schema.get_field("title").unwrap();
1082        let title_entry = schema.get_field_entry(title_field).unwrap();
1083        assert_eq!(title_entry.tokenizer, Some("german".to_string()));
1084
1085        let body_field = schema.get_field("body").unwrap();
1086        let body_entry = schema.get_field_entry(body_field).unwrap();
1087        assert_eq!(body_entry.tokenizer, Some("en_stem".to_string()));
1088    }
1089
1090    #[test]
1091    fn test_query_router_basic() {
1092        let sdl = r#"
1093            index documents {
1094                field title: text [indexed, stored]
1095                field uri: text [indexed, stored]
1096
1097                query_router {
1098                    pattern: "10\\.\\d{4,}/[^\\s]+"
1099                    substitution: "doi://{0}"
1100                    target_field: uris
1101                    mode: exclusive
1102                }
1103            }
1104        "#;
1105
1106        let indexes = parse_sdl(sdl).unwrap();
1107        let index = &indexes[0];
1108
1109        assert_eq!(index.query_routers.len(), 1);
1110        let router = &index.query_routers[0];
1111        assert_eq!(router.pattern, r"10\.\d{4,}/[^\s]+");
1112        assert_eq!(router.substitution, "doi://{0}");
1113        assert_eq!(router.target_field, "uris");
1114        assert_eq!(router.mode, RoutingMode::Exclusive);
1115    }
1116
1117    #[test]
1118    fn test_query_router_raw_string() {
1119        let sdl = r#"
1120            index documents {
1121                field uris: text [indexed, stored]
1122
1123                query_router {
1124                    pattern: r"^pmid:(\d+)$"
1125                    substitution: "pubmed://{1}"
1126                    target_field: uris
1127                    mode: additional
1128                }
1129            }
1130        "#;
1131
1132        let indexes = parse_sdl(sdl).unwrap();
1133        let router = &indexes[0].query_routers[0];
1134
1135        assert_eq!(router.pattern, r"^pmid:(\d+)$");
1136        assert_eq!(router.substitution, "pubmed://{1}");
1137        assert_eq!(router.mode, RoutingMode::Additional);
1138    }
1139
1140    #[test]
1141    fn test_multiple_query_routers() {
1142        let sdl = r#"
1143            index documents {
1144                field uris: text [indexed, stored]
1145
1146                query_router {
1147                    pattern: r"^doi:(10\.\d{4,}/[^\s]+)$"
1148                    substitution: "doi://{1}"
1149                    target_field: uris
1150                    mode: exclusive
1151                }
1152
1153                query_router {
1154                    pattern: r"^pmid:(\d+)$"
1155                    substitution: "pubmed://{1}"
1156                    target_field: uris
1157                    mode: exclusive
1158                }
1159
1160                query_router {
1161                    pattern: r"^arxiv:(\d+\.\d+)$"
1162                    substitution: "arxiv://{1}"
1163                    target_field: uris
1164                    mode: additional
1165                }
1166            }
1167        "#;
1168
1169        let indexes = parse_sdl(sdl).unwrap();
1170        assert_eq!(indexes[0].query_routers.len(), 3);
1171    }
1172
1173    #[test]
1174    fn test_query_router_default_mode() {
1175        let sdl = r#"
1176            index documents {
1177                field uris: text [indexed, stored]
1178
1179                query_router {
1180                    pattern: r"test"
1181                    substitution: "{0}"
1182                    target_field: uris
1183                }
1184            }
1185        "#;
1186
1187        let indexes = parse_sdl(sdl).unwrap();
1188        // Default mode should be Additional
1189        assert_eq!(indexes[0].query_routers[0].mode, RoutingMode::Additional);
1190    }
1191
1192    #[test]
1193    fn test_multi_attribute() {
1194        let sdl = r#"
1195            index documents {
1196                field uris: text [indexed, stored<multi>]
1197                field title: text [indexed, stored]
1198            }
1199        "#;
1200
1201        let indexes = parse_sdl(sdl).unwrap();
1202        assert_eq!(indexes.len(), 1);
1203
1204        let fields = &indexes[0].fields;
1205        assert_eq!(fields.len(), 2);
1206
1207        // uris should have multi=true
1208        assert_eq!(fields[0].name, "uris");
1209        assert!(fields[0].multi, "uris field should have multi=true");
1210
1211        // title should have multi=false
1212        assert_eq!(fields[1].name, "title");
1213        assert!(!fields[1].multi, "title field should have multi=false");
1214
1215        // Verify schema conversion preserves multi attribute
1216        let schema = indexes[0].to_schema();
1217        let uris_field = schema.get_field("uris").unwrap();
1218        let title_field = schema.get_field("title").unwrap();
1219
1220        assert!(schema.get_field_entry(uris_field).unwrap().multi);
1221        assert!(!schema.get_field_entry(title_field).unwrap().multi);
1222    }
1223
1224    #[test]
1225    fn test_sparse_vector_field() {
1226        let sdl = r#"
1227            index documents {
1228                field embedding: sparse_vector [indexed, stored]
1229            }
1230        "#;
1231
1232        let indexes = parse_sdl(sdl).unwrap();
1233        assert_eq!(indexes.len(), 1);
1234        assert_eq!(indexes[0].fields.len(), 1);
1235        assert_eq!(indexes[0].fields[0].name, "embedding");
1236        assert_eq!(indexes[0].fields[0].field_type, FieldType::SparseVector);
1237        assert!(indexes[0].fields[0].sparse_vector_config.is_none());
1238    }
1239
1240    #[test]
1241    fn test_sparse_vector_with_config() {
1242        let sdl = r#"
1243            index documents {
1244                field embedding: sparse_vector<u16> [indexed<quantization: uint8>, stored]
1245                field dense: sparse_vector<u32> [indexed<quantization: float32>]
1246            }
1247        "#;
1248
1249        let indexes = parse_sdl(sdl).unwrap();
1250        assert_eq!(indexes[0].fields.len(), 2);
1251
1252        // First field: u16 indices, uint8 quantization
1253        let f1 = &indexes[0].fields[0];
1254        assert_eq!(f1.name, "embedding");
1255        let config1 = f1.sparse_vector_config.as_ref().unwrap();
1256        assert_eq!(config1.index_size, IndexSize::U16);
1257        assert_eq!(config1.weight_quantization, WeightQuantization::UInt8);
1258
1259        // Second field: u32 indices, float32 quantization
1260        let f2 = &indexes[0].fields[1];
1261        assert_eq!(f2.name, "dense");
1262        let config2 = f2.sparse_vector_config.as_ref().unwrap();
1263        assert_eq!(config2.index_size, IndexSize::U32);
1264        assert_eq!(config2.weight_quantization, WeightQuantization::Float32);
1265    }
1266
1267    #[test]
1268    fn test_sparse_vector_with_weight_threshold() {
1269        let sdl = r#"
1270            index documents {
1271                field embedding: sparse_vector<u16> [indexed<quantization: uint8, weight_threshold: 0.1>, stored]
1272                field embedding2: sparse_vector<u32> [indexed<quantization: float16, weight_threshold: 0.05>]
1273            }
1274        "#;
1275
1276        let indexes = parse_sdl(sdl).unwrap();
1277        assert_eq!(indexes[0].fields.len(), 2);
1278
1279        // First field: u16 indices, uint8 quantization, threshold 0.1
1280        let f1 = &indexes[0].fields[0];
1281        assert_eq!(f1.name, "embedding");
1282        let config1 = f1.sparse_vector_config.as_ref().unwrap();
1283        assert_eq!(config1.index_size, IndexSize::U16);
1284        assert_eq!(config1.weight_quantization, WeightQuantization::UInt8);
1285        assert!((config1.weight_threshold - 0.1).abs() < 0.001);
1286
1287        // Second field: u32 indices, float16 quantization, threshold 0.05
1288        let f2 = &indexes[0].fields[1];
1289        assert_eq!(f2.name, "embedding2");
1290        let config2 = f2.sparse_vector_config.as_ref().unwrap();
1291        assert_eq!(config2.index_size, IndexSize::U32);
1292        assert_eq!(config2.weight_quantization, WeightQuantization::Float16);
1293        assert!((config2.weight_threshold - 0.05).abs() < 0.001);
1294    }
1295
1296    #[test]
1297    fn test_sparse_vector_with_pruning() {
1298        let sdl = r#"
1299            index documents {
1300                field embedding: sparse_vector [indexed<quantization: uint8, pruning: 0.1>, stored]
1301            }
1302        "#;
1303
1304        let indexes = parse_sdl(sdl).unwrap();
1305        let f = &indexes[0].fields[0];
1306        assert_eq!(f.name, "embedding");
1307        let config = f.sparse_vector_config.as_ref().unwrap();
1308        assert_eq!(config.weight_quantization, WeightQuantization::UInt8);
1309        assert_eq!(config.pruning, Some(0.1));
1310    }
1311
1312    #[test]
1313    fn test_dense_vector_field() {
1314        let sdl = r#"
1315            index documents {
1316                field embedding: dense_vector<768> [indexed, stored]
1317            }
1318        "#;
1319
1320        let indexes = parse_sdl(sdl).unwrap();
1321        assert_eq!(indexes.len(), 1);
1322        assert_eq!(indexes[0].fields.len(), 1);
1323
1324        let f = &indexes[0].fields[0];
1325        assert_eq!(f.name, "embedding");
1326        assert_eq!(f.field_type, FieldType::DenseVector);
1327
1328        let config = f.dense_vector_config.as_ref().unwrap();
1329        assert_eq!(config.dim, 768);
1330    }
1331
1332    #[test]
1333    fn test_dense_vector_alias() {
1334        let sdl = r#"
1335            index documents {
1336                field embedding: vector<1536> [indexed]
1337            }
1338        "#;
1339
1340        let indexes = parse_sdl(sdl).unwrap();
1341        assert_eq!(indexes[0].fields[0].field_type, FieldType::DenseVector);
1342        assert_eq!(
1343            indexes[0].fields[0]
1344                .dense_vector_config
1345                .as_ref()
1346                .unwrap()
1347                .dim,
1348            1536
1349        );
1350    }
1351
1352    #[test]
1353    fn test_dense_vector_with_num_clusters() {
1354        let sdl = r#"
1355            index documents {
1356                field embedding: dense_vector<768> [indexed<ivf_rabitq, num_clusters: 256>, stored]
1357            }
1358        "#;
1359
1360        let indexes = parse_sdl(sdl).unwrap();
1361        assert_eq!(indexes.len(), 1);
1362
1363        let f = &indexes[0].fields[0];
1364        assert_eq!(f.name, "embedding");
1365        assert_eq!(f.field_type, FieldType::DenseVector);
1366
1367        let config = f.dense_vector_config.as_ref().unwrap();
1368        assert_eq!(config.dim, 768);
1369        assert_eq!(config.num_clusters, Some(256));
1370        assert_eq!(config.nprobe, 32); // default
1371    }
1372
1373    #[test]
1374    fn test_dense_vector_with_num_clusters_and_nprobe() {
1375        let sdl = r#"
1376            index documents {
1377                field embedding: dense_vector<1536> [indexed<ivf_rabitq, num_clusters: 512, nprobe: 64>]
1378            }
1379        "#;
1380
1381        let indexes = parse_sdl(sdl).unwrap();
1382        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1383
1384        assert_eq!(config.dim, 1536);
1385        assert_eq!(config.num_clusters, Some(512));
1386        assert_eq!(config.nprobe, 64);
1387    }
1388
1389    #[test]
1390    fn test_dense_vector_keyword_syntax() {
1391        let sdl = r#"
1392            index documents {
1393                field embedding: dense_vector<dims: 1536> [indexed, stored]
1394            }
1395        "#;
1396
1397        let indexes = parse_sdl(sdl).unwrap();
1398        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1399
1400        assert_eq!(config.dim, 1536);
1401        assert!(config.num_clusters.is_none());
1402    }
1403
1404    #[test]
1405    fn test_dense_vector_keyword_syntax_full() {
1406        let sdl = r#"
1407            index documents {
1408                field embedding: dense_vector<dims: 1536> [indexed<ivf_rabitq, num_clusters: 256, nprobe: 64>]
1409            }
1410        "#;
1411
1412        let indexes = parse_sdl(sdl).unwrap();
1413        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1414
1415        assert_eq!(config.dim, 1536);
1416        assert_eq!(config.num_clusters, Some(256));
1417        assert_eq!(config.nprobe, 64);
1418    }
1419
1420    #[test]
1421    fn test_dense_vector_keyword_syntax_partial() {
1422        let sdl = r#"
1423            index documents {
1424                field embedding: dense_vector<dims: 768> [indexed<ivf_rabitq, num_clusters: 128>]
1425            }
1426        "#;
1427
1428        let indexes = parse_sdl(sdl).unwrap();
1429        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1430
1431        assert_eq!(config.dim, 768);
1432        assert_eq!(config.num_clusters, Some(128));
1433        assert_eq!(config.nprobe, 32); // default
1434    }
1435
1436    #[test]
1437    fn test_dense_vector_scann_index() {
1438        use crate::dsl::schema::VectorIndexType;
1439
1440        let sdl = r#"
1441            index documents {
1442                field embedding: dense_vector<dims: 768> [indexed<scann, num_clusters: 256, nprobe: 64>]
1443            }
1444        "#;
1445
1446        let indexes = parse_sdl(sdl).unwrap();
1447        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1448
1449        assert_eq!(config.dim, 768);
1450        assert_eq!(config.index_type, VectorIndexType::ScaNN);
1451        assert_eq!(config.num_clusters, Some(256));
1452        assert_eq!(config.nprobe, 64);
1453    }
1454
1455    #[test]
1456    fn test_dense_vector_ivf_rabitq_index() {
1457        use crate::dsl::schema::VectorIndexType;
1458
1459        let sdl = r#"
1460            index documents {
1461                field embedding: dense_vector<dims: 1536> [indexed<ivf_rabitq, num_clusters: 512>]
1462            }
1463        "#;
1464
1465        let indexes = parse_sdl(sdl).unwrap();
1466        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1467
1468        assert_eq!(config.dim, 1536);
1469        assert_eq!(config.index_type, VectorIndexType::IvfRaBitQ);
1470        assert_eq!(config.num_clusters, Some(512));
1471    }
1472
1473    #[test]
1474    fn test_dense_vector_rabitq_no_clusters() {
1475        use crate::dsl::schema::VectorIndexType;
1476
1477        let sdl = r#"
1478            index documents {
1479                field embedding: dense_vector<dims: 768> [indexed<rabitq>]
1480            }
1481        "#;
1482
1483        let indexes = parse_sdl(sdl).unwrap();
1484        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1485
1486        assert_eq!(config.dim, 768);
1487        assert_eq!(config.index_type, VectorIndexType::RaBitQ);
1488        assert!(config.num_clusters.is_none());
1489    }
1490
1491    #[test]
1492    fn test_dense_vector_flat_index() {
1493        use crate::dsl::schema::VectorIndexType;
1494
1495        let sdl = r#"
1496            index documents {
1497                field embedding: dense_vector<dims: 768> [indexed<flat>]
1498            }
1499        "#;
1500
1501        let indexes = parse_sdl(sdl).unwrap();
1502        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1503
1504        assert_eq!(config.dim, 768);
1505        assert_eq!(config.index_type, VectorIndexType::Flat);
1506    }
1507
1508    #[test]
1509    fn test_dense_vector_default_index_type() {
1510        use crate::dsl::schema::VectorIndexType;
1511
1512        // When no index type specified, should default to RaBitQ (basic)
1513        let sdl = r#"
1514            index documents {
1515                field embedding: dense_vector<dims: 768> [indexed]
1516            }
1517        "#;
1518
1519        let indexes = parse_sdl(sdl).unwrap();
1520        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1521
1522        assert_eq!(config.dim, 768);
1523        assert_eq!(config.index_type, VectorIndexType::RaBitQ);
1524    }
1525
1526    #[test]
1527    fn test_dense_vector_f16_quantization() {
1528        use crate::dsl::schema::{DenseVectorQuantization, VectorIndexType};
1529
1530        let sdl = r#"
1531            index documents {
1532                field embedding: dense_vector<768, f16> [indexed]
1533            }
1534        "#;
1535
1536        let indexes = parse_sdl(sdl).unwrap();
1537        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1538
1539        assert_eq!(config.dim, 768);
1540        assert_eq!(config.quantization, DenseVectorQuantization::F16);
1541        assert_eq!(config.index_type, VectorIndexType::RaBitQ);
1542    }
1543
1544    #[test]
1545    fn test_dense_vector_uint8_quantization() {
1546        use crate::dsl::schema::DenseVectorQuantization;
1547
1548        let sdl = r#"
1549            index documents {
1550                field embedding: dense_vector<1024, uint8> [indexed<ivf_rabitq>]
1551            }
1552        "#;
1553
1554        let indexes = parse_sdl(sdl).unwrap();
1555        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1556
1557        assert_eq!(config.dim, 1024);
1558        assert_eq!(config.quantization, DenseVectorQuantization::UInt8);
1559    }
1560
1561    #[test]
1562    fn test_dense_vector_u8_alias() {
1563        use crate::dsl::schema::DenseVectorQuantization;
1564
1565        let sdl = r#"
1566            index documents {
1567                field embedding: dense_vector<512, u8> [indexed]
1568            }
1569        "#;
1570
1571        let indexes = parse_sdl(sdl).unwrap();
1572        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1573
1574        assert_eq!(config.dim, 512);
1575        assert_eq!(config.quantization, DenseVectorQuantization::UInt8);
1576    }
1577
1578    #[test]
1579    fn test_dense_vector_default_f32_quantization() {
1580        use crate::dsl::schema::DenseVectorQuantization;
1581
1582        // No quantization type → default f32
1583        let sdl = r#"
1584            index documents {
1585                field embedding: dense_vector<768> [indexed]
1586            }
1587        "#;
1588
1589        let indexes = parse_sdl(sdl).unwrap();
1590        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1591
1592        assert_eq!(config.dim, 768);
1593        assert_eq!(config.quantization, DenseVectorQuantization::F32);
1594    }
1595
1596    #[test]
1597    fn test_dense_vector_keyword_with_quantization() {
1598        use crate::dsl::schema::DenseVectorQuantization;
1599
1600        let sdl = r#"
1601            index documents {
1602                field embedding: dense_vector<dims: 768, f16> [indexed]
1603            }
1604        "#;
1605
1606        let indexes = parse_sdl(sdl).unwrap();
1607        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1608
1609        assert_eq!(config.dim, 768);
1610        assert_eq!(config.quantization, DenseVectorQuantization::F16);
1611    }
1612
1613    #[test]
1614    fn test_json_field_type() {
1615        let sdl = r#"
1616            index documents {
1617                field title: text [indexed, stored]
1618                field metadata: json [stored]
1619                field extra: json
1620            }
1621        "#;
1622
1623        let indexes = parse_sdl(sdl).unwrap();
1624        let index = &indexes[0];
1625
1626        assert_eq!(index.fields.len(), 3);
1627
1628        // Check JSON field
1629        assert_eq!(index.fields[1].name, "metadata");
1630        assert!(matches!(index.fields[1].field_type, FieldType::Json));
1631        assert!(index.fields[1].stored);
1632        // JSON fields should not be indexed (enforced by add_json_field)
1633
1634        // Check default attributes for JSON field
1635        assert_eq!(index.fields[2].name, "extra");
1636        assert!(matches!(index.fields[2].field_type, FieldType::Json));
1637
1638        // Verify schema conversion
1639        let schema = index.to_schema();
1640        let metadata_field = schema.get_field("metadata").unwrap();
1641        let entry = schema.get_field_entry(metadata_field).unwrap();
1642        assert_eq!(entry.field_type, FieldType::Json);
1643        assert!(!entry.indexed); // JSON fields are never indexed
1644        assert!(entry.stored);
1645    }
1646
1647    #[test]
1648    fn test_sparse_vector_query_config() {
1649        use crate::structures::QueryWeighting;
1650
1651        let sdl = r#"
1652            index documents {
1653                field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<tokenizer: "Alibaba-NLP/gte-Qwen2-1.5B-instruct", weighting: idf>>]
1654            }
1655        "#;
1656
1657        let indexes = parse_sdl(sdl).unwrap();
1658        let index = &indexes[0];
1659
1660        assert_eq!(index.fields.len(), 1);
1661        assert_eq!(index.fields[0].name, "embedding");
1662        assert!(matches!(
1663            index.fields[0].field_type,
1664            FieldType::SparseVector
1665        ));
1666
1667        let config = index.fields[0].sparse_vector_config.as_ref().unwrap();
1668        assert_eq!(config.index_size, IndexSize::U16);
1669        assert_eq!(config.weight_quantization, WeightQuantization::UInt8);
1670
1671        // Check query config
1672        let query_config = config.query_config.as_ref().unwrap();
1673        assert_eq!(
1674            query_config.tokenizer.as_deref(),
1675            Some("Alibaba-NLP/gte-Qwen2-1.5B-instruct")
1676        );
1677        assert_eq!(query_config.weighting, QueryWeighting::Idf);
1678
1679        // Verify schema conversion preserves query config
1680        let schema = index.to_schema();
1681        let embedding_field = schema.get_field("embedding").unwrap();
1682        let entry = schema.get_field_entry(embedding_field).unwrap();
1683        let sv_config = entry.sparse_vector_config.as_ref().unwrap();
1684        let qc = sv_config.query_config.as_ref().unwrap();
1685        assert_eq!(
1686            qc.tokenizer.as_deref(),
1687            Some("Alibaba-NLP/gte-Qwen2-1.5B-instruct")
1688        );
1689        assert_eq!(qc.weighting, QueryWeighting::Idf);
1690    }
1691
1692    #[test]
1693    fn test_sparse_vector_query_config_weighting_one() {
1694        use crate::structures::QueryWeighting;
1695
1696        let sdl = r#"
1697            index documents {
1698                field embedding: sparse_vector [indexed<query<weighting: one>>]
1699            }
1700        "#;
1701
1702        let indexes = parse_sdl(sdl).unwrap();
1703        let config = indexes[0].fields[0].sparse_vector_config.as_ref().unwrap();
1704
1705        let query_config = config.query_config.as_ref().unwrap();
1706        assert!(query_config.tokenizer.is_none());
1707        assert_eq!(query_config.weighting, QueryWeighting::One);
1708    }
1709
1710    #[test]
1711    fn test_sparse_vector_query_config_weighting_idf_file() {
1712        use crate::structures::QueryWeighting;
1713
1714        let sdl = r#"
1715            index documents {
1716                field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<tokenizer: "opensearch-neural-sparse-encoding-v1", weighting: idf_file>>]
1717            }
1718        "#;
1719
1720        let indexes = parse_sdl(sdl).unwrap();
1721        let config = indexes[0].fields[0].sparse_vector_config.as_ref().unwrap();
1722
1723        let query_config = config.query_config.as_ref().unwrap();
1724        assert_eq!(
1725            query_config.tokenizer.as_deref(),
1726            Some("opensearch-neural-sparse-encoding-v1")
1727        );
1728        assert_eq!(query_config.weighting, QueryWeighting::IdfFile);
1729
1730        // Verify schema conversion preserves idf_file
1731        let schema = indexes[0].to_schema();
1732        let field = schema.get_field("embedding").unwrap();
1733        let entry = schema.get_field_entry(field).unwrap();
1734        let sc = entry.sparse_vector_config.as_ref().unwrap();
1735        let qc = sc.query_config.as_ref().unwrap();
1736        assert_eq!(qc.weighting, QueryWeighting::IdfFile);
1737    }
1738
1739    #[test]
1740    fn test_sparse_vector_query_config_pruning_params() {
1741        let sdl = r#"
1742            index documents {
1743                field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<weighting: idf, weight_threshold: 0.03, max_dims: 25, pruning: 0.2>>]
1744            }
1745        "#;
1746
1747        let indexes = parse_sdl(sdl).unwrap();
1748        let config = indexes[0].fields[0].sparse_vector_config.as_ref().unwrap();
1749
1750        let qc = config.query_config.as_ref().unwrap();
1751        assert_eq!(qc.weighting, QueryWeighting::Idf);
1752        assert!((qc.weight_threshold - 0.03).abs() < 0.001);
1753        assert_eq!(qc.max_query_dims, Some(25));
1754        assert!((qc.pruning.unwrap() - 0.2).abs() < 0.001);
1755
1756        // Verify schema roundtrip
1757        let schema = indexes[0].to_schema();
1758        let field = schema.get_field("embedding").unwrap();
1759        let entry = schema.get_field_entry(field).unwrap();
1760        let sc = entry.sparse_vector_config.as_ref().unwrap();
1761        let rqc = sc.query_config.as_ref().unwrap();
1762        assert!((rqc.weight_threshold - 0.03).abs() < 0.001);
1763        assert_eq!(rqc.max_query_dims, Some(25));
1764        assert!((rqc.pruning.unwrap() - 0.2).abs() < 0.001);
1765    }
1766
1767    #[test]
1768    fn test_fast_attribute() {
1769        let sdl = r#"
1770            index products {
1771                field name: text [indexed, stored]
1772                field price: f64 [indexed, fast]
1773                field category: text [indexed, stored, fast]
1774                field count: u64 [fast]
1775                field score: i64 [indexed, stored, fast]
1776            }
1777        "#;
1778
1779        let indexes = parse_sdl(sdl).unwrap();
1780        assert_eq!(indexes.len(), 1);
1781        let index = &indexes[0];
1782        assert_eq!(index.fields.len(), 5);
1783
1784        // name: no fast
1785        assert!(!index.fields[0].fast);
1786        // price: fast
1787        assert!(index.fields[1].fast);
1788        assert!(matches!(index.fields[1].field_type, FieldType::F64));
1789        // category: fast text
1790        assert!(index.fields[2].fast);
1791        assert!(matches!(index.fields[2].field_type, FieldType::Text));
1792        // count: fast only
1793        assert!(index.fields[3].fast);
1794        assert!(matches!(index.fields[3].field_type, FieldType::U64));
1795        // score: fast i64
1796        assert!(index.fields[4].fast);
1797        assert!(matches!(index.fields[4].field_type, FieldType::I64));
1798
1799        // Verify schema roundtrip preserves fast flag
1800        let schema = index.to_schema();
1801        let price_field = schema.get_field("price").unwrap();
1802        assert!(schema.get_field_entry(price_field).unwrap().fast);
1803
1804        let category_field = schema.get_field("category").unwrap();
1805        assert!(schema.get_field_entry(category_field).unwrap().fast);
1806
1807        let name_field = schema.get_field("name").unwrap();
1808        assert!(!schema.get_field_entry(name_field).unwrap().fast);
1809    }
1810
1811    #[test]
1812    fn test_primary_attribute() {
1813        let sdl = r#"
1814            index documents {
1815                field id: text [primary, stored]
1816                field title: text [indexed, stored]
1817            }
1818        "#;
1819
1820        let indexes = parse_sdl(sdl).unwrap();
1821        assert_eq!(indexes.len(), 1);
1822        let index = &indexes[0];
1823        assert_eq!(index.fields.len(), 2);
1824
1825        // id should be primary, and auto-set fast + indexed
1826        let id_field = &index.fields[0];
1827        assert!(id_field.primary, "id should be primary");
1828        assert!(id_field.fast, "primary implies fast");
1829        assert!(id_field.indexed, "primary implies indexed");
1830
1831        // title should NOT be primary
1832        assert!(!index.fields[1].primary);
1833
1834        // Verify schema conversion preserves primary_key
1835        let schema = index.to_schema();
1836        let id = schema.get_field("id").unwrap();
1837        let id_entry = schema.get_field_entry(id).unwrap();
1838        assert!(id_entry.primary_key);
1839        assert!(id_entry.fast);
1840        assert!(id_entry.indexed);
1841
1842        let title = schema.get_field("title").unwrap();
1843        assert!(!schema.get_field_entry(title).unwrap().primary_key);
1844
1845        // primary_field() should return the primary field
1846        assert_eq!(schema.primary_field(), Some(id));
1847    }
1848
1849    #[test]
1850    fn test_primary_with_other_attributes() {
1851        let sdl = r#"
1852            index documents {
1853                field id: text<simple> [primary, indexed, stored]
1854                field body: text [indexed]
1855            }
1856        "#;
1857
1858        let indexes = parse_sdl(sdl).unwrap();
1859        let id_field = &indexes[0].fields[0];
1860        assert!(id_field.primary);
1861        assert!(id_field.indexed);
1862        assert!(id_field.stored);
1863        assert!(id_field.fast);
1864        assert_eq!(id_field.tokenizer, Some("simple".to_string()));
1865    }
1866
1867    #[test]
1868    fn test_primary_only_one_allowed() {
1869        let sdl = r#"
1870            index documents {
1871                field id: text [primary]
1872                field alt_id: text [primary]
1873            }
1874        "#;
1875
1876        let result = parse_sdl(sdl);
1877        assert!(result.is_err());
1878        let err = result.unwrap_err().to_string();
1879        assert!(
1880            err.contains("primary key"),
1881            "Error should mention primary key: {}",
1882            err
1883        );
1884    }
1885
1886    #[test]
1887    fn test_primary_must_be_text() {
1888        let sdl = r#"
1889            index documents {
1890                field id: u64 [primary]
1891            }
1892        "#;
1893
1894        let result = parse_sdl(sdl);
1895        assert!(result.is_err());
1896        let err = result.unwrap_err().to_string();
1897        assert!(
1898            err.contains("text"),
1899            "Error should mention text type: {}",
1900            err
1901        );
1902    }
1903
1904    #[test]
1905    fn test_primary_cannot_be_multi() {
1906        let sdl = r#"
1907            index documents {
1908                field id: text [primary, stored<multi>]
1909            }
1910        "#;
1911
1912        let result = parse_sdl(sdl);
1913        assert!(result.is_err());
1914        let err = result.unwrap_err().to_string();
1915        assert!(err.contains("multi"), "Error should mention multi: {}", err);
1916    }
1917
1918    #[test]
1919    fn test_no_primary_field() {
1920        // Schema without primary field should work fine
1921        let sdl = r#"
1922            index documents {
1923                field title: text [indexed, stored]
1924            }
1925        "#;
1926
1927        let indexes = parse_sdl(sdl).unwrap();
1928        let schema = indexes[0].to_schema();
1929        assert!(schema.primary_field().is_none());
1930    }
1931}