Skip to main content

hermes_core/dsl/sdl/
mod.rs

1//! Schema Definition Language (SDL) for Hermes
2//!
3//! A simple, readable format for defining index schemas using pest parser.
4//!
5//! # Example SDL
6//!
7//! ```text
8//! # Article index schema
9//! index articles {
10//!     # Primary text field for full-text search
11//!     field title: text [indexed, stored]
12//!
13//!     # Body content - indexed but not stored (save space)
14//!     field body: text [indexed]
15//!
16//!     # Author name
17//!     field author: text [indexed, stored]
18//!
19//!     # Publication timestamp
20//!     field published_at: i64 [indexed, stored]
21//!
22//!     # View count
23//!     field views: u64 [indexed, stored]
24//!
25//!     # Rating score
26//!     field rating: f64 [indexed, stored]
27//!
28//!     # Raw content hash (not indexed, just stored)
29//!     field content_hash: bytes [stored]
30//!
31//!     # Dense vector with IVF-RaBitQ index
32//!     field embedding: dense_vector<768> [indexed<rabitq, centroids: "centroids.bin", nprobe: 32>]
33//!
34//! }
35//! ```
36//!
37//! # Dense Vector Index Configuration
38//!
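//! Index-related parameters for dense vectors are specified in `indexed<...>`:
//! - `flat`, `rabitq`, `ivf_rabitq` or `scann` - index type (also accepted as `index: <type>`)
//! - `centroids: "path"` - path to pre-trained centroids file
//! - `codebook: "path"` - path to PQ codebook (ScaNN only)
//! - `num_clusters: N` - number of clusters for IVF-based indexes
//! - `build_threshold: N` - index build threshold
//! - `nprobe: N` - number of clusters to probe (default: 32)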
44
45use pest::Parser;
46use pest_derive::Parser;
47
48use super::query_field_router::{QueryRouterRule, RoutingMode};
49use super::schema::{DenseVectorQuantization, FieldType, Schema, SchemaBuilder};
50use crate::Result;
51use crate::error::Error;
52
53#[derive(Parser)]
54#[grammar = "dsl/sdl/sdl.pest"]
55pub struct SdlParser;
56
57use super::schema::DenseVectorConfig;
58use crate::structures::{
59    IndexSize, QueryWeighting, SparseQueryConfig, SparseVectorConfig, WeightQuantization,
60};
61
62/// Parsed field definition
63#[derive(Debug, Clone)]
64pub struct FieldDef {
65    pub name: String,
66    pub field_type: FieldType,
67    pub indexed: bool,
68    pub stored: bool,
69    /// Tokenizer name for text fields (e.g., "simple", "en_stem", "german")
70    pub tokenizer: Option<String>,
71    /// Whether this field can have multiple values (serialized as array in JSON)
72    pub multi: bool,
73    /// Position tracking mode for phrase queries and multi-field element tracking
74    pub positions: Option<super::schema::PositionMode>,
75    /// Configuration for sparse vector fields
76    pub sparse_vector_config: Option<SparseVectorConfig>,
77    /// Configuration for dense vector fields
78    pub dense_vector_config: Option<DenseVectorConfig>,
79    /// Whether this field has columnar fast-field storage
80    pub fast: bool,
81    /// Whether this field is a primary key (unique constraint)
82    pub primary: bool,
83}
84
85/// Parsed index definition
86#[derive(Debug, Clone)]
87pub struct IndexDef {
88    pub name: String,
89    pub fields: Vec<FieldDef>,
90    pub default_fields: Vec<String>,
91    /// Query router rules for routing queries to specific fields
92    pub query_routers: Vec<QueryRouterRule>,
93}
94
95impl IndexDef {
96    /// Convert to a Schema
97    pub fn to_schema(&self) -> Schema {
98        let mut builder = SchemaBuilder::default();
99
100        for field in &self.fields {
101            let f = match field.field_type {
102                FieldType::Text => {
103                    let tokenizer = field.tokenizer.as_deref().unwrap_or("simple");
104                    builder.add_text_field_with_tokenizer(
105                        &field.name,
106                        field.indexed,
107                        field.stored,
108                        tokenizer,
109                    )
110                }
111                FieldType::U64 => builder.add_u64_field(&field.name, field.indexed, field.stored),
112                FieldType::I64 => builder.add_i64_field(&field.name, field.indexed, field.stored),
113                FieldType::F64 => builder.add_f64_field(&field.name, field.indexed, field.stored),
114                FieldType::Bytes => builder.add_bytes_field(&field.name, field.stored),
115                FieldType::Json => builder.add_json_field(&field.name, field.stored),
116                FieldType::SparseVector => {
117                    if let Some(config) = &field.sparse_vector_config {
118                        builder.add_sparse_vector_field_with_config(
119                            &field.name,
120                            field.indexed,
121                            field.stored,
122                            config.clone(),
123                        )
124                    } else {
125                        builder.add_sparse_vector_field(&field.name, field.indexed, field.stored)
126                    }
127                }
128                FieldType::DenseVector => {
129                    // Dense vector dimension must be specified via config
130                    let config = field
131                        .dense_vector_config
132                        .as_ref()
133                        .expect("DenseVector field requires dimension to be specified");
134                    builder.add_dense_vector_field_with_config(
135                        &field.name,
136                        field.indexed,
137                        field.stored,
138                        config.clone(),
139                    )
140                }
141            };
142            if field.multi {
143                builder.set_multi(f, true);
144            }
145            if field.fast {
146                builder.set_fast(f, true);
147            }
148            if field.primary {
149                builder.set_primary_key(f);
150            }
151            // Set positions: explicit > auto (ordinal for multi vectors)
152            let positions = field.positions.or({
153                // Auto-set ordinal positions for multi-valued vector fields
154                if field.multi
155                    && matches!(
156                        field.field_type,
157                        FieldType::SparseVector | FieldType::DenseVector
158                    )
159                {
160                    Some(super::schema::PositionMode::Ordinal)
161                } else {
162                    None
163                }
164            });
165            if let Some(mode) = positions {
166                builder.set_positions(f, mode);
167            }
168        }
169
170        // Set default fields if specified
171        if !self.default_fields.is_empty() {
172            builder.set_default_fields(self.default_fields.clone());
173        }
174
175        // Set query routers if specified
176        if !self.query_routers.is_empty() {
177            builder.set_query_routers(self.query_routers.clone());
178        }
179
180        builder.build()
181    }
182
183    /// Create a QueryFieldRouter from the query router rules
184    ///
185    /// Returns None if there are no query router rules defined.
186    /// Returns Err if any regex pattern is invalid.
187    pub fn to_query_router(&self) -> Result<Option<super::query_field_router::QueryFieldRouter>> {
188        if self.query_routers.is_empty() {
189            return Ok(None);
190        }
191
192        super::query_field_router::QueryFieldRouter::from_rules(&self.query_routers)
193            .map(Some)
194            .map_err(Error::Schema)
195    }
196}
197
198/// Parse field type from string
199fn parse_field_type(type_str: &str) -> Result<FieldType> {
200    match type_str {
201        "text" | "string" | "str" => Ok(FieldType::Text),
202        "u64" | "uint" | "unsigned" => Ok(FieldType::U64),
203        "i64" | "int" | "integer" => Ok(FieldType::I64),
204        "f64" | "float" | "double" => Ok(FieldType::F64),
205        "bytes" | "binary" | "blob" => Ok(FieldType::Bytes),
206        "json" => Ok(FieldType::Json),
207        "sparse_vector" => Ok(FieldType::SparseVector),
208        "dense_vector" | "vector" => Ok(FieldType::DenseVector),
209        _ => Err(Error::Schema(format!("Unknown field type: {}", type_str))),
210    }
211}
212
213/// Index configuration parsed from indexed<...> attribute
214#[derive(Debug, Clone, Default)]
215struct IndexConfig {
216    index_type: Option<super::schema::VectorIndexType>,
217    num_clusters: Option<usize>,
218    nprobe: Option<usize>,
219    build_threshold: Option<usize>,
220    // Sparse vector index params
221    quantization: Option<WeightQuantization>,
222    weight_threshold: Option<f32>,
223    block_size: Option<usize>,
224    pruning: Option<f32>,
225    // Sparse vector query-time config
226    query_tokenizer: Option<String>,
227    query_weighting: Option<QueryWeighting>,
228    query_weight_threshold: Option<f32>,
229    query_max_dims: Option<usize>,
230    query_pruning: Option<f32>,
231    // Position tracking mode for phrase queries
232    positions: Option<super::schema::PositionMode>,
233}
234
235/// Parse attributes from pest pair
236/// Returns (indexed, stored, multi, fast, primary, index_config)
237/// positions is now inside index_config (via indexed<positions> or indexed<ordinal> etc.)
238/// multi is now inside stored<multi>
239fn parse_attributes(
240    pair: pest::iterators::Pair<Rule>,
241) -> (bool, bool, bool, bool, bool, Option<IndexConfig>) {
242    let mut indexed = false;
243    let mut stored = false;
244    let mut multi = false;
245    let mut fast = false;
246    let mut primary = false;
247    let mut index_config = None;
248
249    for attr in pair.into_inner() {
250        if attr.as_rule() == Rule::attribute {
251            // attribute = { indexed_with_config | "indexed" | stored_with_config | "stored" | "fast" | "primary" }
252            let mut found_config = false;
253            for inner in attr.clone().into_inner() {
254                match inner.as_rule() {
255                    Rule::indexed_with_config => {
256                        indexed = true;
257                        index_config = Some(parse_index_config(inner));
258                        found_config = true;
259                        break;
260                    }
261                    Rule::stored_with_config => {
262                        stored = true;
263                        multi = true; // stored<multi>
264                        found_config = true;
265                        break;
266                    }
267                    _ => {}
268                }
269            }
270            if !found_config {
271                // Simple attribute
272                match attr.as_str() {
273                    "indexed" => indexed = true,
274                    "stored" => stored = true,
275                    "fast" => fast = true,
276                    "primary" => primary = true,
277                    _ => {}
278                }
279            }
280        }
281    }
282
283    (indexed, stored, multi, fast, primary, index_config)
284}
285
286/// Parse index configuration from indexed<...> attribute
287fn parse_index_config(pair: pest::iterators::Pair<Rule>) -> IndexConfig {
288    let mut config = IndexConfig::default();
289
290    // indexed_with_config = { "indexed" ~ "<" ~ index_config_params ~ ">" }
291    // index_config_params = { index_config_param ~ ("," ~ index_config_param)* }
292    // index_config_param = { index_type_kwarg | centroids_kwarg | codebook_kwarg | nprobe_kwarg | index_type_spec }
293
294    for inner in pair.into_inner() {
295        if inner.as_rule() == Rule::index_config_params {
296            for param in inner.into_inner() {
297                if param.as_rule() == Rule::index_config_param {
298                    for p in param.into_inner() {
299                        parse_single_index_config_param(&mut config, p);
300                    }
301                }
302            }
303        }
304    }
305
306    config
307}
308
309/// Parse a single index config parameter
310fn parse_single_index_config_param(config: &mut IndexConfig, p: pest::iterators::Pair<Rule>) {
311    use super::schema::VectorIndexType;
312
313    match p.as_rule() {
314        Rule::index_type_spec => {
315            config.index_type = Some(match p.as_str() {
316                "flat" => VectorIndexType::Flat,
317                "rabitq" => VectorIndexType::RaBitQ,
318                "ivf_rabitq" => VectorIndexType::IvfRaBitQ,
319                "scann" => VectorIndexType::ScaNN,
320                _ => VectorIndexType::RaBitQ,
321            });
322        }
323        Rule::index_type_kwarg => {
324            // index_type_kwarg = { "index" ~ ":" ~ index_type_spec }
325            if let Some(t) = p.into_inner().next() {
326                config.index_type = Some(match t.as_str() {
327                    "flat" => VectorIndexType::Flat,
328                    "rabitq" => VectorIndexType::RaBitQ,
329                    "ivf_rabitq" => VectorIndexType::IvfRaBitQ,
330                    "scann" => VectorIndexType::ScaNN,
331                    _ => VectorIndexType::RaBitQ,
332                });
333            }
334        }
335        Rule::num_clusters_kwarg => {
336            // num_clusters_kwarg = { "num_clusters" ~ ":" ~ num_clusters_spec }
337            if let Some(n) = p.into_inner().next() {
338                config.num_clusters = Some(n.as_str().parse().unwrap_or_else(|_| {
339                    log::warn!(
340                        "Invalid num_clusters value '{}', using default 256",
341                        n.as_str()
342                    );
343                    256
344                }));
345            }
346        }
347        Rule::build_threshold_kwarg => {
348            // build_threshold_kwarg = { "build_threshold" ~ ":" ~ build_threshold_spec }
349            if let Some(n) = p.into_inner().next() {
350                config.build_threshold = Some(n.as_str().parse().unwrap_or_else(|_| {
351                    log::warn!(
352                        "Invalid build_threshold value '{}', using default 10000",
353                        n.as_str()
354                    );
355                    10000
356                }));
357            }
358        }
359        Rule::nprobe_kwarg => {
360            // nprobe_kwarg = { "nprobe" ~ ":" ~ nprobe_spec }
361            if let Some(n) = p.into_inner().next() {
362                config.nprobe = Some(n.as_str().parse().unwrap_or_else(|_| {
363                    log::warn!("Invalid nprobe value '{}', using default 32", n.as_str());
364                    32
365                }));
366            }
367        }
368        Rule::quantization_kwarg => {
369            // quantization_kwarg = { "quantization" ~ ":" ~ quantization_spec }
370            if let Some(q) = p.into_inner().next() {
371                config.quantization = Some(match q.as_str() {
372                    "float32" | "f32" => WeightQuantization::Float32,
373                    "float16" | "f16" => WeightQuantization::Float16,
374                    "uint8" | "u8" => WeightQuantization::UInt8,
375                    "uint4" | "u4" => WeightQuantization::UInt4,
376                    _ => WeightQuantization::default(),
377                });
378            }
379        }
380        Rule::weight_threshold_kwarg => {
381            // weight_threshold_kwarg = { "weight_threshold" ~ ":" ~ weight_threshold_spec }
382            if let Some(t) = p.into_inner().next() {
383                config.weight_threshold = Some(t.as_str().parse().unwrap_or_else(|_| {
384                    log::warn!(
385                        "Invalid weight_threshold value '{}', using default 0.0",
386                        t.as_str()
387                    );
388                    0.0
389                }));
390            }
391        }
392        Rule::block_size_kwarg => {
393            // block_size_kwarg = { "block_size" ~ ":" ~ block_size_spec }
394            if let Some(n) = p.into_inner().next() {
395                config.block_size = Some(n.as_str().parse().unwrap_or_else(|_| {
396                    log::warn!(
397                        "Invalid block_size value '{}', using default 128",
398                        n.as_str()
399                    );
400                    128
401                }));
402            }
403        }
404        Rule::pruning_kwarg => {
405            // pruning_kwarg = { "pruning" ~ ":" ~ pruning_spec }
406            if let Some(f) = p.into_inner().next() {
407                config.pruning = Some(f.as_str().parse().unwrap_or_else(|_| {
408                    log::warn!("Invalid pruning value '{}', using default 1.0", f.as_str());
409                    1.0
410                }));
411            }
412        }
413        Rule::query_config_block => {
414            // query_config_block = { "query" ~ "<" ~ query_config_params ~ ">" }
415            parse_query_config_block(config, p);
416        }
417        Rule::positions_kwarg => {
418            // positions_kwarg = { "positions" | "ordinal" | "token_position" }
419            use super::schema::PositionMode;
420            config.positions = Some(match p.as_str() {
421                "ordinal" => PositionMode::Ordinal,
422                "token_position" => PositionMode::TokenPosition,
423                _ => PositionMode::Full, // "positions" or any other value defaults to Full
424            });
425        }
426        _ => {}
427    }
428}
429
430/// Parse query configuration block: query<tokenizer: "...", weighting: idf>
431fn parse_query_config_block(config: &mut IndexConfig, pair: pest::iterators::Pair<Rule>) {
432    for inner in pair.into_inner() {
433        if inner.as_rule() == Rule::query_config_params {
434            for param in inner.into_inner() {
435                if param.as_rule() == Rule::query_config_param {
436                    for p in param.into_inner() {
437                        match p.as_rule() {
438                            Rule::query_tokenizer_kwarg => {
439                                // query_tokenizer_kwarg = { "tokenizer" ~ ":" ~ tokenizer_path }
440                                if let Some(path) = p.into_inner().next()
441                                    && let Some(inner_path) = path.into_inner().next()
442                                {
443                                    config.query_tokenizer = Some(inner_path.as_str().to_string());
444                                }
445                            }
446                            Rule::query_weighting_kwarg => {
447                                // query_weighting_kwarg = { "weighting" ~ ":" ~ weighting_spec }
448                                if let Some(w) = p.into_inner().next() {
449                                    config.query_weighting = Some(match w.as_str() {
450                                        "one" => QueryWeighting::One,
451                                        "idf" => QueryWeighting::Idf,
452                                        "idf_file" => QueryWeighting::IdfFile,
453                                        _ => QueryWeighting::One,
454                                    });
455                                }
456                            }
457                            Rule::query_weight_threshold_kwarg => {
458                                if let Some(t) = p.into_inner().next() {
459                                    config.query_weight_threshold =
460                                        Some(t.as_str().parse().unwrap_or_else(|_| {
461                                            log::warn!(
462                                                "Invalid query weight_threshold '{}', using 0.0",
463                                                t.as_str()
464                                            );
465                                            0.0
466                                        }));
467                                }
468                            }
469                            Rule::query_max_dims_kwarg => {
470                                if let Some(t) = p.into_inner().next() {
471                                    config.query_max_dims =
472                                        Some(t.as_str().parse().unwrap_or_else(|_| {
473                                            log::warn!(
474                                                "Invalid query max_dims '{}', using 0",
475                                                t.as_str()
476                                            );
477                                            0
478                                        }));
479                                }
480                            }
481                            Rule::query_pruning_kwarg => {
482                                if let Some(t) = p.into_inner().next() {
483                                    config.query_pruning =
484                                        Some(t.as_str().parse().unwrap_or_else(|_| {
485                                            log::warn!(
486                                                "Invalid query pruning '{}', using 1.0",
487                                                t.as_str()
488                                            );
489                                            1.0
490                                        }));
491                                }
492                            }
493                            _ => {}
494                        }
495                    }
496                }
497            }
498        }
499    }
500}
501
502/// Parse a field definition from pest pair
503fn parse_field_def(pair: pest::iterators::Pair<Rule>) -> Result<FieldDef> {
504    let mut inner = pair.into_inner();
505
506    let name = inner
507        .next()
508        .ok_or_else(|| Error::Schema("Missing field name".to_string()))?
509        .as_str()
510        .to_string();
511
512    let field_type_str = inner
513        .next()
514        .ok_or_else(|| Error::Schema("Missing field type".to_string()))?
515        .as_str();
516
517    let field_type = parse_field_type(field_type_str)?;
518
519    // Parse optional tokenizer spec, sparse_vector_config, dense_vector_config, and attributes
520    let mut tokenizer = None;
521    let mut sparse_vector_config = None;
522    let mut dense_vector_config = None;
523    let mut indexed = true;
524    let mut stored = true;
525    let mut multi = false;
526    let mut fast = false;
527    let mut primary = false;
528    let mut index_config: Option<IndexConfig> = None;
529
530    for item in inner {
531        match item.as_rule() {
532            Rule::tokenizer_spec => {
533                // Extract tokenizer name from <name>
534                if let Some(tok_name) = item.into_inner().next() {
535                    tokenizer = Some(tok_name.as_str().to_string());
536                }
537            }
538            Rule::sparse_vector_config => {
539                // Parse named parameters: <index_size: u16, quantization: uint8, weight_threshold: 0.1>
540                sparse_vector_config = Some(parse_sparse_vector_config(item));
541            }
542            Rule::dense_vector_config => {
543                // Parse dense_vector_params (keyword or positional) - only dims
544                dense_vector_config = Some(parse_dense_vector_config(item));
545            }
546            Rule::attributes => {
547                let (idx, sto, mul, fst, pri, idx_cfg) = parse_attributes(item);
548                indexed = idx;
549                stored = sto;
550                multi = mul;
551                fast = fst;
552                primary = pri;
553                index_config = idx_cfg;
554            }
555            _ => {}
556        }
557    }
558
559    // Primary key implies fast + indexed (needed for dedup lookups)
560    if primary {
561        fast = true;
562        indexed = true;
563    }
564
565    // Merge index config into vector configs if both exist
566    let mut positions = None;
567    if let Some(idx_cfg) = index_config {
568        positions = idx_cfg.positions;
569        if let Some(ref mut dv_config) = dense_vector_config {
570            apply_index_config_to_dense_vector(dv_config, idx_cfg);
571        } else if field_type == FieldType::SparseVector {
572            // For sparse vectors, create default config if not present and apply index params
573            let sv_config = sparse_vector_config.get_or_insert(SparseVectorConfig::default());
574            apply_index_config_to_sparse_vector(sv_config, idx_cfg);
575        }
576    }
577
578    Ok(FieldDef {
579        name,
580        field_type,
581        indexed,
582        stored,
583        tokenizer,
584        multi,
585        positions,
586        sparse_vector_config,
587        dense_vector_config,
588        fast,
589        primary,
590    })
591}
592
593/// Apply index configuration from indexed<...> to DenseVectorConfig
594fn apply_index_config_to_dense_vector(config: &mut DenseVectorConfig, idx_cfg: IndexConfig) {
595    // Apply index type if specified
596    if let Some(index_type) = idx_cfg.index_type {
597        config.index_type = index_type;
598    }
599
600    // Apply num_clusters for IVF-based indexes
601    if idx_cfg.num_clusters.is_some() {
602        config.num_clusters = idx_cfg.num_clusters;
603    }
604
605    // Apply nprobe if specified
606    if let Some(nprobe) = idx_cfg.nprobe {
607        config.nprobe = nprobe;
608    }
609
610    // Apply build_threshold if specified
611    if idx_cfg.build_threshold.is_some() {
612        config.build_threshold = idx_cfg.build_threshold;
613    }
614}
615
616/// Parse sparse_vector_config - only index_size (positional)
617/// Example: <u16> or <u32>
618fn parse_sparse_vector_config(pair: pest::iterators::Pair<Rule>) -> SparseVectorConfig {
619    let mut index_size = IndexSize::default();
620
621    // Parse positional index_size_spec
622    for inner in pair.into_inner() {
623        if inner.as_rule() == Rule::index_size_spec {
624            index_size = match inner.as_str() {
625                "u16" => IndexSize::U16,
626                "u32" => IndexSize::U32,
627                _ => IndexSize::default(),
628            };
629        }
630    }
631
632    SparseVectorConfig {
633        format: crate::structures::SparseFormat::Bmp,
634        index_size,
635        weight_quantization: WeightQuantization::default(),
636        weight_threshold: 0.0,
637        block_size: 128,
638        bmp_block_size: 64,
639        max_bmp_grid_bytes: 0,
640        bmp_superblock_size: 64,
641        pruning: None,
642        query_config: None,
643    }
644}
645
646/// Apply index configuration from indexed<...> to SparseVectorConfig
647fn apply_index_config_to_sparse_vector(config: &mut SparseVectorConfig, idx_cfg: IndexConfig) {
648    if let Some(q) = idx_cfg.quantization {
649        config.weight_quantization = q;
650    }
651    if let Some(t) = idx_cfg.weight_threshold {
652        config.weight_threshold = t;
653    }
654    if let Some(bs) = idx_cfg.block_size {
655        let adjusted = bs.next_power_of_two();
656        if adjusted != bs {
657            log::warn!(
658                "block_size {} adjusted to next power of two: {}",
659                bs,
660                adjusted
661            );
662        }
663        config.block_size = adjusted;
664    }
665    if let Some(p) = idx_cfg.pruning {
666        let clamped = p.clamp(0.0, 1.0);
667        if (clamped - p).abs() > f32::EPSILON {
668            log::warn!(
669                "pruning {} clamped to valid range [0.0, 1.0]: {}",
670                p,
671                clamped
672            );
673        }
674        config.pruning = Some(clamped);
675    }
676    // Apply query-time configuration if present
677    if idx_cfg.query_tokenizer.is_some()
678        || idx_cfg.query_weighting.is_some()
679        || idx_cfg.query_weight_threshold.is_some()
680        || idx_cfg.query_max_dims.is_some()
681        || idx_cfg.query_pruning.is_some()
682    {
683        let query_config = config
684            .query_config
685            .get_or_insert(SparseQueryConfig::default());
686        if let Some(tokenizer) = idx_cfg.query_tokenizer {
687            query_config.tokenizer = Some(tokenizer);
688        }
689        if let Some(weighting) = idx_cfg.query_weighting {
690            query_config.weighting = weighting;
691        }
692        if let Some(t) = idx_cfg.query_weight_threshold {
693            query_config.weight_threshold = t;
694        }
695        if let Some(d) = idx_cfg.query_max_dims {
696            query_config.max_query_dims = Some(d);
697        }
698        if let Some(p) = idx_cfg.query_pruning {
699            query_config.pruning = Some(p);
700        }
701    }
702}
703
704/// Parse dense_vector_config - dims and optional quantization type
705/// All index-related params are in indexed<...> attribute
706fn parse_dense_vector_config(pair: pest::iterators::Pair<Rule>) -> DenseVectorConfig {
707    let mut dim: usize = 0;
708    let mut quantization = DenseVectorQuantization::F32;
709
710    // Navigate to dense_vector_params
711    for params in pair.into_inner() {
712        if params.as_rule() == Rule::dense_vector_params {
713            for inner in params.into_inner() {
714                match inner.as_rule() {
715                    Rule::dense_vector_keyword_params => {
716                        for kwarg in inner.into_inner() {
717                            match kwarg.as_rule() {
718                                Rule::dims_kwarg => {
719                                    if let Some(d) = kwarg.into_inner().next() {
720                                        dim = d.as_str().parse().unwrap_or(0);
721                                    }
722                                }
723                                Rule::quant_type_spec => {
724                                    quantization = parse_quant_type(kwarg.as_str());
725                                }
726                                _ => {}
727                            }
728                        }
729                    }
730                    Rule::dense_vector_positional_params => {
731                        for item in inner.into_inner() {
732                            match item.as_rule() {
733                                Rule::dimension_spec => {
734                                    dim = item.as_str().parse().unwrap_or(0);
735                                }
736                                Rule::quant_type_spec => {
737                                    quantization = parse_quant_type(item.as_str());
738                                }
739                                _ => {}
740                            }
741                        }
742                    }
743                    _ => {}
744                }
745            }
746        }
747    }
748
749    DenseVectorConfig::new(dim).with_quantization(quantization)
750}
751
752fn parse_quant_type(s: &str) -> DenseVectorQuantization {
753    match s.trim() {
754        "f16" => DenseVectorQuantization::F16,
755        "uint8" | "u8" => DenseVectorQuantization::UInt8,
756        _ => DenseVectorQuantization::F32,
757    }
758}
759
760/// Parse default_fields definition
761fn parse_default_fields_def(pair: pest::iterators::Pair<Rule>) -> Vec<String> {
762    pair.into_inner().map(|p| p.as_str().to_string()).collect()
763}
764
765/// Parse a query router definition
766fn parse_query_router_def(pair: pest::iterators::Pair<Rule>) -> Result<QueryRouterRule> {
767    let mut pattern = String::new();
768    let mut substitution = String::new();
769    let mut target_field = String::new();
770    let mut mode = RoutingMode::Additional;
771
772    for prop in pair.into_inner() {
773        if prop.as_rule() != Rule::query_router_prop {
774            continue;
775        }
776
777        for inner in prop.into_inner() {
778            match inner.as_rule() {
779                Rule::query_router_pattern => {
780                    if let Some(regex_str) = inner.into_inner().next() {
781                        pattern = parse_string_value(regex_str);
782                    }
783                }
784                Rule::query_router_substitution => {
785                    if let Some(quoted) = inner.into_inner().next() {
786                        substitution = parse_string_value(quoted);
787                    }
788                }
789                Rule::query_router_target => {
790                    if let Some(ident) = inner.into_inner().next() {
791                        target_field = ident.as_str().to_string();
792                    }
793                }
794                Rule::query_router_mode => {
795                    if let Some(mode_val) = inner.into_inner().next() {
796                        mode = match mode_val.as_str() {
797                            "exclusive" => RoutingMode::Exclusive,
798                            "additional" => RoutingMode::Additional,
799                            _ => RoutingMode::Additional,
800                        };
801                    }
802                }
803                _ => {}
804            }
805        }
806    }
807
808    if pattern.is_empty() {
809        return Err(Error::Schema("query_router missing 'pattern'".to_string()));
810    }
811    if substitution.is_empty() {
812        return Err(Error::Schema(
813            "query_router missing 'substitution'".to_string(),
814        ));
815    }
816    if target_field.is_empty() {
817        return Err(Error::Schema(
818            "query_router missing 'target_field'".to_string(),
819        ));
820    }
821
822    Ok(QueryRouterRule {
823        pattern,
824        substitution,
825        target_field,
826        mode,
827    })
828}
829
830/// Parse a string value from quoted_string, raw_string, or regex_string
831fn parse_string_value(pair: pest::iterators::Pair<Rule>) -> String {
832    let s = pair.as_str();
833    match pair.as_rule() {
834        Rule::regex_string => {
835            // regex_string contains either raw_string or quoted_string
836            if let Some(inner) = pair.into_inner().next() {
837                parse_string_value(inner)
838            } else {
839                s.to_string()
840            }
841        }
842        Rule::raw_string => {
843            // r"..." - strip r" prefix and " suffix
844            s[2..s.len() - 1].to_string()
845        }
846        Rule::quoted_string => {
847            // "..." - strip quotes and handle escapes
848            let inner = &s[1..s.len() - 1];
849            // Simple escape handling
850            inner
851                .replace("\\n", "\n")
852                .replace("\\t", "\t")
853                .replace("\\\"", "\"")
854                .replace("\\\\", "\\")
855        }
856        _ => s.to_string(),
857    }
858}
859
860/// Parse an index definition from pest pair
861fn parse_index_def(pair: pest::iterators::Pair<Rule>) -> Result<IndexDef> {
862    let mut inner = pair.into_inner();
863
864    let name = inner
865        .next()
866        .ok_or_else(|| Error::Schema("Missing index name".to_string()))?
867        .as_str()
868        .to_string();
869
870    let mut fields = Vec::new();
871    let mut default_fields = Vec::new();
872    let mut query_routers = Vec::new();
873
874    for item in inner {
875        match item.as_rule() {
876            Rule::field_def => {
877                fields.push(parse_field_def(item)?);
878            }
879            Rule::default_fields_def => {
880                default_fields = parse_default_fields_def(item);
881            }
882            Rule::query_router_def => {
883                query_routers.push(parse_query_router_def(item)?);
884            }
885            _ => {}
886        }
887    }
888
889    // Validate primary key constraints
890    let primary_fields: Vec<&FieldDef> = fields.iter().filter(|f| f.primary).collect();
891    if primary_fields.len() > 1 {
892        return Err(Error::Schema(format!(
893            "Index '{}' has {} primary key fields, but at most one is allowed",
894            name,
895            primary_fields.len()
896        )));
897    }
898    if let Some(pk) = primary_fields.first() {
899        if pk.field_type != FieldType::Text {
900            return Err(Error::Schema(format!(
901                "Primary key field '{}' must be of type text, got {:?}",
902                pk.name, pk.field_type
903            )));
904        }
905        if pk.multi {
906            return Err(Error::Schema(format!(
907                "Primary key field '{}' cannot be multi-valued",
908                pk.name
909            )));
910        }
911    }
912
913    Ok(IndexDef {
914        name,
915        fields,
916        default_fields,
917        query_routers,
918    })
919}
920
921/// Parse SDL from a string
922pub fn parse_sdl(input: &str) -> Result<Vec<IndexDef>> {
923    let pairs = SdlParser::parse(Rule::file, input)
924        .map_err(|e| Error::Schema(format!("Parse error: {}", e)))?;
925
926    let mut indexes = Vec::new();
927
928    for pair in pairs {
929        if pair.as_rule() == Rule::file {
930            for inner in pair.into_inner() {
931                if inner.as_rule() == Rule::index_def {
932                    indexes.push(parse_index_def(inner)?);
933                }
934            }
935        }
936    }
937
938    Ok(indexes)
939}
940
941/// Parse SDL and return a single index definition
942pub fn parse_single_index(input: &str) -> Result<IndexDef> {
943    let indexes = parse_sdl(input)?;
944
945    if indexes.is_empty() {
946        return Err(Error::Schema("No index definition found".to_string()));
947    }
948
949    if indexes.len() > 1 {
950        return Err(Error::Schema(
951            "Multiple index definitions found, expected one".to_string(),
952        ));
953    }
954
955    Ok(indexes.into_iter().next().unwrap())
956}
957
958#[cfg(test)]
959mod tests {
960    use super::*;
961
962    #[test]
963    fn test_parse_simple_schema() {
964        let sdl = r#"
965            index articles {
966                field title: text [indexed, stored]
967                field body: text [indexed]
968            }
969        "#;
970
971        let indexes = parse_sdl(sdl).unwrap();
972        assert_eq!(indexes.len(), 1);
973
974        let index = &indexes[0];
975        assert_eq!(index.name, "articles");
976        assert_eq!(index.fields.len(), 2);
977
978        assert_eq!(index.fields[0].name, "title");
979        assert!(matches!(index.fields[0].field_type, FieldType::Text));
980        assert!(index.fields[0].indexed);
981        assert!(index.fields[0].stored);
982
983        assert_eq!(index.fields[1].name, "body");
984        assert!(matches!(index.fields[1].field_type, FieldType::Text));
985        assert!(index.fields[1].indexed);
986        assert!(!index.fields[1].stored);
987    }
988
989    #[test]
990    fn test_parse_all_field_types() {
991        let sdl = r#"
992            index test {
993                field text_field: text [indexed, stored]
994                field u64_field: u64 [indexed, stored]
995                field i64_field: i64 [indexed, stored]
996                field f64_field: f64 [indexed, stored]
997                field bytes_field: bytes [stored]
998            }
999        "#;
1000
1001        let indexes = parse_sdl(sdl).unwrap();
1002        let index = &indexes[0];
1003
1004        assert!(matches!(index.fields[0].field_type, FieldType::Text));
1005        assert!(matches!(index.fields[1].field_type, FieldType::U64));
1006        assert!(matches!(index.fields[2].field_type, FieldType::I64));
1007        assert!(matches!(index.fields[3].field_type, FieldType::F64));
1008        assert!(matches!(index.fields[4].field_type, FieldType::Bytes));
1009    }
1010
1011    #[test]
1012    fn test_parse_with_comments() {
1013        let sdl = r#"
1014            # This is a comment
1015            index articles {
1016                # Title field
1017                field title: text [indexed, stored]
1018                field body: text [indexed] # inline comment not supported yet
1019            }
1020        "#;
1021
1022        let indexes = parse_sdl(sdl).unwrap();
1023        assert_eq!(indexes[0].fields.len(), 2);
1024    }
1025
1026    #[test]
1027    fn test_parse_type_aliases() {
1028        let sdl = r#"
1029            index test {
1030                field a: string [indexed]
1031                field b: int [indexed]
1032                field c: uint [indexed]
1033                field d: float [indexed]
1034                field e: binary [stored]
1035            }
1036        "#;
1037
1038        let indexes = parse_sdl(sdl).unwrap();
1039        let index = &indexes[0];
1040
1041        assert!(matches!(index.fields[0].field_type, FieldType::Text));
1042        assert!(matches!(index.fields[1].field_type, FieldType::I64));
1043        assert!(matches!(index.fields[2].field_type, FieldType::U64));
1044        assert!(matches!(index.fields[3].field_type, FieldType::F64));
1045        assert!(matches!(index.fields[4].field_type, FieldType::Bytes));
1046    }
1047
1048    #[test]
1049    fn test_to_schema() {
1050        let sdl = r#"
1051            index articles {
1052                field title: text [indexed, stored]
1053                field views: u64 [indexed, stored]
1054            }
1055        "#;
1056
1057        let indexes = parse_sdl(sdl).unwrap();
1058        let schema = indexes[0].to_schema();
1059
1060        assert!(schema.get_field("title").is_some());
1061        assert!(schema.get_field("views").is_some());
1062        assert!(schema.get_field("nonexistent").is_none());
1063    }
1064
1065    #[test]
1066    fn test_default_attributes() {
1067        let sdl = r#"
1068            index test {
1069                field title: text
1070            }
1071        "#;
1072
1073        let indexes = parse_sdl(sdl).unwrap();
1074        let field = &indexes[0].fields[0];
1075
1076        // Default should be indexed and stored
1077        assert!(field.indexed);
1078        assert!(field.stored);
1079    }
1080
1081    #[test]
1082    fn test_multiple_indexes() {
1083        let sdl = r#"
1084            index articles {
1085                field title: text [indexed, stored]
1086            }
1087
1088            index users {
1089                field name: text [indexed, stored]
1090                field email: text [indexed, stored]
1091            }
1092        "#;
1093
1094        let indexes = parse_sdl(sdl).unwrap();
1095        assert_eq!(indexes.len(), 2);
1096        assert_eq!(indexes[0].name, "articles");
1097        assert_eq!(indexes[1].name, "users");
1098    }
1099
1100    #[test]
1101    fn test_tokenizer_spec() {
1102        let sdl = r#"
1103            index articles {
1104                field title: text<en_stem> [indexed, stored]
1105                field body: text<simple> [indexed]
1106                field author: text [indexed, stored]
1107            }
1108        "#;
1109
1110        let indexes = parse_sdl(sdl).unwrap();
1111        let index = &indexes[0];
1112
1113        assert_eq!(index.fields[0].name, "title");
1114        assert_eq!(index.fields[0].tokenizer, Some("en_stem".to_string()));
1115
1116        assert_eq!(index.fields[1].name, "body");
1117        assert_eq!(index.fields[1].tokenizer, Some("simple".to_string()));
1118
1119        assert_eq!(index.fields[2].name, "author");
1120        assert_eq!(index.fields[2].tokenizer, None); // No tokenizer specified
1121    }
1122
1123    #[test]
1124    fn test_tokenizer_in_schema() {
1125        let sdl = r#"
1126            index articles {
1127                field title: text<german> [indexed, stored]
1128                field body: text<en_stem> [indexed]
1129            }
1130        "#;
1131
1132        let indexes = parse_sdl(sdl).unwrap();
1133        let schema = indexes[0].to_schema();
1134
1135        let title_field = schema.get_field("title").unwrap();
1136        let title_entry = schema.get_field_entry(title_field).unwrap();
1137        assert_eq!(title_entry.tokenizer, Some("german".to_string()));
1138
1139        let body_field = schema.get_field("body").unwrap();
1140        let body_entry = schema.get_field_entry(body_field).unwrap();
1141        assert_eq!(body_entry.tokenizer, Some("en_stem".to_string()));
1142    }
1143
1144    #[test]
1145    fn test_query_router_basic() {
1146        let sdl = r#"
1147            index documents {
1148                field title: text [indexed, stored]
1149                field uri: text [indexed, stored]
1150
1151                query_router {
1152                    pattern: "10\\.\\d{4,}/[^\\s]+"
1153                    substitution: "doi://{0}"
1154                    target_field: uris
1155                    mode: exclusive
1156                }
1157            }
1158        "#;
1159
1160        let indexes = parse_sdl(sdl).unwrap();
1161        let index = &indexes[0];
1162
1163        assert_eq!(index.query_routers.len(), 1);
1164        let router = &index.query_routers[0];
1165        assert_eq!(router.pattern, r"10\.\d{4,}/[^\s]+");
1166        assert_eq!(router.substitution, "doi://{0}");
1167        assert_eq!(router.target_field, "uris");
1168        assert_eq!(router.mode, RoutingMode::Exclusive);
1169    }
1170
1171    #[test]
1172    fn test_query_router_raw_string() {
1173        let sdl = r#"
1174            index documents {
1175                field uris: text [indexed, stored]
1176
1177                query_router {
1178                    pattern: r"^pmid:(\d+)$"
1179                    substitution: "pubmed://{1}"
1180                    target_field: uris
1181                    mode: additional
1182                }
1183            }
1184        "#;
1185
1186        let indexes = parse_sdl(sdl).unwrap();
1187        let router = &indexes[0].query_routers[0];
1188
1189        assert_eq!(router.pattern, r"^pmid:(\d+)$");
1190        assert_eq!(router.substitution, "pubmed://{1}");
1191        assert_eq!(router.mode, RoutingMode::Additional);
1192    }
1193
1194    #[test]
1195    fn test_multiple_query_routers() {
1196        let sdl = r#"
1197            index documents {
1198                field uris: text [indexed, stored]
1199
1200                query_router {
1201                    pattern: r"^doi:(10\.\d{4,}/[^\s]+)$"
1202                    substitution: "doi://{1}"
1203                    target_field: uris
1204                    mode: exclusive
1205                }
1206
1207                query_router {
1208                    pattern: r"^pmid:(\d+)$"
1209                    substitution: "pubmed://{1}"
1210                    target_field: uris
1211                    mode: exclusive
1212                }
1213
1214                query_router {
1215                    pattern: r"^arxiv:(\d+\.\d+)$"
1216                    substitution: "arxiv://{1}"
1217                    target_field: uris
1218                    mode: additional
1219                }
1220            }
1221        "#;
1222
1223        let indexes = parse_sdl(sdl).unwrap();
1224        assert_eq!(indexes[0].query_routers.len(), 3);
1225    }
1226
1227    #[test]
1228    fn test_query_router_default_mode() {
1229        let sdl = r#"
1230            index documents {
1231                field uris: text [indexed, stored]
1232
1233                query_router {
1234                    pattern: r"test"
1235                    substitution: "{0}"
1236                    target_field: uris
1237                }
1238            }
1239        "#;
1240
1241        let indexes = parse_sdl(sdl).unwrap();
1242        // Default mode should be Additional
1243        assert_eq!(indexes[0].query_routers[0].mode, RoutingMode::Additional);
1244    }
1245
1246    #[test]
1247    fn test_multi_attribute() {
1248        let sdl = r#"
1249            index documents {
1250                field uris: text [indexed, stored<multi>]
1251                field title: text [indexed, stored]
1252            }
1253        "#;
1254
1255        let indexes = parse_sdl(sdl).unwrap();
1256        assert_eq!(indexes.len(), 1);
1257
1258        let fields = &indexes[0].fields;
1259        assert_eq!(fields.len(), 2);
1260
1261        // uris should have multi=true
1262        assert_eq!(fields[0].name, "uris");
1263        assert!(fields[0].multi, "uris field should have multi=true");
1264
1265        // title should have multi=false
1266        assert_eq!(fields[1].name, "title");
1267        assert!(!fields[1].multi, "title field should have multi=false");
1268
1269        // Verify schema conversion preserves multi attribute
1270        let schema = indexes[0].to_schema();
1271        let uris_field = schema.get_field("uris").unwrap();
1272        let title_field = schema.get_field("title").unwrap();
1273
1274        assert!(schema.get_field_entry(uris_field).unwrap().multi);
1275        assert!(!schema.get_field_entry(title_field).unwrap().multi);
1276    }
1277
1278    #[test]
1279    fn test_sparse_vector_field() {
1280        let sdl = r#"
1281            index documents {
1282                field embedding: sparse_vector [indexed, stored]
1283            }
1284        "#;
1285
1286        let indexes = parse_sdl(sdl).unwrap();
1287        assert_eq!(indexes.len(), 1);
1288        assert_eq!(indexes[0].fields.len(), 1);
1289        assert_eq!(indexes[0].fields[0].name, "embedding");
1290        assert_eq!(indexes[0].fields[0].field_type, FieldType::SparseVector);
1291        assert!(indexes[0].fields[0].sparse_vector_config.is_none());
1292    }
1293
1294    #[test]
1295    fn test_sparse_vector_with_config() {
1296        let sdl = r#"
1297            index documents {
1298                field embedding: sparse_vector<u16> [indexed<quantization: uint8>, stored]
1299                field dense: sparse_vector<u32> [indexed<quantization: float32>]
1300            }
1301        "#;
1302
1303        let indexes = parse_sdl(sdl).unwrap();
1304        assert_eq!(indexes[0].fields.len(), 2);
1305
1306        // First field: u16 indices, uint8 quantization
1307        let f1 = &indexes[0].fields[0];
1308        assert_eq!(f1.name, "embedding");
1309        let config1 = f1.sparse_vector_config.as_ref().unwrap();
1310        assert_eq!(config1.index_size, IndexSize::U16);
1311        assert_eq!(config1.weight_quantization, WeightQuantization::UInt8);
1312
1313        // Second field: u32 indices, float32 quantization
1314        let f2 = &indexes[0].fields[1];
1315        assert_eq!(f2.name, "dense");
1316        let config2 = f2.sparse_vector_config.as_ref().unwrap();
1317        assert_eq!(config2.index_size, IndexSize::U32);
1318        assert_eq!(config2.weight_quantization, WeightQuantization::Float32);
1319    }
1320
1321    #[test]
1322    fn test_sparse_vector_with_weight_threshold() {
1323        let sdl = r#"
1324            index documents {
1325                field embedding: sparse_vector<u16> [indexed<quantization: uint8, weight_threshold: 0.1>, stored]
1326                field embedding2: sparse_vector<u32> [indexed<quantization: float16, weight_threshold: 0.05>]
1327            }
1328        "#;
1329
1330        let indexes = parse_sdl(sdl).unwrap();
1331        assert_eq!(indexes[0].fields.len(), 2);
1332
1333        // First field: u16 indices, uint8 quantization, threshold 0.1
1334        let f1 = &indexes[0].fields[0];
1335        assert_eq!(f1.name, "embedding");
1336        let config1 = f1.sparse_vector_config.as_ref().unwrap();
1337        assert_eq!(config1.index_size, IndexSize::U16);
1338        assert_eq!(config1.weight_quantization, WeightQuantization::UInt8);
1339        assert!((config1.weight_threshold - 0.1).abs() < 0.001);
1340
1341        // Second field: u32 indices, float16 quantization, threshold 0.05
1342        let f2 = &indexes[0].fields[1];
1343        assert_eq!(f2.name, "embedding2");
1344        let config2 = f2.sparse_vector_config.as_ref().unwrap();
1345        assert_eq!(config2.index_size, IndexSize::U32);
1346        assert_eq!(config2.weight_quantization, WeightQuantization::Float16);
1347        assert!((config2.weight_threshold - 0.05).abs() < 0.001);
1348    }
1349
1350    #[test]
1351    fn test_sparse_vector_with_pruning() {
1352        let sdl = r#"
1353            index documents {
1354                field embedding: sparse_vector [indexed<quantization: uint8, pruning: 0.1>, stored]
1355            }
1356        "#;
1357
1358        let indexes = parse_sdl(sdl).unwrap();
1359        let f = &indexes[0].fields[0];
1360        assert_eq!(f.name, "embedding");
1361        let config = f.sparse_vector_config.as_ref().unwrap();
1362        assert_eq!(config.weight_quantization, WeightQuantization::UInt8);
1363        assert_eq!(config.pruning, Some(0.1));
1364    }
1365
1366    #[test]
1367    fn test_dense_vector_field() {
1368        let sdl = r#"
1369            index documents {
1370                field embedding: dense_vector<768> [indexed, stored]
1371            }
1372        "#;
1373
1374        let indexes = parse_sdl(sdl).unwrap();
1375        assert_eq!(indexes.len(), 1);
1376        assert_eq!(indexes[0].fields.len(), 1);
1377
1378        let f = &indexes[0].fields[0];
1379        assert_eq!(f.name, "embedding");
1380        assert_eq!(f.field_type, FieldType::DenseVector);
1381
1382        let config = f.dense_vector_config.as_ref().unwrap();
1383        assert_eq!(config.dim, 768);
1384    }
1385
1386    #[test]
1387    fn test_dense_vector_alias() {
1388        let sdl = r#"
1389            index documents {
1390                field embedding: vector<1536> [indexed]
1391            }
1392        "#;
1393
1394        let indexes = parse_sdl(sdl).unwrap();
1395        assert_eq!(indexes[0].fields[0].field_type, FieldType::DenseVector);
1396        assert_eq!(
1397            indexes[0].fields[0]
1398                .dense_vector_config
1399                .as_ref()
1400                .unwrap()
1401                .dim,
1402            1536
1403        );
1404    }
1405
1406    #[test]
1407    fn test_dense_vector_with_num_clusters() {
1408        let sdl = r#"
1409            index documents {
1410                field embedding: dense_vector<768> [indexed<ivf_rabitq, num_clusters: 256>, stored]
1411            }
1412        "#;
1413
1414        let indexes = parse_sdl(sdl).unwrap();
1415        assert_eq!(indexes.len(), 1);
1416
1417        let f = &indexes[0].fields[0];
1418        assert_eq!(f.name, "embedding");
1419        assert_eq!(f.field_type, FieldType::DenseVector);
1420
1421        let config = f.dense_vector_config.as_ref().unwrap();
1422        assert_eq!(config.dim, 768);
1423        assert_eq!(config.num_clusters, Some(256));
1424        assert_eq!(config.nprobe, 32); // default
1425    }
1426
1427    #[test]
1428    fn test_dense_vector_with_num_clusters_and_nprobe() {
1429        let sdl = r#"
1430            index documents {
1431                field embedding: dense_vector<1536> [indexed<ivf_rabitq, num_clusters: 512, nprobe: 64>]
1432            }
1433        "#;
1434
1435        let indexes = parse_sdl(sdl).unwrap();
1436        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1437
1438        assert_eq!(config.dim, 1536);
1439        assert_eq!(config.num_clusters, Some(512));
1440        assert_eq!(config.nprobe, 64);
1441    }
1442
1443    #[test]
1444    fn test_dense_vector_keyword_syntax() {
1445        let sdl = r#"
1446            index documents {
1447                field embedding: dense_vector<dims: 1536> [indexed, stored]
1448            }
1449        "#;
1450
1451        let indexes = parse_sdl(sdl).unwrap();
1452        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1453
1454        assert_eq!(config.dim, 1536);
1455        assert!(config.num_clusters.is_none());
1456    }
1457
1458    #[test]
1459    fn test_dense_vector_keyword_syntax_full() {
1460        let sdl = r#"
1461            index documents {
1462                field embedding: dense_vector<dims: 1536> [indexed<ivf_rabitq, num_clusters: 256, nprobe: 64>]
1463            }
1464        "#;
1465
1466        let indexes = parse_sdl(sdl).unwrap();
1467        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1468
1469        assert_eq!(config.dim, 1536);
1470        assert_eq!(config.num_clusters, Some(256));
1471        assert_eq!(config.nprobe, 64);
1472    }
1473
1474    #[test]
1475    fn test_dense_vector_keyword_syntax_partial() {
1476        let sdl = r#"
1477            index documents {
1478                field embedding: dense_vector<dims: 768> [indexed<ivf_rabitq, num_clusters: 128>]
1479            }
1480        "#;
1481
1482        let indexes = parse_sdl(sdl).unwrap();
1483        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1484
1485        assert_eq!(config.dim, 768);
1486        assert_eq!(config.num_clusters, Some(128));
1487        assert_eq!(config.nprobe, 32); // default
1488    }
1489
1490    #[test]
1491    fn test_dense_vector_scann_index() {
1492        use crate::dsl::schema::VectorIndexType;
1493
1494        let sdl = r#"
1495            index documents {
1496                field embedding: dense_vector<dims: 768> [indexed<scann, num_clusters: 256, nprobe: 64>]
1497            }
1498        "#;
1499
1500        let indexes = parse_sdl(sdl).unwrap();
1501        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1502
1503        assert_eq!(config.dim, 768);
1504        assert_eq!(config.index_type, VectorIndexType::ScaNN);
1505        assert_eq!(config.num_clusters, Some(256));
1506        assert_eq!(config.nprobe, 64);
1507    }
1508
1509    #[test]
1510    fn test_dense_vector_ivf_rabitq_index() {
1511        use crate::dsl::schema::VectorIndexType;
1512
1513        let sdl = r#"
1514            index documents {
1515                field embedding: dense_vector<dims: 1536> [indexed<ivf_rabitq, num_clusters: 512>]
1516            }
1517        "#;
1518
1519        let indexes = parse_sdl(sdl).unwrap();
1520        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1521
1522        assert_eq!(config.dim, 1536);
1523        assert_eq!(config.index_type, VectorIndexType::IvfRaBitQ);
1524        assert_eq!(config.num_clusters, Some(512));
1525    }
1526
1527    #[test]
1528    fn test_dense_vector_rabitq_no_clusters() {
1529        use crate::dsl::schema::VectorIndexType;
1530
1531        let sdl = r#"
1532            index documents {
1533                field embedding: dense_vector<dims: 768> [indexed<rabitq>]
1534            }
1535        "#;
1536
1537        let indexes = parse_sdl(sdl).unwrap();
1538        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1539
1540        assert_eq!(config.dim, 768);
1541        assert_eq!(config.index_type, VectorIndexType::RaBitQ);
1542        assert!(config.num_clusters.is_none());
1543    }
1544
1545    #[test]
1546    fn test_dense_vector_flat_index() {
1547        use crate::dsl::schema::VectorIndexType;
1548
1549        let sdl = r#"
1550            index documents {
1551                field embedding: dense_vector<dims: 768> [indexed<flat>]
1552            }
1553        "#;
1554
1555        let indexes = parse_sdl(sdl).unwrap();
1556        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1557
1558        assert_eq!(config.dim, 768);
1559        assert_eq!(config.index_type, VectorIndexType::Flat);
1560    }
1561
1562    #[test]
1563    fn test_dense_vector_default_index_type() {
1564        use crate::dsl::schema::VectorIndexType;
1565
1566        // When no index type specified, should default to RaBitQ (basic)
1567        let sdl = r#"
1568            index documents {
1569                field embedding: dense_vector<dims: 768> [indexed]
1570            }
1571        "#;
1572
1573        let indexes = parse_sdl(sdl).unwrap();
1574        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1575
1576        assert_eq!(config.dim, 768);
1577        assert_eq!(config.index_type, VectorIndexType::RaBitQ);
1578    }
1579
1580    #[test]
1581    fn test_dense_vector_f16_quantization() {
1582        use crate::dsl::schema::{DenseVectorQuantization, VectorIndexType};
1583
1584        let sdl = r#"
1585            index documents {
1586                field embedding: dense_vector<768, f16> [indexed]
1587            }
1588        "#;
1589
1590        let indexes = parse_sdl(sdl).unwrap();
1591        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1592
1593        assert_eq!(config.dim, 768);
1594        assert_eq!(config.quantization, DenseVectorQuantization::F16);
1595        assert_eq!(config.index_type, VectorIndexType::RaBitQ);
1596    }
1597
1598    #[test]
1599    fn test_dense_vector_uint8_quantization() {
1600        use crate::dsl::schema::DenseVectorQuantization;
1601
1602        let sdl = r#"
1603            index documents {
1604                field embedding: dense_vector<1024, uint8> [indexed<ivf_rabitq>]
1605            }
1606        "#;
1607
1608        let indexes = parse_sdl(sdl).unwrap();
1609        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1610
1611        assert_eq!(config.dim, 1024);
1612        assert_eq!(config.quantization, DenseVectorQuantization::UInt8);
1613    }
1614
1615    #[test]
1616    fn test_dense_vector_u8_alias() {
1617        use crate::dsl::schema::DenseVectorQuantization;
1618
1619        let sdl = r#"
1620            index documents {
1621                field embedding: dense_vector<512, u8> [indexed]
1622            }
1623        "#;
1624
1625        let indexes = parse_sdl(sdl).unwrap();
1626        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1627
1628        assert_eq!(config.dim, 512);
1629        assert_eq!(config.quantization, DenseVectorQuantization::UInt8);
1630    }
1631
1632    #[test]
1633    fn test_dense_vector_default_f32_quantization() {
1634        use crate::dsl::schema::DenseVectorQuantization;
1635
1636        // No quantization type → default f32
1637        let sdl = r#"
1638            index documents {
1639                field embedding: dense_vector<768> [indexed]
1640            }
1641        "#;
1642
1643        let indexes = parse_sdl(sdl).unwrap();
1644        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1645
1646        assert_eq!(config.dim, 768);
1647        assert_eq!(config.quantization, DenseVectorQuantization::F32);
1648    }
1649
1650    #[test]
1651    fn test_dense_vector_keyword_with_quantization() {
1652        use crate::dsl::schema::DenseVectorQuantization;
1653
1654        let sdl = r#"
1655            index documents {
1656                field embedding: dense_vector<dims: 768, f16> [indexed]
1657            }
1658        "#;
1659
1660        let indexes = parse_sdl(sdl).unwrap();
1661        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1662
1663        assert_eq!(config.dim, 768);
1664        assert_eq!(config.quantization, DenseVectorQuantization::F16);
1665    }
1666
/// JSON fields parse with `FieldType::Json`, with or without an attribute
/// list, and the converted schema entry is stored but not indexed.
#[test]
fn test_json_field_type() {
    let sdl = r#"
            index documents {
                field title: text [indexed, stored]
                field metadata: json [stored]
                field extra: json
            }
        "#;

    let parsed = parse_sdl(sdl).unwrap();
    let index = &parsed[0];
    assert_eq!(index.fields.len(), 3);

    // JSON field with an explicit `[stored]` attribute.
    let metadata = &index.fields[1];
    assert_eq!(metadata.name, "metadata");
    assert!(matches!(metadata.field_type, FieldType::Json));
    assert!(metadata.stored);

    // JSON field declared without any attribute list: name and type only.
    let extra = &index.fields[2];
    assert_eq!(extra.name, "extra");
    assert!(matches!(extra.field_type, FieldType::Json));

    // After schema conversion the entry keeps its type and `stored` flag,
    // and is reported as not indexed.
    let schema = index.to_schema();
    let metadata_id = schema.get_field("metadata").unwrap();
    let metadata_entry = schema.get_field_entry(metadata_id).unwrap();
    assert_eq!(metadata_entry.field_type, FieldType::Json);
    assert!(!metadata_entry.indexed);
    assert!(metadata_entry.stored);
}
1700
/// A `query<...>` block inside `indexed<...>` on a sparse vector field is
/// parsed into the field's query config and survives schema conversion.
#[test]
fn test_sparse_vector_query_config() {
    use crate::structures::QueryWeighting;

    let sdl = r#"
            index documents {
                field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<tokenizer: "Alibaba-NLP/gte-Qwen2-1.5B-instruct", weighting: idf>>]
            }
        "#;
    // Tokenizer expected back from both the parsed field and the schema entry.
    let expected_tokenizer = Some("Alibaba-NLP/gte-Qwen2-1.5B-instruct");

    let parsed = parse_sdl(sdl).unwrap();
    let index = &parsed[0];
    assert_eq!(index.fields.len(), 1);

    let field = &index.fields[0];
    assert_eq!(field.name, "embedding");
    assert!(matches!(field.field_type, FieldType::SparseVector));

    let sparse = field.sparse_vector_config.as_ref().unwrap();
    assert_eq!(sparse.index_size, IndexSize::U16);
    assert_eq!(sparse.weight_quantization, WeightQuantization::UInt8);

    // Query-time settings as parsed.
    let parsed_query = sparse.query_config.as_ref().unwrap();
    assert_eq!(parsed_query.tokenizer.as_deref(), expected_tokenizer);
    assert_eq!(parsed_query.weighting, QueryWeighting::Idf);

    // The same settings must be visible on the converted schema entry.
    let schema = index.to_schema();
    let field_id = schema.get_field("embedding").unwrap();
    let entry = schema.get_field_entry(field_id).unwrap();
    let schema_sparse = entry.sparse_vector_config.as_ref().unwrap();
    let schema_query = schema_sparse.query_config.as_ref().unwrap();
    assert_eq!(schema_query.tokenizer.as_deref(), expected_tokenizer);
    assert_eq!(schema_query.weighting, QueryWeighting::Idf);
}
1745
/// `query<weighting: one>` without a tokenizer parses to `QueryWeighting::One`
/// and leaves the tokenizer unset.
#[test]
fn test_sparse_vector_query_config_weighting_one() {
    use crate::structures::QueryWeighting;

    let sdl = r#"
            index documents {
                field embedding: sparse_vector [indexed<query<weighting: one>>]
            }
        "#;

    let parsed = parse_sdl(sdl).unwrap();
    let embedding = &parsed[0].fields[0];
    let sparse = embedding.sparse_vector_config.as_ref().unwrap();
    let query = sparse.query_config.as_ref().unwrap();

    assert!(query.tokenizer.is_none());
    assert_eq!(query.weighting, QueryWeighting::One);
}
1763
/// `weighting: idf_file` parses to `QueryWeighting::IdfFile` and is still
/// present after converting the index definition to a schema.
#[test]
fn test_sparse_vector_query_config_weighting_idf_file() {
    use crate::structures::QueryWeighting;

    let sdl = r#"
            index documents {
                field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<tokenizer: "opensearch-neural-sparse-encoding-v1", weighting: idf_file>>]
            }
        "#;

    let parsed = parse_sdl(sdl).unwrap();
    let index = &parsed[0];

    // Parsed field definition.
    let sparse = index.fields[0].sparse_vector_config.as_ref().unwrap();
    let parsed_query = sparse.query_config.as_ref().unwrap();
    assert_eq!(
        parsed_query.tokenizer.as_deref(),
        Some("opensearch-neural-sparse-encoding-v1")
    );
    assert_eq!(parsed_query.weighting, QueryWeighting::IdfFile);

    // Converted schema entry.
    let schema = index.to_schema();
    let field_id = schema.get_field("embedding").unwrap();
    let entry = schema.get_field_entry(field_id).unwrap();
    let schema_sparse = entry.sparse_vector_config.as_ref().unwrap();
    let schema_query = schema_sparse.query_config.as_ref().unwrap();
    assert_eq!(schema_query.weighting, QueryWeighting::IdfFile);
}
1792
/// `weight_threshold`, `max_dims` and `pruning` inside `query<...>` are parsed
/// into the query config and kept by the schema conversion.
#[test]
fn test_sparse_vector_query_config_pruning_params() {
    let sdl = r#"
            index documents {
                field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<weighting: idf, weight_threshold: 0.03, max_dims: 25, pruning: 0.2>>]
            }
        "#;

    let parsed = parse_sdl(sdl).unwrap();
    let index = &parsed[0];

    // Values on the parsed field; floats are compared with a small tolerance.
    let sparse = index.fields[0].sparse_vector_config.as_ref().unwrap();
    let parsed_query = sparse.query_config.as_ref().unwrap();
    assert_eq!(parsed_query.weighting, QueryWeighting::Idf);
    let threshold_delta = (parsed_query.weight_threshold - 0.03).abs();
    assert!(threshold_delta < 0.001);
    assert_eq!(parsed_query.max_query_dims, Some(25));
    let pruning_delta = (parsed_query.pruning.unwrap() - 0.2).abs();
    assert!(pruning_delta < 0.001);

    // Same values on the schema entry produced from the definition.
    let schema = index.to_schema();
    let field_id = schema.get_field("embedding").unwrap();
    let entry = schema.get_field_entry(field_id).unwrap();
    let schema_sparse = entry.sparse_vector_config.as_ref().unwrap();
    let schema_query = schema_sparse.query_config.as_ref().unwrap();
    let schema_threshold_delta = (schema_query.weight_threshold - 0.03).abs();
    assert!(schema_threshold_delta < 0.001);
    assert_eq!(schema_query.max_query_dims, Some(25));
    let schema_pruning_delta = (schema_query.pruning.unwrap() - 0.2).abs();
    assert!(schema_pruning_delta < 0.001);
}
1820
/// The `fast` attribute is recorded per field for numeric and text types and
/// is carried over to the schema entries.
#[test]
fn test_fast_attribute() {
    let sdl = r#"
            index products {
                field name: text [indexed, stored]
                field price: f64 [indexed, fast]
                field category: text [indexed, stored, fast]
                field count: u64 [fast]
                field score: i64 [indexed, stored, fast]
            }
        "#;

    let parsed = parse_sdl(sdl).unwrap();
    assert_eq!(parsed.len(), 1);
    let index = &parsed[0];
    assert_eq!(index.fields.len(), 5);

    // Declaration order: name, price, category, count, score.
    // Only `name` lacks the `fast` attribute.
    let fast_flags: Vec<bool> = index.fields.iter().map(|field| field.fast).collect();
    assert_eq!(fast_flags, [false, true, true, true, true]);

    // Types of the four fast fields.
    assert!(matches!(index.fields[1].field_type, FieldType::F64));
    assert!(matches!(index.fields[2].field_type, FieldType::Text));
    assert!(matches!(index.fields[3].field_type, FieldType::U64));
    assert!(matches!(index.fields[4].field_type, FieldType::I64));

    // The schema built from the definition must report the same flags.
    let schema = index.to_schema();
    for &(field_name, expect_fast) in &[("price", true), ("category", true), ("name", false)] {
        let field_id = schema.get_field(field_name).unwrap();
        assert_eq!(schema.get_field_entry(field_id).unwrap().fast, expect_fast);
    }
}
1864
/// A `[primary, stored]` text field is marked primary with `fast` and
/// `indexed` set, and the schema exposes it through `primary_field()`.
#[test]
fn test_primary_attribute() {
    let sdl = r#"
            index documents {
                field id: text [primary, stored]
                field title: text [indexed, stored]
            }
        "#;

    let parsed = parse_sdl(sdl).unwrap();
    assert_eq!(parsed.len(), 1);
    let index = &parsed[0];
    assert_eq!(index.fields.len(), 2);

    let (id_def, title_def) = (&index.fields[0], &index.fields[1]);

    // `id` is declared primary; `fast` and `indexed` were not written in the
    // SDL but are expected to be set anyway.
    assert!(id_def.primary, "id should be primary");
    assert!(id_def.fast, "primary implies fast");
    assert!(id_def.indexed, "primary implies indexed");

    // `title` carries no primary attribute.
    assert!(!title_def.primary);

    // The converted schema keeps the primary-key marking and implied flags.
    let schema = index.to_schema();
    let id = schema.get_field("id").unwrap();
    let id_entry = schema.get_field_entry(id).unwrap();
    assert!(id_entry.primary_key);
    assert!(id_entry.fast);
    assert!(id_entry.indexed);

    let title = schema.get_field("title").unwrap();
    let title_entry = schema.get_field_entry(title).unwrap();
    assert!(!title_entry.primary_key);

    // The schema-level lookup resolves to the `id` field.
    assert_eq!(schema.primary_field(), Some(id));
}
1902
/// `primary` can be combined with `indexed`, `stored` and an explicit
/// tokenizer on the same field.
#[test]
fn test_primary_with_other_attributes() {
    let sdl = r#"
            index documents {
                field id: text<simple> [primary, indexed, stored]
                field body: text [indexed]
            }
        "#;

    let parsed = parse_sdl(sdl).unwrap();
    let id_def = &parsed[0].fields[0];

    assert!(id_def.primary);
    assert!(id_def.indexed);
    assert!(id_def.stored);
    assert!(id_def.fast);
    // Tokenizer comes from the `text<simple>` type parameter.
    assert_eq!(id_def.tokenizer.as_deref(), Some("simple"));
}
1920
/// Declaring two `primary` fields in one index must be rejected with an error
/// that mentions "primary key".
#[test]
fn test_primary_only_one_allowed() {
    let sdl = r#"
            index documents {
                field id: text [primary]
                field alt_id: text [primary]
            }
        "#;

    let outcome = parse_sdl(sdl);
    assert!(outcome.is_err());

    let message = outcome.unwrap_err().to_string();
    assert!(
        message.contains("primary key"),
        "Error should mention primary key: {}",
        message
    );
}
1939
/// A `primary` attribute on a non-text field (`u64` here) must be rejected
/// with an error that mentions "text".
#[test]
fn test_primary_must_be_text() {
    let sdl = r#"
            index documents {
                field id: u64 [primary]
            }
        "#;

    let outcome = parse_sdl(sdl);
    assert!(outcome.is_err());

    let message = outcome.unwrap_err().to_string();
    assert!(
        message.contains("text"),
        "Error should mention text type: {}",
        message
    );
}
1957
/// Combining `primary` with `stored<multi>` must be rejected with an error
/// that mentions "multi".
#[test]
fn test_primary_cannot_be_multi() {
    let sdl = r#"
            index documents {
                field id: text [primary, stored<multi>]
            }
        "#;

    let outcome = parse_sdl(sdl);
    assert!(outcome.is_err());

    let message = outcome.unwrap_err().to_string();
    assert!(
        message.contains("multi"),
        "Error should mention multi: {}",
        message
    );
}
1971
/// An index without any `primary` field parses successfully and its schema
/// reports no primary field.
#[test]
fn test_no_primary_field() {
    let sdl = r#"
            index documents {
                field title: text [indexed, stored]
            }
        "#;

    let parsed = parse_sdl(sdl).unwrap();
    let schema = parsed[0].to_schema();

    assert_eq!(schema.primary_field(), None);
}
1985}