Skip to main content

hermes_core/dsl/sdl/
mod.rs

1//! Schema Definition Language (SDL) for Hermes
2//!
3//! A simple, readable format for defining index schemas using pest parser.
4//!
5//! # Example SDL
6//!
7//! ```text
8//! # Article index schema
9//! index articles {
10//!     # Primary text field for full-text search
11//!     field title: text [indexed, stored]
12//!
13//!     # Body content - indexed but not stored (save space)
14//!     field body: text [indexed]
15//!
16//!     # Author name
17//!     field author: text [indexed, stored]
18//!
19//!     # Publication timestamp
20//!     field published_at: i64 [indexed, stored]
21//!
22//!     # View count
23//!     field views: u64 [indexed, stored]
24//!
25//!     # Rating score
26//!     field rating: f64 [indexed, stored]
27//!
28//!     # Raw content hash (not indexed, just stored)
29//!     field content_hash: bytes [stored]
30//!
31//!     # Dense vector with IVF-RaBitQ index
32//!     field embedding: dense_vector<768> [indexed<rabitq, centroids: "centroids.bin", nprobe: 32>]
33//!
34//! }
35//! ```
36//!
37//! # Dense Vector Index Configuration
38//!
39//! Index-related parameters for dense vectors are specified in `indexed<...>`:
40//! - `rabitq` or `scann` - index type
41//! - `centroids: "path"` - path to pre-trained centroids file
42//! - `codebook: "path"` - path to PQ codebook (ScaNN only)
43//! - `nprobe: N` - number of clusters to probe (default: 32)
44
45use pest::Parser;
46use pest_derive::Parser;
47
48use super::query_field_router::{QueryRouterRule, RoutingMode};
49use super::schema::{DenseVectorQuantization, FieldType, Schema, SchemaBuilder};
50use crate::Result;
51use crate::error::Error;
52
53#[derive(Parser)]
54#[grammar = "dsl/sdl/sdl.pest"]
55pub struct SdlParser;
56
57use super::schema::DenseVectorConfig;
58use crate::structures::{
59    IndexSize, QueryWeighting, SparseQueryConfig, SparseVectorConfig, WeightQuantization,
60};
61
62/// Parsed field definition
63#[derive(Debug, Clone)]
64pub struct FieldDef {
65    pub name: String,
66    pub field_type: FieldType,
67    pub indexed: bool,
68    pub stored: bool,
69    /// Tokenizer name for text fields (e.g., "simple", "en_stem", "german")
70    pub tokenizer: Option<String>,
71    /// Whether this field can have multiple values (serialized as array in JSON)
72    pub multi: bool,
73    /// Position tracking mode for phrase queries and multi-field element tracking
74    pub positions: Option<super::schema::PositionMode>,
75    /// Configuration for sparse vector fields
76    pub sparse_vector_config: Option<SparseVectorConfig>,
77    /// Configuration for dense vector fields
78    pub dense_vector_config: Option<DenseVectorConfig>,
79    /// Whether this field has columnar fast-field storage
80    pub fast: bool,
81    /// Whether this field is a primary key (unique constraint)
82    pub primary: bool,
83}
84
85/// Parsed index definition
86#[derive(Debug, Clone)]
87pub struct IndexDef {
88    pub name: String,
89    pub fields: Vec<FieldDef>,
90    pub default_fields: Vec<String>,
91    /// Query router rules for routing queries to specific fields
92    pub query_routers: Vec<QueryRouterRule>,
93}
94
95impl IndexDef {
96    /// Convert to a Schema
97    pub fn to_schema(&self) -> Schema {
98        let mut builder = SchemaBuilder::default();
99
100        for field in &self.fields {
101            let f = match field.field_type {
102                FieldType::Text => {
103                    let tokenizer = field.tokenizer.as_deref().unwrap_or("simple");
104                    builder.add_text_field_with_tokenizer(
105                        &field.name,
106                        field.indexed,
107                        field.stored,
108                        tokenizer,
109                    )
110                }
111                FieldType::U64 => builder.add_u64_field(&field.name, field.indexed, field.stored),
112                FieldType::I64 => builder.add_i64_field(&field.name, field.indexed, field.stored),
113                FieldType::F64 => builder.add_f64_field(&field.name, field.indexed, field.stored),
114                FieldType::Bytes => builder.add_bytes_field(&field.name, field.stored),
115                FieldType::Json => builder.add_json_field(&field.name, field.stored),
116                FieldType::SparseVector => {
117                    if let Some(config) = &field.sparse_vector_config {
118                        builder.add_sparse_vector_field_with_config(
119                            &field.name,
120                            field.indexed,
121                            field.stored,
122                            config.clone(),
123                        )
124                    } else {
125                        builder.add_sparse_vector_field(&field.name, field.indexed, field.stored)
126                    }
127                }
128                FieldType::DenseVector => {
129                    // Dense vector dimension must be specified via config
130                    let config = field
131                        .dense_vector_config
132                        .as_ref()
133                        .expect("DenseVector field requires dimension to be specified");
134                    builder.add_dense_vector_field_with_config(
135                        &field.name,
136                        field.indexed,
137                        field.stored,
138                        config.clone(),
139                    )
140                }
141            };
142            if field.multi {
143                builder.set_multi(f, true);
144            }
145            if field.fast {
146                builder.set_fast(f, true);
147            }
148            if field.primary {
149                builder.set_primary_key(f);
150            }
151            // Set positions: explicit > auto (ordinal for multi vectors)
152            let positions = field.positions.or({
153                // Auto-set ordinal positions for multi-valued vector fields
154                if field.multi
155                    && matches!(
156                        field.field_type,
157                        FieldType::SparseVector | FieldType::DenseVector
158                    )
159                {
160                    Some(super::schema::PositionMode::Ordinal)
161                } else {
162                    None
163                }
164            });
165            if let Some(mode) = positions {
166                builder.set_positions(f, mode);
167            }
168        }
169
170        // Set default fields if specified
171        if !self.default_fields.is_empty() {
172            builder.set_default_fields(self.default_fields.clone());
173        }
174
175        // Set query routers if specified
176        if !self.query_routers.is_empty() {
177            builder.set_query_routers(self.query_routers.clone());
178        }
179
180        builder.build()
181    }
182
183    /// Create a QueryFieldRouter from the query router rules
184    ///
185    /// Returns None if there are no query router rules defined.
186    /// Returns Err if any regex pattern is invalid.
187    pub fn to_query_router(&self) -> Result<Option<super::query_field_router::QueryFieldRouter>> {
188        if self.query_routers.is_empty() {
189            return Ok(None);
190        }
191
192        super::query_field_router::QueryFieldRouter::from_rules(&self.query_routers)
193            .map(Some)
194            .map_err(Error::Schema)
195    }
196}
197
198/// Parse field type from string
199fn parse_field_type(type_str: &str) -> Result<FieldType> {
200    match type_str {
201        "text" | "string" | "str" => Ok(FieldType::Text),
202        "u64" | "uint" | "unsigned" => Ok(FieldType::U64),
203        "i64" | "int" | "integer" => Ok(FieldType::I64),
204        "f64" | "float" | "double" => Ok(FieldType::F64),
205        "bytes" | "binary" | "blob" => Ok(FieldType::Bytes),
206        "json" => Ok(FieldType::Json),
207        "sparse_vector" => Ok(FieldType::SparseVector),
208        "dense_vector" | "vector" => Ok(FieldType::DenseVector),
209        _ => Err(Error::Schema(format!("Unknown field type: {}", type_str))),
210    }
211}
212
213/// Index configuration parsed from indexed<...> attribute
214#[derive(Debug, Clone, Default)]
215struct IndexConfig {
216    index_type: Option<super::schema::VectorIndexType>,
217    num_clusters: Option<usize>,
218    nprobe: Option<usize>,
219    build_threshold: Option<usize>,
220    // Sparse vector index params
221    quantization: Option<WeightQuantization>,
222    weight_threshold: Option<f32>,
223    block_size: Option<usize>,
224    pruning: Option<f32>,
225    // Sparse vector query-time config
226    query_tokenizer: Option<String>,
227    query_weighting: Option<QueryWeighting>,
228    query_weight_threshold: Option<f32>,
229    query_max_dims: Option<usize>,
230    query_pruning: Option<f32>,
231    // Position tracking mode for phrase queries
232    positions: Option<super::schema::PositionMode>,
233}
234
235/// Parse attributes from pest pair
236/// Returns (indexed, stored, multi, fast, primary, index_config)
237/// positions is now inside index_config (via indexed<positions> or indexed<ordinal> etc.)
238/// multi is now inside stored<multi>
239fn parse_attributes(
240    pair: pest::iterators::Pair<Rule>,
241) -> (bool, bool, bool, bool, bool, Option<IndexConfig>) {
242    let mut indexed = false;
243    let mut stored = false;
244    let mut multi = false;
245    let mut fast = false;
246    let mut primary = false;
247    let mut index_config = None;
248
249    for attr in pair.into_inner() {
250        if attr.as_rule() == Rule::attribute {
251            // attribute = { indexed_with_config | "indexed" | stored_with_config | "stored" | "fast" | "primary" }
252            let mut found_config = false;
253            for inner in attr.clone().into_inner() {
254                match inner.as_rule() {
255                    Rule::indexed_with_config => {
256                        indexed = true;
257                        index_config = Some(parse_index_config(inner));
258                        found_config = true;
259                        break;
260                    }
261                    Rule::stored_with_config => {
262                        stored = true;
263                        multi = true; // stored<multi>
264                        found_config = true;
265                        break;
266                    }
267                    _ => {}
268                }
269            }
270            if !found_config {
271                // Simple attribute
272                match attr.as_str() {
273                    "indexed" => indexed = true,
274                    "stored" => stored = true,
275                    "fast" => fast = true,
276                    "primary" => primary = true,
277                    _ => {}
278                }
279            }
280        }
281    }
282
283    (indexed, stored, multi, fast, primary, index_config)
284}
285
286/// Parse index configuration from indexed<...> attribute
287fn parse_index_config(pair: pest::iterators::Pair<Rule>) -> IndexConfig {
288    let mut config = IndexConfig::default();
289
290    // indexed_with_config = { "indexed" ~ "<" ~ index_config_params ~ ">" }
291    // index_config_params = { index_config_param ~ ("," ~ index_config_param)* }
292    // index_config_param = { index_type_kwarg | centroids_kwarg | codebook_kwarg | nprobe_kwarg | index_type_spec }
293
294    for inner in pair.into_inner() {
295        if inner.as_rule() == Rule::index_config_params {
296            for param in inner.into_inner() {
297                if param.as_rule() == Rule::index_config_param {
298                    for p in param.into_inner() {
299                        parse_single_index_config_param(&mut config, p);
300                    }
301                }
302            }
303        }
304    }
305
306    config
307}
308
309/// Parse a single index config parameter
310fn parse_single_index_config_param(config: &mut IndexConfig, p: pest::iterators::Pair<Rule>) {
311    use super::schema::VectorIndexType;
312
313    match p.as_rule() {
314        Rule::index_type_spec => {
315            config.index_type = Some(match p.as_str() {
316                "flat" => VectorIndexType::Flat,
317                "rabitq" => VectorIndexType::RaBitQ,
318                "ivf_rabitq" => VectorIndexType::IvfRaBitQ,
319                "scann" => VectorIndexType::ScaNN,
320                _ => VectorIndexType::RaBitQ,
321            });
322        }
323        Rule::index_type_kwarg => {
324            // index_type_kwarg = { "index" ~ ":" ~ index_type_spec }
325            if let Some(t) = p.into_inner().next() {
326                config.index_type = Some(match t.as_str() {
327                    "flat" => VectorIndexType::Flat,
328                    "rabitq" => VectorIndexType::RaBitQ,
329                    "ivf_rabitq" => VectorIndexType::IvfRaBitQ,
330                    "scann" => VectorIndexType::ScaNN,
331                    _ => VectorIndexType::RaBitQ,
332                });
333            }
334        }
335        Rule::num_clusters_kwarg => {
336            // num_clusters_kwarg = { "num_clusters" ~ ":" ~ num_clusters_spec }
337            if let Some(n) = p.into_inner().next() {
338                config.num_clusters = Some(n.as_str().parse().unwrap_or_else(|_| {
339                    log::warn!(
340                        "Invalid num_clusters value '{}', using default 256",
341                        n.as_str()
342                    );
343                    256
344                }));
345            }
346        }
347        Rule::build_threshold_kwarg => {
348            // build_threshold_kwarg = { "build_threshold" ~ ":" ~ build_threshold_spec }
349            if let Some(n) = p.into_inner().next() {
350                config.build_threshold = Some(n.as_str().parse().unwrap_or_else(|_| {
351                    log::warn!(
352                        "Invalid build_threshold value '{}', using default 10000",
353                        n.as_str()
354                    );
355                    10000
356                }));
357            }
358        }
359        Rule::nprobe_kwarg => {
360            // nprobe_kwarg = { "nprobe" ~ ":" ~ nprobe_spec }
361            if let Some(n) = p.into_inner().next() {
362                config.nprobe = Some(n.as_str().parse().unwrap_or_else(|_| {
363                    log::warn!("Invalid nprobe value '{}', using default 32", n.as_str());
364                    32
365                }));
366            }
367        }
368        Rule::quantization_kwarg => {
369            // quantization_kwarg = { "quantization" ~ ":" ~ quantization_spec }
370            if let Some(q) = p.into_inner().next() {
371                config.quantization = Some(match q.as_str() {
372                    "float32" | "f32" => WeightQuantization::Float32,
373                    "float16" | "f16" => WeightQuantization::Float16,
374                    "uint8" | "u8" => WeightQuantization::UInt8,
375                    "uint4" | "u4" => WeightQuantization::UInt4,
376                    _ => WeightQuantization::default(),
377                });
378            }
379        }
380        Rule::weight_threshold_kwarg => {
381            // weight_threshold_kwarg = { "weight_threshold" ~ ":" ~ weight_threshold_spec }
382            if let Some(t) = p.into_inner().next() {
383                config.weight_threshold = Some(t.as_str().parse().unwrap_or_else(|_| {
384                    log::warn!(
385                        "Invalid weight_threshold value '{}', using default 0.0",
386                        t.as_str()
387                    );
388                    0.0
389                }));
390            }
391        }
392        Rule::block_size_kwarg => {
393            // block_size_kwarg = { "block_size" ~ ":" ~ block_size_spec }
394            if let Some(n) = p.into_inner().next() {
395                config.block_size = Some(n.as_str().parse().unwrap_or_else(|_| {
396                    log::warn!(
397                        "Invalid block_size value '{}', using default 128",
398                        n.as_str()
399                    );
400                    128
401                }));
402            }
403        }
404        Rule::pruning_kwarg => {
405            // pruning_kwarg = { "pruning" ~ ":" ~ pruning_spec }
406            if let Some(f) = p.into_inner().next() {
407                config.pruning = Some(f.as_str().parse().unwrap_or_else(|_| {
408                    log::warn!("Invalid pruning value '{}', using default 1.0", f.as_str());
409                    1.0
410                }));
411            }
412        }
413        Rule::query_config_block => {
414            // query_config_block = { "query" ~ "<" ~ query_config_params ~ ">" }
415            parse_query_config_block(config, p);
416        }
417        Rule::positions_kwarg => {
418            // positions_kwarg = { "positions" | "ordinal" | "token_position" }
419            use super::schema::PositionMode;
420            config.positions = Some(match p.as_str() {
421                "ordinal" => PositionMode::Ordinal,
422                "token_position" => PositionMode::TokenPosition,
423                _ => PositionMode::Full, // "positions" or any other value defaults to Full
424            });
425        }
426        _ => {}
427    }
428}
429
430/// Parse query configuration block: query<tokenizer: "...", weighting: idf>
431fn parse_query_config_block(config: &mut IndexConfig, pair: pest::iterators::Pair<Rule>) {
432    for inner in pair.into_inner() {
433        if inner.as_rule() == Rule::query_config_params {
434            for param in inner.into_inner() {
435                if param.as_rule() == Rule::query_config_param {
436                    for p in param.into_inner() {
437                        match p.as_rule() {
438                            Rule::query_tokenizer_kwarg => {
439                                // query_tokenizer_kwarg = { "tokenizer" ~ ":" ~ tokenizer_path }
440                                if let Some(path) = p.into_inner().next()
441                                    && let Some(inner_path) = path.into_inner().next()
442                                {
443                                    config.query_tokenizer = Some(inner_path.as_str().to_string());
444                                }
445                            }
446                            Rule::query_weighting_kwarg => {
447                                // query_weighting_kwarg = { "weighting" ~ ":" ~ weighting_spec }
448                                if let Some(w) = p.into_inner().next() {
449                                    config.query_weighting = Some(match w.as_str() {
450                                        "one" => QueryWeighting::One,
451                                        "idf" => QueryWeighting::Idf,
452                                        "idf_file" => QueryWeighting::IdfFile,
453                                        _ => QueryWeighting::One,
454                                    });
455                                }
456                            }
457                            Rule::query_weight_threshold_kwarg => {
458                                if let Some(t) = p.into_inner().next() {
459                                    config.query_weight_threshold =
460                                        Some(t.as_str().parse().unwrap_or_else(|_| {
461                                            log::warn!(
462                                                "Invalid query weight_threshold '{}', using 0.0",
463                                                t.as_str()
464                                            );
465                                            0.0
466                                        }));
467                                }
468                            }
469                            Rule::query_max_dims_kwarg => {
470                                if let Some(t) = p.into_inner().next() {
471                                    config.query_max_dims =
472                                        Some(t.as_str().parse().unwrap_or_else(|_| {
473                                            log::warn!(
474                                                "Invalid query max_dims '{}', using 0",
475                                                t.as_str()
476                                            );
477                                            0
478                                        }));
479                                }
480                            }
481                            Rule::query_pruning_kwarg => {
482                                if let Some(t) = p.into_inner().next() {
483                                    config.query_pruning =
484                                        Some(t.as_str().parse().unwrap_or_else(|_| {
485                                            log::warn!(
486                                                "Invalid query pruning '{}', using 1.0",
487                                                t.as_str()
488                                            );
489                                            1.0
490                                        }));
491                                }
492                            }
493                            _ => {}
494                        }
495                    }
496                }
497            }
498        }
499    }
500}
501
502/// Parse a field definition from pest pair
503fn parse_field_def(pair: pest::iterators::Pair<Rule>) -> Result<FieldDef> {
504    let mut inner = pair.into_inner();
505
506    let name = inner
507        .next()
508        .ok_or_else(|| Error::Schema("Missing field name".to_string()))?
509        .as_str()
510        .to_string();
511
512    let field_type_str = inner
513        .next()
514        .ok_or_else(|| Error::Schema("Missing field type".to_string()))?
515        .as_str();
516
517    let field_type = parse_field_type(field_type_str)?;
518
519    // Parse optional tokenizer spec, sparse_vector_config, dense_vector_config, and attributes
520    let mut tokenizer = None;
521    let mut sparse_vector_config = None;
522    let mut dense_vector_config = None;
523    let mut indexed = true;
524    let mut stored = true;
525    let mut multi = false;
526    let mut fast = false;
527    let mut primary = false;
528    let mut index_config: Option<IndexConfig> = None;
529
530    for item in inner {
531        match item.as_rule() {
532            Rule::tokenizer_spec => {
533                // Extract tokenizer name from <name>
534                if let Some(tok_name) = item.into_inner().next() {
535                    tokenizer = Some(tok_name.as_str().to_string());
536                }
537            }
538            Rule::sparse_vector_config => {
539                // Parse named parameters: <index_size: u16, quantization: uint8, weight_threshold: 0.1>
540                sparse_vector_config = Some(parse_sparse_vector_config(item));
541            }
542            Rule::dense_vector_config => {
543                // Parse dense_vector_params (keyword or positional) - only dims
544                dense_vector_config = Some(parse_dense_vector_config(item));
545            }
546            Rule::attributes => {
547                let (idx, sto, mul, fst, pri, idx_cfg) = parse_attributes(item);
548                indexed = idx;
549                stored = sto;
550                multi = mul;
551                fast = fst;
552                primary = pri;
553                index_config = idx_cfg;
554            }
555            _ => {}
556        }
557    }
558
559    // Primary key implies fast + indexed (needed for dedup lookups)
560    if primary {
561        fast = true;
562        indexed = true;
563    }
564
565    // Merge index config into vector configs if both exist
566    let mut positions = None;
567    if let Some(idx_cfg) = index_config {
568        positions = idx_cfg.positions;
569        if let Some(ref mut dv_config) = dense_vector_config {
570            apply_index_config_to_dense_vector(dv_config, idx_cfg);
571        } else if field_type == FieldType::SparseVector {
572            // For sparse vectors, create default config if not present and apply index params
573            let sv_config = sparse_vector_config.get_or_insert(SparseVectorConfig::default());
574            apply_index_config_to_sparse_vector(sv_config, idx_cfg);
575        }
576    }
577
578    Ok(FieldDef {
579        name,
580        field_type,
581        indexed,
582        stored,
583        tokenizer,
584        multi,
585        positions,
586        sparse_vector_config,
587        dense_vector_config,
588        fast,
589        primary,
590    })
591}
592
593/// Apply index configuration from indexed<...> to DenseVectorConfig
594fn apply_index_config_to_dense_vector(config: &mut DenseVectorConfig, idx_cfg: IndexConfig) {
595    // Apply index type if specified
596    if let Some(index_type) = idx_cfg.index_type {
597        config.index_type = index_type;
598    }
599
600    // Apply num_clusters for IVF-based indexes
601    if idx_cfg.num_clusters.is_some() {
602        config.num_clusters = idx_cfg.num_clusters;
603    }
604
605    // Apply nprobe if specified
606    if let Some(nprobe) = idx_cfg.nprobe {
607        config.nprobe = nprobe;
608    }
609
610    // Apply build_threshold if specified
611    if idx_cfg.build_threshold.is_some() {
612        config.build_threshold = idx_cfg.build_threshold;
613    }
614}
615
616/// Parse sparse_vector_config - only index_size (positional)
617/// Example: <u16> or <u32>
618fn parse_sparse_vector_config(pair: pest::iterators::Pair<Rule>) -> SparseVectorConfig {
619    let mut index_size = IndexSize::default();
620
621    // Parse positional index_size_spec
622    for inner in pair.into_inner() {
623        if inner.as_rule() == Rule::index_size_spec {
624            index_size = match inner.as_str() {
625                "u16" => IndexSize::U16,
626                "u32" => IndexSize::U32,
627                _ => IndexSize::default(),
628            };
629        }
630    }
631
632    SparseVectorConfig {
633        index_size,
634        weight_quantization: WeightQuantization::default(),
635        weight_threshold: 0.0,
636        block_size: 128,
637        pruning: None,
638        query_config: None,
639    }
640}
641
642/// Apply index configuration from indexed<...> to SparseVectorConfig
643fn apply_index_config_to_sparse_vector(config: &mut SparseVectorConfig, idx_cfg: IndexConfig) {
644    if let Some(q) = idx_cfg.quantization {
645        config.weight_quantization = q;
646    }
647    if let Some(t) = idx_cfg.weight_threshold {
648        config.weight_threshold = t;
649    }
650    if let Some(bs) = idx_cfg.block_size {
651        let adjusted = bs.next_power_of_two();
652        if adjusted != bs {
653            log::warn!(
654                "block_size {} adjusted to next power of two: {}",
655                bs,
656                adjusted
657            );
658        }
659        config.block_size = adjusted;
660    }
661    if let Some(p) = idx_cfg.pruning {
662        let clamped = p.clamp(0.0, 1.0);
663        if (clamped - p).abs() > f32::EPSILON {
664            log::warn!(
665                "pruning {} clamped to valid range [0.0, 1.0]: {}",
666                p,
667                clamped
668            );
669        }
670        config.pruning = Some(clamped);
671    }
672    // Apply query-time configuration if present
673    if idx_cfg.query_tokenizer.is_some()
674        || idx_cfg.query_weighting.is_some()
675        || idx_cfg.query_weight_threshold.is_some()
676        || idx_cfg.query_max_dims.is_some()
677        || idx_cfg.query_pruning.is_some()
678    {
679        let query_config = config
680            .query_config
681            .get_or_insert(SparseQueryConfig::default());
682        if let Some(tokenizer) = idx_cfg.query_tokenizer {
683            query_config.tokenizer = Some(tokenizer);
684        }
685        if let Some(weighting) = idx_cfg.query_weighting {
686            query_config.weighting = weighting;
687        }
688        if let Some(t) = idx_cfg.query_weight_threshold {
689            query_config.weight_threshold = t;
690        }
691        if let Some(d) = idx_cfg.query_max_dims {
692            query_config.max_query_dims = Some(d);
693        }
694        if let Some(p) = idx_cfg.query_pruning {
695            query_config.pruning = Some(p);
696        }
697    }
698}
699
700/// Parse dense_vector_config - dims and optional quantization type
701/// All index-related params are in indexed<...> attribute
702fn parse_dense_vector_config(pair: pest::iterators::Pair<Rule>) -> DenseVectorConfig {
703    let mut dim: usize = 0;
704    let mut quantization = DenseVectorQuantization::F32;
705
706    // Navigate to dense_vector_params
707    for params in pair.into_inner() {
708        if params.as_rule() == Rule::dense_vector_params {
709            for inner in params.into_inner() {
710                match inner.as_rule() {
711                    Rule::dense_vector_keyword_params => {
712                        for kwarg in inner.into_inner() {
713                            match kwarg.as_rule() {
714                                Rule::dims_kwarg => {
715                                    if let Some(d) = kwarg.into_inner().next() {
716                                        dim = d.as_str().parse().unwrap_or(0);
717                                    }
718                                }
719                                Rule::quant_type_spec => {
720                                    quantization = parse_quant_type(kwarg.as_str());
721                                }
722                                _ => {}
723                            }
724                        }
725                    }
726                    Rule::dense_vector_positional_params => {
727                        for item in inner.into_inner() {
728                            match item.as_rule() {
729                                Rule::dimension_spec => {
730                                    dim = item.as_str().parse().unwrap_or(0);
731                                }
732                                Rule::quant_type_spec => {
733                                    quantization = parse_quant_type(item.as_str());
734                                }
735                                _ => {}
736                            }
737                        }
738                    }
739                    _ => {}
740                }
741            }
742        }
743    }
744
745    DenseVectorConfig::new(dim).with_quantization(quantization)
746}
747
748fn parse_quant_type(s: &str) -> DenseVectorQuantization {
749    match s.trim() {
750        "f16" => DenseVectorQuantization::F16,
751        "uint8" | "u8" => DenseVectorQuantization::UInt8,
752        _ => DenseVectorQuantization::F32,
753    }
754}
755
756/// Parse default_fields definition
757fn parse_default_fields_def(pair: pest::iterators::Pair<Rule>) -> Vec<String> {
758    pair.into_inner().map(|p| p.as_str().to_string()).collect()
759}
760
761/// Parse a query router definition
762fn parse_query_router_def(pair: pest::iterators::Pair<Rule>) -> Result<QueryRouterRule> {
763    let mut pattern = String::new();
764    let mut substitution = String::new();
765    let mut target_field = String::new();
766    let mut mode = RoutingMode::Additional;
767
768    for prop in pair.into_inner() {
769        if prop.as_rule() != Rule::query_router_prop {
770            continue;
771        }
772
773        for inner in prop.into_inner() {
774            match inner.as_rule() {
775                Rule::query_router_pattern => {
776                    if let Some(regex_str) = inner.into_inner().next() {
777                        pattern = parse_string_value(regex_str);
778                    }
779                }
780                Rule::query_router_substitution => {
781                    if let Some(quoted) = inner.into_inner().next() {
782                        substitution = parse_string_value(quoted);
783                    }
784                }
785                Rule::query_router_target => {
786                    if let Some(ident) = inner.into_inner().next() {
787                        target_field = ident.as_str().to_string();
788                    }
789                }
790                Rule::query_router_mode => {
791                    if let Some(mode_val) = inner.into_inner().next() {
792                        mode = match mode_val.as_str() {
793                            "exclusive" => RoutingMode::Exclusive,
794                            "additional" => RoutingMode::Additional,
795                            _ => RoutingMode::Additional,
796                        };
797                    }
798                }
799                _ => {}
800            }
801        }
802    }
803
804    if pattern.is_empty() {
805        return Err(Error::Schema("query_router missing 'pattern'".to_string()));
806    }
807    if substitution.is_empty() {
808        return Err(Error::Schema(
809            "query_router missing 'substitution'".to_string(),
810        ));
811    }
812    if target_field.is_empty() {
813        return Err(Error::Schema(
814            "query_router missing 'target_field'".to_string(),
815        ));
816    }
817
818    Ok(QueryRouterRule {
819        pattern,
820        substitution,
821        target_field,
822        mode,
823    })
824}
825
826/// Parse a string value from quoted_string, raw_string, or regex_string
827fn parse_string_value(pair: pest::iterators::Pair<Rule>) -> String {
828    let s = pair.as_str();
829    match pair.as_rule() {
830        Rule::regex_string => {
831            // regex_string contains either raw_string or quoted_string
832            if let Some(inner) = pair.into_inner().next() {
833                parse_string_value(inner)
834            } else {
835                s.to_string()
836            }
837        }
838        Rule::raw_string => {
839            // r"..." - strip r" prefix and " suffix
840            s[2..s.len() - 1].to_string()
841        }
842        Rule::quoted_string => {
843            // "..." - strip quotes and handle escapes
844            let inner = &s[1..s.len() - 1];
845            // Simple escape handling
846            inner
847                .replace("\\n", "\n")
848                .replace("\\t", "\t")
849                .replace("\\\"", "\"")
850                .replace("\\\\", "\\")
851        }
852        _ => s.to_string(),
853    }
854}
855
856/// Parse an index definition from pest pair
857fn parse_index_def(pair: pest::iterators::Pair<Rule>) -> Result<IndexDef> {
858    let mut inner = pair.into_inner();
859
860    let name = inner
861        .next()
862        .ok_or_else(|| Error::Schema("Missing index name".to_string()))?
863        .as_str()
864        .to_string();
865
866    let mut fields = Vec::new();
867    let mut default_fields = Vec::new();
868    let mut query_routers = Vec::new();
869
870    for item in inner {
871        match item.as_rule() {
872            Rule::field_def => {
873                fields.push(parse_field_def(item)?);
874            }
875            Rule::default_fields_def => {
876                default_fields = parse_default_fields_def(item);
877            }
878            Rule::query_router_def => {
879                query_routers.push(parse_query_router_def(item)?);
880            }
881            _ => {}
882        }
883    }
884
885    // Validate primary key constraints
886    let primary_fields: Vec<&FieldDef> = fields.iter().filter(|f| f.primary).collect();
887    if primary_fields.len() > 1 {
888        return Err(Error::Schema(format!(
889            "Index '{}' has {} primary key fields, but at most one is allowed",
890            name,
891            primary_fields.len()
892        )));
893    }
894    if let Some(pk) = primary_fields.first() {
895        if pk.field_type != FieldType::Text {
896            return Err(Error::Schema(format!(
897                "Primary key field '{}' must be of type text, got {:?}",
898                pk.name, pk.field_type
899            )));
900        }
901        if pk.multi {
902            return Err(Error::Schema(format!(
903                "Primary key field '{}' cannot be multi-valued",
904                pk.name
905            )));
906        }
907    }
908
909    Ok(IndexDef {
910        name,
911        fields,
912        default_fields,
913        query_routers,
914    })
915}
916
917/// Parse SDL from a string
918pub fn parse_sdl(input: &str) -> Result<Vec<IndexDef>> {
919    let pairs = SdlParser::parse(Rule::file, input)
920        .map_err(|e| Error::Schema(format!("Parse error: {}", e)))?;
921
922    let mut indexes = Vec::new();
923
924    for pair in pairs {
925        if pair.as_rule() == Rule::file {
926            for inner in pair.into_inner() {
927                if inner.as_rule() == Rule::index_def {
928                    indexes.push(parse_index_def(inner)?);
929                }
930            }
931        }
932    }
933
934    Ok(indexes)
935}
936
937/// Parse SDL and return a single index definition
938pub fn parse_single_index(input: &str) -> Result<IndexDef> {
939    let indexes = parse_sdl(input)?;
940
941    if indexes.is_empty() {
942        return Err(Error::Schema("No index definition found".to_string()));
943    }
944
945    if indexes.len() > 1 {
946        return Err(Error::Schema(
947            "Multiple index definitions found, expected one".to_string(),
948        ));
949    }
950
951    Ok(indexes.into_iter().next().unwrap())
952}
953
954#[cfg(test)]
955mod tests {
956    use super::*;
957
958    #[test]
959    fn test_parse_simple_schema() {
960        let sdl = r#"
961            index articles {
962                field title: text [indexed, stored]
963                field body: text [indexed]
964            }
965        "#;
966
967        let indexes = parse_sdl(sdl).unwrap();
968        assert_eq!(indexes.len(), 1);
969
970        let index = &indexes[0];
971        assert_eq!(index.name, "articles");
972        assert_eq!(index.fields.len(), 2);
973
974        assert_eq!(index.fields[0].name, "title");
975        assert!(matches!(index.fields[0].field_type, FieldType::Text));
976        assert!(index.fields[0].indexed);
977        assert!(index.fields[0].stored);
978
979        assert_eq!(index.fields[1].name, "body");
980        assert!(matches!(index.fields[1].field_type, FieldType::Text));
981        assert!(index.fields[1].indexed);
982        assert!(!index.fields[1].stored);
983    }
984
985    #[test]
986    fn test_parse_all_field_types() {
987        let sdl = r#"
988            index test {
989                field text_field: text [indexed, stored]
990                field u64_field: u64 [indexed, stored]
991                field i64_field: i64 [indexed, stored]
992                field f64_field: f64 [indexed, stored]
993                field bytes_field: bytes [stored]
994            }
995        "#;
996
997        let indexes = parse_sdl(sdl).unwrap();
998        let index = &indexes[0];
999
1000        assert!(matches!(index.fields[0].field_type, FieldType::Text));
1001        assert!(matches!(index.fields[1].field_type, FieldType::U64));
1002        assert!(matches!(index.fields[2].field_type, FieldType::I64));
1003        assert!(matches!(index.fields[3].field_type, FieldType::F64));
1004        assert!(matches!(index.fields[4].field_type, FieldType::Bytes));
1005    }
1006
1007    #[test]
1008    fn test_parse_with_comments() {
1009        let sdl = r#"
1010            # This is a comment
1011            index articles {
1012                # Title field
1013                field title: text [indexed, stored]
1014                field body: text [indexed] # inline comment not supported yet
1015            }
1016        "#;
1017
1018        let indexes = parse_sdl(sdl).unwrap();
1019        assert_eq!(indexes[0].fields.len(), 2);
1020    }
1021
1022    #[test]
1023    fn test_parse_type_aliases() {
1024        let sdl = r#"
1025            index test {
1026                field a: string [indexed]
1027                field b: int [indexed]
1028                field c: uint [indexed]
1029                field d: float [indexed]
1030                field e: binary [stored]
1031            }
1032        "#;
1033
1034        let indexes = parse_sdl(sdl).unwrap();
1035        let index = &indexes[0];
1036
1037        assert!(matches!(index.fields[0].field_type, FieldType::Text));
1038        assert!(matches!(index.fields[1].field_type, FieldType::I64));
1039        assert!(matches!(index.fields[2].field_type, FieldType::U64));
1040        assert!(matches!(index.fields[3].field_type, FieldType::F64));
1041        assert!(matches!(index.fields[4].field_type, FieldType::Bytes));
1042    }
1043
1044    #[test]
1045    fn test_to_schema() {
1046        let sdl = r#"
1047            index articles {
1048                field title: text [indexed, stored]
1049                field views: u64 [indexed, stored]
1050            }
1051        "#;
1052
1053        let indexes = parse_sdl(sdl).unwrap();
1054        let schema = indexes[0].to_schema();
1055
1056        assert!(schema.get_field("title").is_some());
1057        assert!(schema.get_field("views").is_some());
1058        assert!(schema.get_field("nonexistent").is_none());
1059    }
1060
1061    #[test]
1062    fn test_default_attributes() {
1063        let sdl = r#"
1064            index test {
1065                field title: text
1066            }
1067        "#;
1068
1069        let indexes = parse_sdl(sdl).unwrap();
1070        let field = &indexes[0].fields[0];
1071
1072        // Default should be indexed and stored
1073        assert!(field.indexed);
1074        assert!(field.stored);
1075    }
1076
1077    #[test]
1078    fn test_multiple_indexes() {
1079        let sdl = r#"
1080            index articles {
1081                field title: text [indexed, stored]
1082            }
1083
1084            index users {
1085                field name: text [indexed, stored]
1086                field email: text [indexed, stored]
1087            }
1088        "#;
1089
1090        let indexes = parse_sdl(sdl).unwrap();
1091        assert_eq!(indexes.len(), 2);
1092        assert_eq!(indexes[0].name, "articles");
1093        assert_eq!(indexes[1].name, "users");
1094    }
1095
1096    #[test]
1097    fn test_tokenizer_spec() {
1098        let sdl = r#"
1099            index articles {
1100                field title: text<en_stem> [indexed, stored]
1101                field body: text<simple> [indexed]
1102                field author: text [indexed, stored]
1103            }
1104        "#;
1105
1106        let indexes = parse_sdl(sdl).unwrap();
1107        let index = &indexes[0];
1108
1109        assert_eq!(index.fields[0].name, "title");
1110        assert_eq!(index.fields[0].tokenizer, Some("en_stem".to_string()));
1111
1112        assert_eq!(index.fields[1].name, "body");
1113        assert_eq!(index.fields[1].tokenizer, Some("simple".to_string()));
1114
1115        assert_eq!(index.fields[2].name, "author");
1116        assert_eq!(index.fields[2].tokenizer, None); // No tokenizer specified
1117    }
1118
1119    #[test]
1120    fn test_tokenizer_in_schema() {
1121        let sdl = r#"
1122            index articles {
1123                field title: text<german> [indexed, stored]
1124                field body: text<en_stem> [indexed]
1125            }
1126        "#;
1127
1128        let indexes = parse_sdl(sdl).unwrap();
1129        let schema = indexes[0].to_schema();
1130
1131        let title_field = schema.get_field("title").unwrap();
1132        let title_entry = schema.get_field_entry(title_field).unwrap();
1133        assert_eq!(title_entry.tokenizer, Some("german".to_string()));
1134
1135        let body_field = schema.get_field("body").unwrap();
1136        let body_entry = schema.get_field_entry(body_field).unwrap();
1137        assert_eq!(body_entry.tokenizer, Some("en_stem".to_string()));
1138    }
1139
1140    #[test]
1141    fn test_query_router_basic() {
1142        let sdl = r#"
1143            index documents {
1144                field title: text [indexed, stored]
1145                field uri: text [indexed, stored]
1146
1147                query_router {
1148                    pattern: "10\\.\\d{4,}/[^\\s]+"
1149                    substitution: "doi://{0}"
1150                    target_field: uris
1151                    mode: exclusive
1152                }
1153            }
1154        "#;
1155
1156        let indexes = parse_sdl(sdl).unwrap();
1157        let index = &indexes[0];
1158
1159        assert_eq!(index.query_routers.len(), 1);
1160        let router = &index.query_routers[0];
1161        assert_eq!(router.pattern, r"10\.\d{4,}/[^\s]+");
1162        assert_eq!(router.substitution, "doi://{0}");
1163        assert_eq!(router.target_field, "uris");
1164        assert_eq!(router.mode, RoutingMode::Exclusive);
1165    }
1166
1167    #[test]
1168    fn test_query_router_raw_string() {
1169        let sdl = r#"
1170            index documents {
1171                field uris: text [indexed, stored]
1172
1173                query_router {
1174                    pattern: r"^pmid:(\d+)$"
1175                    substitution: "pubmed://{1}"
1176                    target_field: uris
1177                    mode: additional
1178                }
1179            }
1180        "#;
1181
1182        let indexes = parse_sdl(sdl).unwrap();
1183        let router = &indexes[0].query_routers[0];
1184
1185        assert_eq!(router.pattern, r"^pmid:(\d+)$");
1186        assert_eq!(router.substitution, "pubmed://{1}");
1187        assert_eq!(router.mode, RoutingMode::Additional);
1188    }
1189
1190    #[test]
1191    fn test_multiple_query_routers() {
1192        let sdl = r#"
1193            index documents {
1194                field uris: text [indexed, stored]
1195
1196                query_router {
1197                    pattern: r"^doi:(10\.\d{4,}/[^\s]+)$"
1198                    substitution: "doi://{1}"
1199                    target_field: uris
1200                    mode: exclusive
1201                }
1202
1203                query_router {
1204                    pattern: r"^pmid:(\d+)$"
1205                    substitution: "pubmed://{1}"
1206                    target_field: uris
1207                    mode: exclusive
1208                }
1209
1210                query_router {
1211                    pattern: r"^arxiv:(\d+\.\d+)$"
1212                    substitution: "arxiv://{1}"
1213                    target_field: uris
1214                    mode: additional
1215                }
1216            }
1217        "#;
1218
1219        let indexes = parse_sdl(sdl).unwrap();
1220        assert_eq!(indexes[0].query_routers.len(), 3);
1221    }
1222
1223    #[test]
1224    fn test_query_router_default_mode() {
1225        let sdl = r#"
1226            index documents {
1227                field uris: text [indexed, stored]
1228
1229                query_router {
1230                    pattern: r"test"
1231                    substitution: "{0}"
1232                    target_field: uris
1233                }
1234            }
1235        "#;
1236
1237        let indexes = parse_sdl(sdl).unwrap();
1238        // Default mode should be Additional
1239        assert_eq!(indexes[0].query_routers[0].mode, RoutingMode::Additional);
1240    }
1241
1242    #[test]
1243    fn test_multi_attribute() {
1244        let sdl = r#"
1245            index documents {
1246                field uris: text [indexed, stored<multi>]
1247                field title: text [indexed, stored]
1248            }
1249        "#;
1250
1251        let indexes = parse_sdl(sdl).unwrap();
1252        assert_eq!(indexes.len(), 1);
1253
1254        let fields = &indexes[0].fields;
1255        assert_eq!(fields.len(), 2);
1256
1257        // uris should have multi=true
1258        assert_eq!(fields[0].name, "uris");
1259        assert!(fields[0].multi, "uris field should have multi=true");
1260
1261        // title should have multi=false
1262        assert_eq!(fields[1].name, "title");
1263        assert!(!fields[1].multi, "title field should have multi=false");
1264
1265        // Verify schema conversion preserves multi attribute
1266        let schema = indexes[0].to_schema();
1267        let uris_field = schema.get_field("uris").unwrap();
1268        let title_field = schema.get_field("title").unwrap();
1269
1270        assert!(schema.get_field_entry(uris_field).unwrap().multi);
1271        assert!(!schema.get_field_entry(title_field).unwrap().multi);
1272    }
1273
1274    #[test]
1275    fn test_sparse_vector_field() {
1276        let sdl = r#"
1277            index documents {
1278                field embedding: sparse_vector [indexed, stored]
1279            }
1280        "#;
1281
1282        let indexes = parse_sdl(sdl).unwrap();
1283        assert_eq!(indexes.len(), 1);
1284        assert_eq!(indexes[0].fields.len(), 1);
1285        assert_eq!(indexes[0].fields[0].name, "embedding");
1286        assert_eq!(indexes[0].fields[0].field_type, FieldType::SparseVector);
1287        assert!(indexes[0].fields[0].sparse_vector_config.is_none());
1288    }
1289
1290    #[test]
1291    fn test_sparse_vector_with_config() {
1292        let sdl = r#"
1293            index documents {
1294                field embedding: sparse_vector<u16> [indexed<quantization: uint8>, stored]
1295                field dense: sparse_vector<u32> [indexed<quantization: float32>]
1296            }
1297        "#;
1298
1299        let indexes = parse_sdl(sdl).unwrap();
1300        assert_eq!(indexes[0].fields.len(), 2);
1301
1302        // First field: u16 indices, uint8 quantization
1303        let f1 = &indexes[0].fields[0];
1304        assert_eq!(f1.name, "embedding");
1305        let config1 = f1.sparse_vector_config.as_ref().unwrap();
1306        assert_eq!(config1.index_size, IndexSize::U16);
1307        assert_eq!(config1.weight_quantization, WeightQuantization::UInt8);
1308
1309        // Second field: u32 indices, float32 quantization
1310        let f2 = &indexes[0].fields[1];
1311        assert_eq!(f2.name, "dense");
1312        let config2 = f2.sparse_vector_config.as_ref().unwrap();
1313        assert_eq!(config2.index_size, IndexSize::U32);
1314        assert_eq!(config2.weight_quantization, WeightQuantization::Float32);
1315    }
1316
1317    #[test]
1318    fn test_sparse_vector_with_weight_threshold() {
1319        let sdl = r#"
1320            index documents {
1321                field embedding: sparse_vector<u16> [indexed<quantization: uint8, weight_threshold: 0.1>, stored]
1322                field embedding2: sparse_vector<u32> [indexed<quantization: float16, weight_threshold: 0.05>]
1323            }
1324        "#;
1325
1326        let indexes = parse_sdl(sdl).unwrap();
1327        assert_eq!(indexes[0].fields.len(), 2);
1328
1329        // First field: u16 indices, uint8 quantization, threshold 0.1
1330        let f1 = &indexes[0].fields[0];
1331        assert_eq!(f1.name, "embedding");
1332        let config1 = f1.sparse_vector_config.as_ref().unwrap();
1333        assert_eq!(config1.index_size, IndexSize::U16);
1334        assert_eq!(config1.weight_quantization, WeightQuantization::UInt8);
1335        assert!((config1.weight_threshold - 0.1).abs() < 0.001);
1336
1337        // Second field: u32 indices, float16 quantization, threshold 0.05
1338        let f2 = &indexes[0].fields[1];
1339        assert_eq!(f2.name, "embedding2");
1340        let config2 = f2.sparse_vector_config.as_ref().unwrap();
1341        assert_eq!(config2.index_size, IndexSize::U32);
1342        assert_eq!(config2.weight_quantization, WeightQuantization::Float16);
1343        assert!((config2.weight_threshold - 0.05).abs() < 0.001);
1344    }
1345
1346    #[test]
1347    fn test_sparse_vector_with_pruning() {
1348        let sdl = r#"
1349            index documents {
1350                field embedding: sparse_vector [indexed<quantization: uint8, pruning: 0.1>, stored]
1351            }
1352        "#;
1353
1354        let indexes = parse_sdl(sdl).unwrap();
1355        let f = &indexes[0].fields[0];
1356        assert_eq!(f.name, "embedding");
1357        let config = f.sparse_vector_config.as_ref().unwrap();
1358        assert_eq!(config.weight_quantization, WeightQuantization::UInt8);
1359        assert_eq!(config.pruning, Some(0.1));
1360    }
1361
1362    #[test]
1363    fn test_dense_vector_field() {
1364        let sdl = r#"
1365            index documents {
1366                field embedding: dense_vector<768> [indexed, stored]
1367            }
1368        "#;
1369
1370        let indexes = parse_sdl(sdl).unwrap();
1371        assert_eq!(indexes.len(), 1);
1372        assert_eq!(indexes[0].fields.len(), 1);
1373
1374        let f = &indexes[0].fields[0];
1375        assert_eq!(f.name, "embedding");
1376        assert_eq!(f.field_type, FieldType::DenseVector);
1377
1378        let config = f.dense_vector_config.as_ref().unwrap();
1379        assert_eq!(config.dim, 768);
1380    }
1381
1382    #[test]
1383    fn test_dense_vector_alias() {
1384        let sdl = r#"
1385            index documents {
1386                field embedding: vector<1536> [indexed]
1387            }
1388        "#;
1389
1390        let indexes = parse_sdl(sdl).unwrap();
1391        assert_eq!(indexes[0].fields[0].field_type, FieldType::DenseVector);
1392        assert_eq!(
1393            indexes[0].fields[0]
1394                .dense_vector_config
1395                .as_ref()
1396                .unwrap()
1397                .dim,
1398            1536
1399        );
1400    }
1401
1402    #[test]
1403    fn test_dense_vector_with_num_clusters() {
1404        let sdl = r#"
1405            index documents {
1406                field embedding: dense_vector<768> [indexed<ivf_rabitq, num_clusters: 256>, stored]
1407            }
1408        "#;
1409
1410        let indexes = parse_sdl(sdl).unwrap();
1411        assert_eq!(indexes.len(), 1);
1412
1413        let f = &indexes[0].fields[0];
1414        assert_eq!(f.name, "embedding");
1415        assert_eq!(f.field_type, FieldType::DenseVector);
1416
1417        let config = f.dense_vector_config.as_ref().unwrap();
1418        assert_eq!(config.dim, 768);
1419        assert_eq!(config.num_clusters, Some(256));
1420        assert_eq!(config.nprobe, 32); // default
1421    }
1422
1423    #[test]
1424    fn test_dense_vector_with_num_clusters_and_nprobe() {
1425        let sdl = r#"
1426            index documents {
1427                field embedding: dense_vector<1536> [indexed<ivf_rabitq, num_clusters: 512, nprobe: 64>]
1428            }
1429        "#;
1430
1431        let indexes = parse_sdl(sdl).unwrap();
1432        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1433
1434        assert_eq!(config.dim, 1536);
1435        assert_eq!(config.num_clusters, Some(512));
1436        assert_eq!(config.nprobe, 64);
1437    }
1438
1439    #[test]
1440    fn test_dense_vector_keyword_syntax() {
1441        let sdl = r#"
1442            index documents {
1443                field embedding: dense_vector<dims: 1536> [indexed, stored]
1444            }
1445        "#;
1446
1447        let indexes = parse_sdl(sdl).unwrap();
1448        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1449
1450        assert_eq!(config.dim, 1536);
1451        assert!(config.num_clusters.is_none());
1452    }
1453
1454    #[test]
1455    fn test_dense_vector_keyword_syntax_full() {
1456        let sdl = r#"
1457            index documents {
1458                field embedding: dense_vector<dims: 1536> [indexed<ivf_rabitq, num_clusters: 256, nprobe: 64>]
1459            }
1460        "#;
1461
1462        let indexes = parse_sdl(sdl).unwrap();
1463        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1464
1465        assert_eq!(config.dim, 1536);
1466        assert_eq!(config.num_clusters, Some(256));
1467        assert_eq!(config.nprobe, 64);
1468    }
1469
1470    #[test]
1471    fn test_dense_vector_keyword_syntax_partial() {
1472        let sdl = r#"
1473            index documents {
1474                field embedding: dense_vector<dims: 768> [indexed<ivf_rabitq, num_clusters: 128>]
1475            }
1476        "#;
1477
1478        let indexes = parse_sdl(sdl).unwrap();
1479        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1480
1481        assert_eq!(config.dim, 768);
1482        assert_eq!(config.num_clusters, Some(128));
1483        assert_eq!(config.nprobe, 32); // default
1484    }
1485
1486    #[test]
1487    fn test_dense_vector_scann_index() {
1488        use crate::dsl::schema::VectorIndexType;
1489
1490        let sdl = r#"
1491            index documents {
1492                field embedding: dense_vector<dims: 768> [indexed<scann, num_clusters: 256, nprobe: 64>]
1493            }
1494        "#;
1495
1496        let indexes = parse_sdl(sdl).unwrap();
1497        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1498
1499        assert_eq!(config.dim, 768);
1500        assert_eq!(config.index_type, VectorIndexType::ScaNN);
1501        assert_eq!(config.num_clusters, Some(256));
1502        assert_eq!(config.nprobe, 64);
1503    }
1504
1505    #[test]
1506    fn test_dense_vector_ivf_rabitq_index() {
1507        use crate::dsl::schema::VectorIndexType;
1508
1509        let sdl = r#"
1510            index documents {
1511                field embedding: dense_vector<dims: 1536> [indexed<ivf_rabitq, num_clusters: 512>]
1512            }
1513        "#;
1514
1515        let indexes = parse_sdl(sdl).unwrap();
1516        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1517
1518        assert_eq!(config.dim, 1536);
1519        assert_eq!(config.index_type, VectorIndexType::IvfRaBitQ);
1520        assert_eq!(config.num_clusters, Some(512));
1521    }
1522
1523    #[test]
1524    fn test_dense_vector_rabitq_no_clusters() {
1525        use crate::dsl::schema::VectorIndexType;
1526
1527        let sdl = r#"
1528            index documents {
1529                field embedding: dense_vector<dims: 768> [indexed<rabitq>]
1530            }
1531        "#;
1532
1533        let indexes = parse_sdl(sdl).unwrap();
1534        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1535
1536        assert_eq!(config.dim, 768);
1537        assert_eq!(config.index_type, VectorIndexType::RaBitQ);
1538        assert!(config.num_clusters.is_none());
1539    }
1540
1541    #[test]
1542    fn test_dense_vector_flat_index() {
1543        use crate::dsl::schema::VectorIndexType;
1544
1545        let sdl = r#"
1546            index documents {
1547                field embedding: dense_vector<dims: 768> [indexed<flat>]
1548            }
1549        "#;
1550
1551        let indexes = parse_sdl(sdl).unwrap();
1552        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1553
1554        assert_eq!(config.dim, 768);
1555        assert_eq!(config.index_type, VectorIndexType::Flat);
1556    }
1557
1558    #[test]
1559    fn test_dense_vector_default_index_type() {
1560        use crate::dsl::schema::VectorIndexType;
1561
1562        // When no index type specified, should default to RaBitQ (basic)
1563        let sdl = r#"
1564            index documents {
1565                field embedding: dense_vector<dims: 768> [indexed]
1566            }
1567        "#;
1568
1569        let indexes = parse_sdl(sdl).unwrap();
1570        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1571
1572        assert_eq!(config.dim, 768);
1573        assert_eq!(config.index_type, VectorIndexType::RaBitQ);
1574    }
1575
1576    #[test]
1577    fn test_dense_vector_f16_quantization() {
1578        use crate::dsl::schema::{DenseVectorQuantization, VectorIndexType};
1579
1580        let sdl = r#"
1581            index documents {
1582                field embedding: dense_vector<768, f16> [indexed]
1583            }
1584        "#;
1585
1586        let indexes = parse_sdl(sdl).unwrap();
1587        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1588
1589        assert_eq!(config.dim, 768);
1590        assert_eq!(config.quantization, DenseVectorQuantization::F16);
1591        assert_eq!(config.index_type, VectorIndexType::RaBitQ);
1592    }
1593
1594    #[test]
1595    fn test_dense_vector_uint8_quantization() {
1596        use crate::dsl::schema::DenseVectorQuantization;
1597
1598        let sdl = r#"
1599            index documents {
1600                field embedding: dense_vector<1024, uint8> [indexed<ivf_rabitq>]
1601            }
1602        "#;
1603
1604        let indexes = parse_sdl(sdl).unwrap();
1605        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1606
1607        assert_eq!(config.dim, 1024);
1608        assert_eq!(config.quantization, DenseVectorQuantization::UInt8);
1609    }
1610
1611    #[test]
1612    fn test_dense_vector_u8_alias() {
1613        use crate::dsl::schema::DenseVectorQuantization;
1614
1615        let sdl = r#"
1616            index documents {
1617                field embedding: dense_vector<512, u8> [indexed]
1618            }
1619        "#;
1620
1621        let indexes = parse_sdl(sdl).unwrap();
1622        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1623
1624        assert_eq!(config.dim, 512);
1625        assert_eq!(config.quantization, DenseVectorQuantization::UInt8);
1626    }
1627
1628    #[test]
1629    fn test_dense_vector_default_f32_quantization() {
1630        use crate::dsl::schema::DenseVectorQuantization;
1631
1632        // No quantization type → default f32
1633        let sdl = r#"
1634            index documents {
1635                field embedding: dense_vector<768> [indexed]
1636            }
1637        "#;
1638
1639        let indexes = parse_sdl(sdl).unwrap();
1640        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1641
1642        assert_eq!(config.dim, 768);
1643        assert_eq!(config.quantization, DenseVectorQuantization::F32);
1644    }
1645
1646    #[test]
1647    fn test_dense_vector_keyword_with_quantization() {
1648        use crate::dsl::schema::DenseVectorQuantization;
1649
1650        let sdl = r#"
1651            index documents {
1652                field embedding: dense_vector<dims: 768, f16> [indexed]
1653            }
1654        "#;
1655
1656        let indexes = parse_sdl(sdl).unwrap();
1657        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1658
1659        assert_eq!(config.dim, 768);
1660        assert_eq!(config.quantization, DenseVectorQuantization::F16);
1661    }
1662
1663    #[test]
1664    fn test_json_field_type() {
1665        let sdl = r#"
1666            index documents {
1667                field title: text [indexed, stored]
1668                field metadata: json [stored]
1669                field extra: json
1670            }
1671        "#;
1672
1673        let indexes = parse_sdl(sdl).unwrap();
1674        let index = &indexes[0];
1675
1676        assert_eq!(index.fields.len(), 3);
1677
1678        // Check JSON field
1679        assert_eq!(index.fields[1].name, "metadata");
1680        assert!(matches!(index.fields[1].field_type, FieldType::Json));
1681        assert!(index.fields[1].stored);
1682        // JSON fields should not be indexed (enforced by add_json_field)
1683
1684        // Check default attributes for JSON field
1685        assert_eq!(index.fields[2].name, "extra");
1686        assert!(matches!(index.fields[2].field_type, FieldType::Json));
1687
1688        // Verify schema conversion
1689        let schema = index.to_schema();
1690        let metadata_field = schema.get_field("metadata").unwrap();
1691        let entry = schema.get_field_entry(metadata_field).unwrap();
1692        assert_eq!(entry.field_type, FieldType::Json);
1693        assert!(!entry.indexed); // JSON fields are never indexed
1694        assert!(entry.stored);
1695    }
1696
1697    #[test]
1698    fn test_sparse_vector_query_config() {
1699        use crate::structures::QueryWeighting;
1700
1701        let sdl = r#"
1702            index documents {
1703                field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<tokenizer: "Alibaba-NLP/gte-Qwen2-1.5B-instruct", weighting: idf>>]
1704            }
1705        "#;
1706
1707        let indexes = parse_sdl(sdl).unwrap();
1708        let index = &indexes[0];
1709
1710        assert_eq!(index.fields.len(), 1);
1711        assert_eq!(index.fields[0].name, "embedding");
1712        assert!(matches!(
1713            index.fields[0].field_type,
1714            FieldType::SparseVector
1715        ));
1716
1717        let config = index.fields[0].sparse_vector_config.as_ref().unwrap();
1718        assert_eq!(config.index_size, IndexSize::U16);
1719        assert_eq!(config.weight_quantization, WeightQuantization::UInt8);
1720
1721        // Check query config
1722        let query_config = config.query_config.as_ref().unwrap();
1723        assert_eq!(
1724            query_config.tokenizer.as_deref(),
1725            Some("Alibaba-NLP/gte-Qwen2-1.5B-instruct")
1726        );
1727        assert_eq!(query_config.weighting, QueryWeighting::Idf);
1728
1729        // Verify schema conversion preserves query config
1730        let schema = index.to_schema();
1731        let embedding_field = schema.get_field("embedding").unwrap();
1732        let entry = schema.get_field_entry(embedding_field).unwrap();
1733        let sv_config = entry.sparse_vector_config.as_ref().unwrap();
1734        let qc = sv_config.query_config.as_ref().unwrap();
1735        assert_eq!(
1736            qc.tokenizer.as_deref(),
1737            Some("Alibaba-NLP/gte-Qwen2-1.5B-instruct")
1738        );
1739        assert_eq!(qc.weighting, QueryWeighting::Idf);
1740    }
1741
1742    #[test]
1743    fn test_sparse_vector_query_config_weighting_one() {
1744        use crate::structures::QueryWeighting;
1745
1746        let sdl = r#"
1747            index documents {
1748                field embedding: sparse_vector [indexed<query<weighting: one>>]
1749            }
1750        "#;
1751
1752        let indexes = parse_sdl(sdl).unwrap();
1753        let config = indexes[0].fields[0].sparse_vector_config.as_ref().unwrap();
1754
1755        let query_config = config.query_config.as_ref().unwrap();
1756        assert!(query_config.tokenizer.is_none());
1757        assert_eq!(query_config.weighting, QueryWeighting::One);
1758    }
1759
1760    #[test]
1761    fn test_sparse_vector_query_config_weighting_idf_file() {
1762        use crate::structures::QueryWeighting;
1763
1764        let sdl = r#"
1765            index documents {
1766                field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<tokenizer: "opensearch-neural-sparse-encoding-v1", weighting: idf_file>>]
1767            }
1768        "#;
1769
1770        let indexes = parse_sdl(sdl).unwrap();
1771        let config = indexes[0].fields[0].sparse_vector_config.as_ref().unwrap();
1772
1773        let query_config = config.query_config.as_ref().unwrap();
1774        assert_eq!(
1775            query_config.tokenizer.as_deref(),
1776            Some("opensearch-neural-sparse-encoding-v1")
1777        );
1778        assert_eq!(query_config.weighting, QueryWeighting::IdfFile);
1779
1780        // Verify schema conversion preserves idf_file
1781        let schema = indexes[0].to_schema();
1782        let field = schema.get_field("embedding").unwrap();
1783        let entry = schema.get_field_entry(field).unwrap();
1784        let sc = entry.sparse_vector_config.as_ref().unwrap();
1785        let qc = sc.query_config.as_ref().unwrap();
1786        assert_eq!(qc.weighting, QueryWeighting::IdfFile);
1787    }
1788
1789    #[test]
1790    fn test_sparse_vector_query_config_pruning_params() {
1791        let sdl = r#"
1792            index documents {
1793                field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<weighting: idf, weight_threshold: 0.03, max_dims: 25, pruning: 0.2>>]
1794            }
1795        "#;
1796
1797        let indexes = parse_sdl(sdl).unwrap();
1798        let config = indexes[0].fields[0].sparse_vector_config.as_ref().unwrap();
1799
1800        let qc = config.query_config.as_ref().unwrap();
1801        assert_eq!(qc.weighting, QueryWeighting::Idf);
1802        assert!((qc.weight_threshold - 0.03).abs() < 0.001);
1803        assert_eq!(qc.max_query_dims, Some(25));
1804        assert!((qc.pruning.unwrap() - 0.2).abs() < 0.001);
1805
1806        // Verify schema roundtrip
1807        let schema = indexes[0].to_schema();
1808        let field = schema.get_field("embedding").unwrap();
1809        let entry = schema.get_field_entry(field).unwrap();
1810        let sc = entry.sparse_vector_config.as_ref().unwrap();
1811        let rqc = sc.query_config.as_ref().unwrap();
1812        assert!((rqc.weight_threshold - 0.03).abs() < 0.001);
1813        assert_eq!(rqc.max_query_dims, Some(25));
1814        assert!((rqc.pruning.unwrap() - 0.2).abs() < 0.001);
1815    }
1816
1817    #[test]
1818    fn test_fast_attribute() {
1819        let sdl = r#"
1820            index products {
1821                field name: text [indexed, stored]
1822                field price: f64 [indexed, fast]
1823                field category: text [indexed, stored, fast]
1824                field count: u64 [fast]
1825                field score: i64 [indexed, stored, fast]
1826            }
1827        "#;
1828
1829        let indexes = parse_sdl(sdl).unwrap();
1830        assert_eq!(indexes.len(), 1);
1831        let index = &indexes[0];
1832        assert_eq!(index.fields.len(), 5);
1833
1834        // name: no fast
1835        assert!(!index.fields[0].fast);
1836        // price: fast
1837        assert!(index.fields[1].fast);
1838        assert!(matches!(index.fields[1].field_type, FieldType::F64));
1839        // category: fast text
1840        assert!(index.fields[2].fast);
1841        assert!(matches!(index.fields[2].field_type, FieldType::Text));
1842        // count: fast only
1843        assert!(index.fields[3].fast);
1844        assert!(matches!(index.fields[3].field_type, FieldType::U64));
1845        // score: fast i64
1846        assert!(index.fields[4].fast);
1847        assert!(matches!(index.fields[4].field_type, FieldType::I64));
1848
1849        // Verify schema roundtrip preserves fast flag
1850        let schema = index.to_schema();
1851        let price_field = schema.get_field("price").unwrap();
1852        assert!(schema.get_field_entry(price_field).unwrap().fast);
1853
1854        let category_field = schema.get_field("category").unwrap();
1855        assert!(schema.get_field_entry(category_field).unwrap().fast);
1856
1857        let name_field = schema.get_field("name").unwrap();
1858        assert!(!schema.get_field_entry(name_field).unwrap().fast);
1859    }
1860
1861    #[test]
1862    fn test_primary_attribute() {
1863        let sdl = r#"
1864            index documents {
1865                field id: text [primary, stored]
1866                field title: text [indexed, stored]
1867            }
1868        "#;
1869
1870        let indexes = parse_sdl(sdl).unwrap();
1871        assert_eq!(indexes.len(), 1);
1872        let index = &indexes[0];
1873        assert_eq!(index.fields.len(), 2);
1874
1875        // id should be primary, and auto-set fast + indexed
1876        let id_field = &index.fields[0];
1877        assert!(id_field.primary, "id should be primary");
1878        assert!(id_field.fast, "primary implies fast");
1879        assert!(id_field.indexed, "primary implies indexed");
1880
1881        // title should NOT be primary
1882        assert!(!index.fields[1].primary);
1883
1884        // Verify schema conversion preserves primary_key
1885        let schema = index.to_schema();
1886        let id = schema.get_field("id").unwrap();
1887        let id_entry = schema.get_field_entry(id).unwrap();
1888        assert!(id_entry.primary_key);
1889        assert!(id_entry.fast);
1890        assert!(id_entry.indexed);
1891
1892        let title = schema.get_field("title").unwrap();
1893        assert!(!schema.get_field_entry(title).unwrap().primary_key);
1894
1895        // primary_field() should return the primary field
1896        assert_eq!(schema.primary_field(), Some(id));
1897    }
1898
1899    #[test]
1900    fn test_primary_with_other_attributes() {
1901        let sdl = r#"
1902            index documents {
1903                field id: text<simple> [primary, indexed, stored]
1904                field body: text [indexed]
1905            }
1906        "#;
1907
1908        let indexes = parse_sdl(sdl).unwrap();
1909        let id_field = &indexes[0].fields[0];
1910        assert!(id_field.primary);
1911        assert!(id_field.indexed);
1912        assert!(id_field.stored);
1913        assert!(id_field.fast);
1914        assert_eq!(id_field.tokenizer, Some("simple".to_string()));
1915    }
1916
1917    #[test]
1918    fn test_primary_only_one_allowed() {
1919        let sdl = r#"
1920            index documents {
1921                field id: text [primary]
1922                field alt_id: text [primary]
1923            }
1924        "#;
1925
1926        let result = parse_sdl(sdl);
1927        assert!(result.is_err());
1928        let err = result.unwrap_err().to_string();
1929        assert!(
1930            err.contains("primary key"),
1931            "Error should mention primary key: {}",
1932            err
1933        );
1934    }
1935
1936    #[test]
1937    fn test_primary_must_be_text() {
1938        let sdl = r#"
1939            index documents {
1940                field id: u64 [primary]
1941            }
1942        "#;
1943
1944        let result = parse_sdl(sdl);
1945        assert!(result.is_err());
1946        let err = result.unwrap_err().to_string();
1947        assert!(
1948            err.contains("text"),
1949            "Error should mention text type: {}",
1950            err
1951        );
1952    }
1953
1954    #[test]
1955    fn test_primary_cannot_be_multi() {
1956        let sdl = r#"
1957            index documents {
1958                field id: text [primary, stored<multi>]
1959            }
1960        "#;
1961
1962        let result = parse_sdl(sdl);
1963        assert!(result.is_err());
1964        let err = result.unwrap_err().to_string();
1965        assert!(err.contains("multi"), "Error should mention multi: {}", err);
1966    }
1967
1968    #[test]
1969    fn test_no_primary_field() {
1970        // Schema without primary field should work fine
1971        let sdl = r#"
1972            index documents {
1973                field title: text [indexed, stored]
1974            }
1975        "#;
1976
1977        let indexes = parse_sdl(sdl).unwrap();
1978        let schema = indexes[0].to_schema();
1979        assert!(schema.primary_field().is_none());
1980    }
1981}