Skip to main content

hermes_core/dsl/sdl/
mod.rs

1//! Schema Definition Language (SDL) for Hermes
2//!
3//! A simple, readable format for defining index schemas using pest parser.
4//!
5//! # Example SDL
6//!
7//! ```text
8//! # Article index schema
9//! index articles {
10//!     # Primary text field for full-text search
11//!     field title: text [indexed, stored]
12//!
13//!     # Body content - indexed but not stored (save space)
14//!     field body: text [indexed]
15//!
16//!     # Author name
17//!     field author: text [indexed, stored]
18//!
19//!     # Publication timestamp
20//!     field published_at: i64 [indexed, stored]
21//!
22//!     # View count
23//!     field views: u64 [indexed, stored]
24//!
25//!     # Rating score
26//!     field rating: f64 [indexed, stored]
27//!
28//!     # Raw content hash (not indexed, just stored)
29//!     field content_hash: bytes [stored]
30//!
31//!     # Dense vector with IVF-RaBitQ index
32//!     field embedding: dense_vector<768> [indexed<rabitq, centroids: "centroids.bin", nprobe: 32>]
33//!
34//! }
35//! ```
36//!
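//! # Dense Vector Index Configuration
//!
//! Index-related parameters for dense vectors are specified in `indexed<...>`:
//! - `flat`, `rabitq`, `ivf_rabitq` or `scann` - index type (also accepted as `index: <type>`)
//! - `num_clusters: N` - number of clusters for IVF-based indexes
//! - `nprobe: N` - number of clusters to probe (default: 32)
//! - `build_threshold: N` - build threshold for the vector index
//! - `centroids: "path"` - path to pre-trained centroids file
//! - `codebook: "path"` - path to PQ codebook (ScaNN only)
//!
//! NOTE(review): `centroids` and `codebook` are not stored by the `indexed<...>`
//! parameter parsing in this module - confirm whether they are still supported.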
44
45use pest::Parser;
46use pest_derive::Parser;
47
48use super::query_field_router::{QueryRouterRule, RoutingMode};
49use super::schema::{DenseVectorQuantization, FieldType, Schema, SchemaBuilder};
50use crate::Result;
51use crate::error::Error;
52
/// Pest parser for the SDL grammar defined in `dsl/sdl/sdl.pest`.
///
/// The derive also produces the `Rule` enum that the parsing helpers in this
/// module match on.
#[derive(Parser)]
#[grammar = "dsl/sdl/sdl.pest"]
pub struct SdlParser;
56
57use super::schema::DenseVectorConfig;
58use crate::structures::{
59    IndexSize, QueryWeighting, SparseFormat, SparseQueryConfig, SparseVectorConfig,
60    WeightQuantization,
61};
62
/// Parsed field definition
///
/// One entry per `field` declaration; consumed by `IndexDef::to_schema` to
/// register the field on a `SchemaBuilder`.
#[derive(Debug, Clone)]
pub struct FieldDef {
    /// Field name as written in the SDL
    pub name: String,
    /// Value type of the field
    pub field_type: FieldType,
    /// Whether the field is indexed
    pub indexed: bool,
    /// Whether the field is stored
    pub stored: bool,
    /// Tokenizer name for text fields (e.g., "simple", "en_stem", "german")
    pub tokenizer: Option<String>,
    /// Whether this field can have multiple values (serialized as array in JSON)
    pub multi: bool,
    /// Position tracking mode for phrase queries and multi-field element tracking
    pub positions: Option<super::schema::PositionMode>,
    /// Configuration for sparse vector fields
    pub sparse_vector_config: Option<SparseVectorConfig>,
    /// Configuration for dense vector fields
    pub dense_vector_config: Option<DenseVectorConfig>,
    /// Whether this field has columnar fast-field storage
    pub fast: bool,
    /// Whether this field is a primary key (unique constraint)
    pub primary: bool,
}
85
/// Parsed index definition
///
/// Holds everything read from one `index <name> { ... }` declaration.
#[derive(Debug, Clone)]
pub struct IndexDef {
    /// Index name
    pub name: String,
    /// Field definitions, in the order they are registered on the schema
    pub fields: Vec<FieldDef>,
    /// Default field names; only forwarded to the schema when non-empty
    pub default_fields: Vec<String>,
    /// Query router rules for routing queries to specific fields
    pub query_routers: Vec<QueryRouterRule>,
}
95
96impl IndexDef {
97    /// Convert to a Schema
98    pub fn to_schema(&self) -> Schema {
99        let mut builder = SchemaBuilder::default();
100
101        for field in &self.fields {
102            let f = match field.field_type {
103                FieldType::Text => {
104                    let tokenizer = field.tokenizer.as_deref().unwrap_or("simple");
105                    builder.add_text_field_with_tokenizer(
106                        &field.name,
107                        field.indexed,
108                        field.stored,
109                        tokenizer,
110                    )
111                }
112                FieldType::U64 => builder.add_u64_field(&field.name, field.indexed, field.stored),
113                FieldType::I64 => builder.add_i64_field(&field.name, field.indexed, field.stored),
114                FieldType::F64 => builder.add_f64_field(&field.name, field.indexed, field.stored),
115                FieldType::Bytes => builder.add_bytes_field(&field.name, field.stored),
116                FieldType::Json => builder.add_json_field(&field.name, field.stored),
117                FieldType::SparseVector => {
118                    if let Some(config) = &field.sparse_vector_config {
119                        builder.add_sparse_vector_field_with_config(
120                            &field.name,
121                            field.indexed,
122                            field.stored,
123                            config.clone(),
124                        )
125                    } else {
126                        builder.add_sparse_vector_field(&field.name, field.indexed, field.stored)
127                    }
128                }
129                FieldType::DenseVector => {
130                    // Dense vector dimension must be specified via config
131                    let config = field
132                        .dense_vector_config
133                        .as_ref()
134                        .expect("DenseVector field requires dimension to be specified");
135                    builder.add_dense_vector_field_with_config(
136                        &field.name,
137                        field.indexed,
138                        field.stored,
139                        config.clone(),
140                    )
141                }
142            };
143            if field.multi {
144                builder.set_multi(f, true);
145            }
146            if field.fast {
147                builder.set_fast(f, true);
148            }
149            if field.primary {
150                builder.set_primary_key(f);
151            }
152            // Set positions: explicit > auto (ordinal for multi vectors)
153            let positions = field.positions.or({
154                // Auto-set ordinal positions for multi-valued vector fields
155                if field.multi
156                    && matches!(
157                        field.field_type,
158                        FieldType::SparseVector | FieldType::DenseVector
159                    )
160                {
161                    Some(super::schema::PositionMode::Ordinal)
162                } else {
163                    None
164                }
165            });
166            if let Some(mode) = positions {
167                builder.set_positions(f, mode);
168            }
169        }
170
171        // Set default fields if specified
172        if !self.default_fields.is_empty() {
173            builder.set_default_fields(self.default_fields.clone());
174        }
175
176        // Set query routers if specified
177        if !self.query_routers.is_empty() {
178            builder.set_query_routers(self.query_routers.clone());
179        }
180
181        builder.build()
182    }
183
184    /// Create a QueryFieldRouter from the query router rules
185    ///
186    /// Returns None if there are no query router rules defined.
187    /// Returns Err if any regex pattern is invalid.
188    pub fn to_query_router(&self) -> Result<Option<super::query_field_router::QueryFieldRouter>> {
189        if self.query_routers.is_empty() {
190            return Ok(None);
191        }
192
193        super::query_field_router::QueryFieldRouter::from_rules(&self.query_routers)
194            .map(Some)
195            .map_err(Error::Schema)
196    }
197}
198
199/// Parse field type from string
200fn parse_field_type(type_str: &str) -> Result<FieldType> {
201    match type_str {
202        "text" | "string" | "str" => Ok(FieldType::Text),
203        "u64" | "uint" | "unsigned" => Ok(FieldType::U64),
204        "i64" | "int" | "integer" => Ok(FieldType::I64),
205        "f64" | "float" | "double" => Ok(FieldType::F64),
206        "bytes" | "binary" | "blob" => Ok(FieldType::Bytes),
207        "json" => Ok(FieldType::Json),
208        "sparse_vector" => Ok(FieldType::SparseVector),
209        "dense_vector" | "vector" => Ok(FieldType::DenseVector),
210        _ => Err(Error::Schema(format!("Unknown field type: {}", type_str))),
211    }
212}
213
/// Index configuration parsed from indexed<...> attribute
///
/// Every field is optional: `None` means the parameter did not appear in the
/// attribute.
#[derive(Debug, Clone, Default)]
struct IndexConfig {
    /// Vector index type (`flat`, `rabitq`, `ivf_rabitq`, `scann`)
    index_type: Option<super::schema::VectorIndexType>,
    /// `num_clusters: N`
    num_clusters: Option<usize>,
    /// `nprobe: N`
    nprobe: Option<usize>,
    /// `build_threshold: N`
    build_threshold: Option<usize>,
    // Sparse vector index params
    /// `format: bmp | maxscore`
    sparse_format: Option<SparseFormat>,
    /// `quantization: float32 | float16 | uint8 | uint4` (short forms also accepted)
    quantization: Option<WeightQuantization>,
    /// `weight_threshold: F`
    weight_threshold: Option<f32>,
    /// `block_size: N`
    block_size: Option<usize>,
    /// `pruning: F`
    pruning: Option<f32>,
    /// Value of the min-terms parameter
    min_terms: Option<usize>,
    // Sparse vector query-time config
    /// `tokenizer: "..."` inside `query<...>`
    query_tokenizer: Option<String>,
    /// `weighting: one | idf | idf_file` inside `query<...>`
    query_weighting: Option<QueryWeighting>,
    /// Query-time weight threshold
    query_weight_threshold: Option<f32>,
    /// Query-time maximum number of dimensions
    query_max_dims: Option<usize>,
    /// Query-time pruning value
    query_pruning: Option<f32>,
    /// Query-time minimum number of query dimensions
    query_min_query_dims: Option<usize>,
    // BMP fixed dims (vocabulary size) and max weight scale
    dims: Option<u32>,
    max_weight: Option<f32>,
    // Position tracking mode for phrase queries
    positions: Option<super::schema::PositionMode>,
}
241
242/// Parse attributes from pest pair
243/// Returns (indexed, stored, multi, fast, primary, index_config)
244/// positions is now inside index_config (via indexed<positions> or indexed<ordinal> etc.)
245/// multi is now inside stored<multi>
246fn parse_attributes(
247    pair: pest::iterators::Pair<Rule>,
248) -> (bool, bool, bool, bool, bool, Option<IndexConfig>) {
249    let mut indexed = false;
250    let mut stored = false;
251    let mut multi = false;
252    let mut fast = false;
253    let mut primary = false;
254    let mut index_config = None;
255
256    for attr in pair.into_inner() {
257        if attr.as_rule() == Rule::attribute {
258            // attribute = { indexed_with_config | "indexed" | stored_with_config | "stored" | "fast" | "primary" }
259            let mut found_config = false;
260            for inner in attr.clone().into_inner() {
261                match inner.as_rule() {
262                    Rule::indexed_with_config => {
263                        indexed = true;
264                        index_config = Some(parse_index_config(inner));
265                        found_config = true;
266                        break;
267                    }
268                    Rule::stored_with_config => {
269                        stored = true;
270                        multi = true; // stored<multi>
271                        found_config = true;
272                        break;
273                    }
274                    _ => {}
275                }
276            }
277            if !found_config {
278                // Simple attribute
279                match attr.as_str() {
280                    "indexed" => indexed = true,
281                    "stored" => stored = true,
282                    "fast" => fast = true,
283                    "primary" => primary = true,
284                    _ => {}
285                }
286            }
287        }
288    }
289
290    (indexed, stored, multi, fast, primary, index_config)
291}
292
293/// Parse index configuration from indexed<...> attribute
294fn parse_index_config(pair: pest::iterators::Pair<Rule>) -> IndexConfig {
295    let mut config = IndexConfig::default();
296
297    // indexed_with_config = { "indexed" ~ "<" ~ index_config_params ~ ">" }
298    // index_config_params = { index_config_param ~ ("," ~ index_config_param)* }
299    // index_config_param = { index_type_kwarg | centroids_kwarg | codebook_kwarg | nprobe_kwarg | index_type_spec }
300
301    for inner in pair.into_inner() {
302        if inner.as_rule() == Rule::index_config_params {
303            for param in inner.into_inner() {
304                if param.as_rule() == Rule::index_config_param {
305                    for p in param.into_inner() {
306                        parse_single_index_config_param(&mut config, p);
307                    }
308                }
309            }
310        }
311    }
312
313    config
314}
315
316/// Parse a single index config parameter
317fn parse_single_index_config_param(config: &mut IndexConfig, p: pest::iterators::Pair<Rule>) {
318    use super::schema::VectorIndexType;
319
320    match p.as_rule() {
321        Rule::index_type_spec => {
322            config.index_type = Some(match p.as_str() {
323                "flat" => VectorIndexType::Flat,
324                "rabitq" => VectorIndexType::RaBitQ,
325                "ivf_rabitq" => VectorIndexType::IvfRaBitQ,
326                "scann" => VectorIndexType::ScaNN,
327                _ => VectorIndexType::RaBitQ,
328            });
329        }
330        Rule::index_type_kwarg => {
331            // index_type_kwarg = { "index" ~ ":" ~ index_type_spec }
332            if let Some(t) = p.into_inner().next() {
333                config.index_type = Some(match t.as_str() {
334                    "flat" => VectorIndexType::Flat,
335                    "rabitq" => VectorIndexType::RaBitQ,
336                    "ivf_rabitq" => VectorIndexType::IvfRaBitQ,
337                    "scann" => VectorIndexType::ScaNN,
338                    _ => VectorIndexType::RaBitQ,
339                });
340            }
341        }
342        Rule::num_clusters_kwarg => {
343            // num_clusters_kwarg = { "num_clusters" ~ ":" ~ num_clusters_spec }
344            if let Some(n) = p.into_inner().next() {
345                config.num_clusters = Some(n.as_str().parse().unwrap_or_else(|_| {
346                    log::warn!(
347                        "Invalid num_clusters value '{}', using default 256",
348                        n.as_str()
349                    );
350                    256
351                }));
352            }
353        }
354        Rule::build_threshold_kwarg => {
355            // build_threshold_kwarg = { "build_threshold" ~ ":" ~ build_threshold_spec }
356            if let Some(n) = p.into_inner().next() {
357                config.build_threshold = Some(n.as_str().parse().unwrap_or_else(|_| {
358                    log::warn!(
359                        "Invalid build_threshold value '{}', using default 10000",
360                        n.as_str()
361                    );
362                    10000
363                }));
364            }
365        }
366        Rule::nprobe_kwarg => {
367            // nprobe_kwarg = { "nprobe" ~ ":" ~ nprobe_spec }
368            if let Some(n) = p.into_inner().next() {
369                config.nprobe = Some(n.as_str().parse().unwrap_or_else(|_| {
370                    log::warn!("Invalid nprobe value '{}', using default 32", n.as_str());
371                    32
372                }));
373            }
374        }
375        Rule::quantization_kwarg => {
376            // quantization_kwarg = { "quantization" ~ ":" ~ quantization_spec }
377            if let Some(q) = p.into_inner().next() {
378                config.quantization = Some(match q.as_str() {
379                    "float32" | "f32" => WeightQuantization::Float32,
380                    "float16" | "f16" => WeightQuantization::Float16,
381                    "uint8" | "u8" => WeightQuantization::UInt8,
382                    "uint4" | "u4" => WeightQuantization::UInt4,
383                    _ => WeightQuantization::default(),
384                });
385            }
386        }
387        Rule::weight_threshold_kwarg => {
388            // weight_threshold_kwarg = { "weight_threshold" ~ ":" ~ weight_threshold_spec }
389            if let Some(t) = p.into_inner().next() {
390                config.weight_threshold = Some(t.as_str().parse().unwrap_or_else(|_| {
391                    log::warn!(
392                        "Invalid weight_threshold value '{}', using default 0.0",
393                        t.as_str()
394                    );
395                    0.0
396                }));
397            }
398        }
399        Rule::block_size_kwarg => {
400            // block_size_kwarg = { "block_size" ~ ":" ~ block_size_spec }
401            if let Some(n) = p.into_inner().next() {
402                config.block_size = Some(n.as_str().parse().unwrap_or_else(|_| {
403                    log::warn!(
404                        "Invalid block_size value '{}', using default 128",
405                        n.as_str()
406                    );
407                    128
408                }));
409            }
410        }
411        Rule::pruning_kwarg => {
412            // pruning_kwarg = { "pruning" ~ ":" ~ pruning_spec }
413            if let Some(f) = p.into_inner().next() {
414                config.pruning = Some(f.as_str().parse().unwrap_or_else(|_| {
415                    log::warn!("Invalid pruning value '{}', using default 1.0", f.as_str());
416                    1.0
417                }));
418            }
419        }
420        Rule::min_terms_kwarg => {
421            if let Some(n) = p.into_inner().next() {
422                config.min_terms = Some(n.as_str().parse().unwrap_or_else(|_| {
423                    log::warn!("Invalid min_terms value '{}', using default 4", n.as_str());
424                    4
425                }));
426            }
427        }
428        Rule::sparse_format_kwarg => {
429            // sparse_format_kwarg = { "format" ~ ":" ~ sparse_format_spec }
430            if let Some(f) = p.into_inner().next() {
431                config.sparse_format = Some(match f.as_str() {
432                    "bmp" => SparseFormat::Bmp,
433                    "maxscore" => SparseFormat::MaxScore,
434                    _ => SparseFormat::default(),
435                });
436            }
437        }
438        Rule::sparse_dims_kwarg => {
439            if let Some(n) = p.into_inner().next() {
440                config.dims = Some(n.as_str().parse().unwrap_or_else(|_| {
441                    log::warn!("Invalid dims value '{}', using default 105879", n.as_str());
442                    105879
443                }));
444            }
445        }
446        Rule::sparse_max_weight_kwarg => {
447            if let Some(f) = p.into_inner().next() {
448                config.max_weight = Some(f.as_str().parse().unwrap_or_else(|_| {
449                    log::warn!(
450                        "Invalid max_weight value '{}', using default 5.0",
451                        f.as_str()
452                    );
453                    5.0
454                }));
455            }
456        }
457        Rule::query_config_block => {
458            // query_config_block = { "query" ~ "<" ~ query_config_params ~ ">" }
459            parse_query_config_block(config, p);
460        }
461        Rule::positions_kwarg => {
462            // positions_kwarg = { "positions" | "ordinal" | "token_position" }
463            use super::schema::PositionMode;
464            config.positions = Some(match p.as_str() {
465                "ordinal" => PositionMode::Ordinal,
466                "token_position" => PositionMode::TokenPosition,
467                _ => PositionMode::Full, // "positions" or any other value defaults to Full
468            });
469        }
470        _ => {}
471    }
472}
473
474/// Parse query configuration block: query<tokenizer: "...", weighting: idf>
475fn parse_query_config_block(config: &mut IndexConfig, pair: pest::iterators::Pair<Rule>) {
476    for inner in pair.into_inner() {
477        if inner.as_rule() == Rule::query_config_params {
478            for param in inner.into_inner() {
479                if param.as_rule() == Rule::query_config_param {
480                    for p in param.into_inner() {
481                        match p.as_rule() {
482                            Rule::query_tokenizer_kwarg => {
483                                // query_tokenizer_kwarg = { "tokenizer" ~ ":" ~ tokenizer_path }
484                                if let Some(path) = p.into_inner().next()
485                                    && let Some(inner_path) = path.into_inner().next()
486                                {
487                                    config.query_tokenizer = Some(inner_path.as_str().to_string());
488                                }
489                            }
490                            Rule::query_weighting_kwarg => {
491                                // query_weighting_kwarg = { "weighting" ~ ":" ~ weighting_spec }
492                                if let Some(w) = p.into_inner().next() {
493                                    config.query_weighting = Some(match w.as_str() {
494                                        "one" => QueryWeighting::One,
495                                        "idf" => QueryWeighting::Idf,
496                                        "idf_file" => QueryWeighting::IdfFile,
497                                        _ => QueryWeighting::One,
498                                    });
499                                }
500                            }
501                            Rule::query_weight_threshold_kwarg => {
502                                if let Some(t) = p.into_inner().next() {
503                                    config.query_weight_threshold =
504                                        Some(t.as_str().parse().unwrap_or_else(|_| {
505                                            log::warn!(
506                                                "Invalid query weight_threshold '{}', using 0.0",
507                                                t.as_str()
508                                            );
509                                            0.0
510                                        }));
511                                }
512                            }
513                            Rule::query_max_dims_kwarg => {
514                                if let Some(t) = p.into_inner().next() {
515                                    config.query_max_dims =
516                                        Some(t.as_str().parse().unwrap_or_else(|_| {
517                                            log::warn!(
518                                                "Invalid query max_dims '{}', using 0",
519                                                t.as_str()
520                                            );
521                                            0
522                                        }));
523                                }
524                            }
525                            Rule::query_pruning_kwarg => {
526                                if let Some(t) = p.into_inner().next() {
527                                    config.query_pruning =
528                                        Some(t.as_str().parse().unwrap_or_else(|_| {
529                                            log::warn!(
530                                                "Invalid query pruning '{}', using 1.0",
531                                                t.as_str()
532                                            );
533                                            1.0
534                                        }));
535                                }
536                            }
537                            Rule::query_min_query_dims_kwarg => {
538                                if let Some(t) = p.into_inner().next() {
539                                    config.query_min_query_dims =
540                                        Some(t.as_str().parse().unwrap_or_else(|_| {
541                                            log::warn!(
542                                                "Invalid query min_query_dims '{}', using 4",
543                                                t.as_str()
544                                            );
545                                            4
546                                        }));
547                                }
548                            }
549                            _ => {}
550                        }
551                    }
552                }
553            }
554        }
555    }
556}
557
558/// Parse a field definition from pest pair
559fn parse_field_def(pair: pest::iterators::Pair<Rule>) -> Result<FieldDef> {
560    let mut inner = pair.into_inner();
561
562    let name = inner
563        .next()
564        .ok_or_else(|| Error::Schema("Missing field name".to_string()))?
565        .as_str()
566        .to_string();
567
568    let field_type_str = inner
569        .next()
570        .ok_or_else(|| Error::Schema("Missing field type".to_string()))?
571        .as_str();
572
573    let field_type = parse_field_type(field_type_str)?;
574
575    // Parse optional tokenizer spec, sparse_vector_config, dense_vector_config, and attributes
576    let mut tokenizer = None;
577    let mut sparse_vector_config = None;
578    let mut dense_vector_config = None;
579    let mut indexed = true;
580    let mut stored = true;
581    let mut multi = false;
582    let mut fast = false;
583    let mut primary = false;
584    let mut index_config: Option<IndexConfig> = None;
585
586    for item in inner {
587        match item.as_rule() {
588            Rule::tokenizer_spec => {
589                // Extract tokenizer name from <name>
590                if let Some(tok_name) = item.into_inner().next() {
591                    tokenizer = Some(tok_name.as_str().to_string());
592                }
593            }
594            Rule::sparse_vector_config => {
595                // Parse named parameters: <index_size: u16, quantization: uint8, weight_threshold: 0.1>
596                sparse_vector_config = Some(parse_sparse_vector_config(item));
597            }
598            Rule::dense_vector_config => {
599                // Parse dense_vector_params (keyword or positional) - only dims
600                dense_vector_config = Some(parse_dense_vector_config(item));
601            }
602            Rule::attributes => {
603                let (idx, sto, mul, fst, pri, idx_cfg) = parse_attributes(item);
604                indexed = idx;
605                stored = sto;
606                multi = mul;
607                fast = fst;
608                primary = pri;
609                index_config = idx_cfg;
610            }
611            _ => {}
612        }
613    }
614
615    // Primary key implies fast + indexed (needed for dedup lookups)
616    if primary {
617        fast = true;
618        indexed = true;
619    }
620
621    // Merge index config into vector configs if both exist
622    let mut positions = None;
623    if let Some(idx_cfg) = index_config {
624        positions = idx_cfg.positions;
625        if let Some(ref mut dv_config) = dense_vector_config {
626            apply_index_config_to_dense_vector(dv_config, idx_cfg);
627        } else if field_type == FieldType::SparseVector {
628            // For sparse vectors, create default config if not present and apply index params
629            let sv_config = sparse_vector_config.get_or_insert(SparseVectorConfig::default());
630            apply_index_config_to_sparse_vector(sv_config, idx_cfg);
631        }
632    }
633
634    Ok(FieldDef {
635        name,
636        field_type,
637        indexed,
638        stored,
639        tokenizer,
640        multi,
641        positions,
642        sparse_vector_config,
643        dense_vector_config,
644        fast,
645        primary,
646    })
647}
648
649/// Apply index configuration from indexed<...> to DenseVectorConfig
650fn apply_index_config_to_dense_vector(config: &mut DenseVectorConfig, idx_cfg: IndexConfig) {
651    // Apply index type if specified
652    if let Some(index_type) = idx_cfg.index_type {
653        config.index_type = index_type;
654    }
655
656    // Apply num_clusters for IVF-based indexes
657    if idx_cfg.num_clusters.is_some() {
658        config.num_clusters = idx_cfg.num_clusters;
659    }
660
661    // Apply nprobe if specified
662    if let Some(nprobe) = idx_cfg.nprobe {
663        config.nprobe = nprobe;
664    }
665
666    // Apply build_threshold if specified
667    if idx_cfg.build_threshold.is_some() {
668        config.build_threshold = idx_cfg.build_threshold;
669    }
670}
671
672/// Parse sparse_vector_config - only index_size (positional)
673/// Example: <u16> or <u32>
674fn parse_sparse_vector_config(pair: pest::iterators::Pair<Rule>) -> SparseVectorConfig {
675    let mut index_size = IndexSize::default();
676
677    // Parse positional index_size_spec
678    for inner in pair.into_inner() {
679        if inner.as_rule() == Rule::index_size_spec {
680            index_size = match inner.as_str() {
681                "u16" => IndexSize::U16,
682                "u32" => IndexSize::U32,
683                _ => IndexSize::default(),
684            };
685        }
686    }
687
688    SparseVectorConfig {
689        format: SparseFormat::default(),
690        index_size,
691        weight_quantization: WeightQuantization::default(),
692        weight_threshold: 0.0,
693        block_size: 128,
694        bmp_block_size: 64,
695        max_bmp_grid_bytes: 0,
696        bmp_superblock_size: 64,
697        pruning: None,
698        query_config: None,
699        dims: None,
700        max_weight: None,
701        min_terms: 4,
702    }
703}
704
705/// Apply index configuration from indexed<...> to SparseVectorConfig
706fn apply_index_config_to_sparse_vector(config: &mut SparseVectorConfig, idx_cfg: IndexConfig) {
707    if let Some(f) = idx_cfg.sparse_format {
708        config.format = f;
709    }
710    if let Some(q) = idx_cfg.quantization {
711        config.weight_quantization = q;
712    }
713    if let Some(t) = idx_cfg.weight_threshold {
714        config.weight_threshold = t;
715    }
716    if let Some(bs) = idx_cfg.block_size {
717        let adjusted = bs.next_power_of_two();
718        if adjusted != bs {
719            log::warn!(
720                "block_size {} adjusted to next power of two: {}",
721                bs,
722                adjusted
723            );
724        }
725        config.block_size = adjusted;
726    }
727    if let Some(p) = idx_cfg.pruning {
728        let clamped = p.clamp(0.0, 1.0);
729        if (clamped - p).abs() > f32::EPSILON {
730            log::warn!(
731                "pruning {} clamped to valid range [0.0, 1.0]: {}",
732                p,
733                clamped
734            );
735        }
736        config.pruning = Some(clamped);
737    }
738    if let Some(mt) = idx_cfg.min_terms {
739        config.min_terms = mt;
740    }
741    if let Some(d) = idx_cfg.dims {
742        config.dims = Some(d);
743    }
744    if let Some(mw) = idx_cfg.max_weight {
745        config.max_weight = Some(mw);
746    }
747    // Apply query-time configuration if present
748    if idx_cfg.query_tokenizer.is_some()
749        || idx_cfg.query_weighting.is_some()
750        || idx_cfg.query_weight_threshold.is_some()
751        || idx_cfg.query_max_dims.is_some()
752        || idx_cfg.query_pruning.is_some()
753        || idx_cfg.query_min_query_dims.is_some()
754    {
755        let query_config = config
756            .query_config
757            .get_or_insert(SparseQueryConfig::default());
758        if let Some(tokenizer) = idx_cfg.query_tokenizer {
759            query_config.tokenizer = Some(tokenizer);
760        }
761        if let Some(weighting) = idx_cfg.query_weighting {
762            query_config.weighting = weighting;
763        }
764        if let Some(t) = idx_cfg.query_weight_threshold {
765            query_config.weight_threshold = t;
766        }
767        if let Some(d) = idx_cfg.query_max_dims {
768            query_config.max_query_dims = Some(d);
769        }
770        if let Some(p) = idx_cfg.query_pruning {
771            query_config.pruning = Some(p);
772        }
773        if let Some(m) = idx_cfg.query_min_query_dims {
774            query_config.min_query_dims = m;
775        }
776    }
777}
778
779/// Parse dense_vector_config - dims and optional quantization type
780/// All index-related params are in indexed<...> attribute
781fn parse_dense_vector_config(pair: pest::iterators::Pair<Rule>) -> DenseVectorConfig {
782    let mut dim: usize = 0;
783    let mut quantization = DenseVectorQuantization::F32;
784
785    // Navigate to dense_vector_params
786    for params in pair.into_inner() {
787        if params.as_rule() == Rule::dense_vector_params {
788            for inner in params.into_inner() {
789                match inner.as_rule() {
790                    Rule::dense_vector_keyword_params => {
791                        for kwarg in inner.into_inner() {
792                            match kwarg.as_rule() {
793                                Rule::dims_kwarg => {
794                                    if let Some(d) = kwarg.into_inner().next() {
795                                        dim = d.as_str().parse().unwrap_or(0);
796                                    }
797                                }
798                                Rule::quant_type_spec => {
799                                    quantization = parse_quant_type(kwarg.as_str());
800                                }
801                                _ => {}
802                            }
803                        }
804                    }
805                    Rule::dense_vector_positional_params => {
806                        for item in inner.into_inner() {
807                            match item.as_rule() {
808                                Rule::dimension_spec => {
809                                    dim = item.as_str().parse().unwrap_or(0);
810                                }
811                                Rule::quant_type_spec => {
812                                    quantization = parse_quant_type(item.as_str());
813                                }
814                                _ => {}
815                            }
816                        }
817                    }
818                    _ => {}
819                }
820            }
821        }
822    }
823
824    DenseVectorConfig::new(dim).with_quantization(quantization)
825}
826
827fn parse_quant_type(s: &str) -> DenseVectorQuantization {
828    match s.trim() {
829        "f16" => DenseVectorQuantization::F16,
830        "uint8" | "u8" => DenseVectorQuantization::UInt8,
831        _ => DenseVectorQuantization::F32,
832    }
833}
834
835/// Parse default_fields definition
836fn parse_default_fields_def(pair: pest::iterators::Pair<Rule>) -> Vec<String> {
837    pair.into_inner().map(|p| p.as_str().to_string()).collect()
838}
839
840/// Parse a query router definition
841fn parse_query_router_def(pair: pest::iterators::Pair<Rule>) -> Result<QueryRouterRule> {
842    let mut pattern = String::new();
843    let mut substitution = String::new();
844    let mut target_field = String::new();
845    let mut mode = RoutingMode::Additional;
846
847    for prop in pair.into_inner() {
848        if prop.as_rule() != Rule::query_router_prop {
849            continue;
850        }
851
852        for inner in prop.into_inner() {
853            match inner.as_rule() {
854                Rule::query_router_pattern => {
855                    if let Some(regex_str) = inner.into_inner().next() {
856                        pattern = parse_string_value(regex_str);
857                    }
858                }
859                Rule::query_router_substitution => {
860                    if let Some(quoted) = inner.into_inner().next() {
861                        substitution = parse_string_value(quoted);
862                    }
863                }
864                Rule::query_router_target => {
865                    if let Some(ident) = inner.into_inner().next() {
866                        target_field = ident.as_str().to_string();
867                    }
868                }
869                Rule::query_router_mode => {
870                    if let Some(mode_val) = inner.into_inner().next() {
871                        mode = match mode_val.as_str() {
872                            "exclusive" => RoutingMode::Exclusive,
873                            "additional" => RoutingMode::Additional,
874                            _ => RoutingMode::Additional,
875                        };
876                    }
877                }
878                _ => {}
879            }
880        }
881    }
882
883    if pattern.is_empty() {
884        return Err(Error::Schema("query_router missing 'pattern'".to_string()));
885    }
886    if substitution.is_empty() {
887        return Err(Error::Schema(
888            "query_router missing 'substitution'".to_string(),
889        ));
890    }
891    if target_field.is_empty() {
892        return Err(Error::Schema(
893            "query_router missing 'target_field'".to_string(),
894        ));
895    }
896
897    Ok(QueryRouterRule {
898        pattern,
899        substitution,
900        target_field,
901        mode,
902    })
903}
904
905/// Parse a string value from quoted_string, raw_string, or regex_string
906fn parse_string_value(pair: pest::iterators::Pair<Rule>) -> String {
907    let s = pair.as_str();
908    match pair.as_rule() {
909        Rule::regex_string => {
910            // regex_string contains either raw_string or quoted_string
911            if let Some(inner) = pair.into_inner().next() {
912                parse_string_value(inner)
913            } else {
914                s.to_string()
915            }
916        }
917        Rule::raw_string => {
918            // r"..." - strip r" prefix and " suffix
919            s[2..s.len() - 1].to_string()
920        }
921        Rule::quoted_string => {
922            // "..." - strip quotes and handle escapes
923            let inner = &s[1..s.len() - 1];
924            // Simple escape handling
925            inner
926                .replace("\\n", "\n")
927                .replace("\\t", "\t")
928                .replace("\\\"", "\"")
929                .replace("\\\\", "\\")
930        }
931        _ => s.to_string(),
932    }
933}
934
935/// Parse an index definition from pest pair
936fn parse_index_def(pair: pest::iterators::Pair<Rule>) -> Result<IndexDef> {
937    let mut inner = pair.into_inner();
938
939    let name = inner
940        .next()
941        .ok_or_else(|| Error::Schema("Missing index name".to_string()))?
942        .as_str()
943        .to_string();
944
945    let mut fields = Vec::new();
946    let mut default_fields = Vec::new();
947    let mut query_routers = Vec::new();
948
949    for item in inner {
950        match item.as_rule() {
951            Rule::field_def => {
952                fields.push(parse_field_def(item)?);
953            }
954            Rule::default_fields_def => {
955                default_fields = parse_default_fields_def(item);
956            }
957            Rule::query_router_def => {
958                query_routers.push(parse_query_router_def(item)?);
959            }
960            _ => {}
961        }
962    }
963
964    // Validate primary key constraints
965    let primary_fields: Vec<&FieldDef> = fields.iter().filter(|f| f.primary).collect();
966    if primary_fields.len() > 1 {
967        return Err(Error::Schema(format!(
968            "Index '{}' has {} primary key fields, but at most one is allowed",
969            name,
970            primary_fields.len()
971        )));
972    }
973    if let Some(pk) = primary_fields.first() {
974        if pk.field_type != FieldType::Text {
975            return Err(Error::Schema(format!(
976                "Primary key field '{}' must be of type text, got {:?}",
977                pk.name, pk.field_type
978            )));
979        }
980        if pk.multi {
981            return Err(Error::Schema(format!(
982                "Primary key field '{}' cannot be multi-valued",
983                pk.name
984            )));
985        }
986    }
987
988    Ok(IndexDef {
989        name,
990        fields,
991        default_fields,
992        query_routers,
993    })
994}
995
996/// Parse SDL from a string
997pub fn parse_sdl(input: &str) -> Result<Vec<IndexDef>> {
998    let pairs = SdlParser::parse(Rule::file, input)
999        .map_err(|e| Error::Schema(format!("Parse error: {}", e)))?;
1000
1001    let mut indexes = Vec::new();
1002
1003    for pair in pairs {
1004        if pair.as_rule() == Rule::file {
1005            for inner in pair.into_inner() {
1006                if inner.as_rule() == Rule::index_def {
1007                    indexes.push(parse_index_def(inner)?);
1008                }
1009            }
1010        }
1011    }
1012
1013    Ok(indexes)
1014}
1015
1016/// Parse SDL and return a single index definition
1017pub fn parse_single_index(input: &str) -> Result<IndexDef> {
1018    let indexes = parse_sdl(input)?;
1019
1020    if indexes.is_empty() {
1021        return Err(Error::Schema("No index definition found".to_string()));
1022    }
1023
1024    if indexes.len() > 1 {
1025        return Err(Error::Schema(
1026            "Multiple index definitions found, expected one".to_string(),
1027        ));
1028    }
1029
1030    Ok(indexes.into_iter().next().unwrap())
1031}
1032
1033#[cfg(test)]
1034mod tests {
1035    use super::*;
1036
1037    #[test]
1038    fn test_parse_simple_schema() {
1039        let sdl = r#"
1040            index articles {
1041                field title: text [indexed, stored]
1042                field body: text [indexed]
1043            }
1044        "#;
1045
1046        let indexes = parse_sdl(sdl).unwrap();
1047        assert_eq!(indexes.len(), 1);
1048
1049        let index = &indexes[0];
1050        assert_eq!(index.name, "articles");
1051        assert_eq!(index.fields.len(), 2);
1052
1053        assert_eq!(index.fields[0].name, "title");
1054        assert!(matches!(index.fields[0].field_type, FieldType::Text));
1055        assert!(index.fields[0].indexed);
1056        assert!(index.fields[0].stored);
1057
1058        assert_eq!(index.fields[1].name, "body");
1059        assert!(matches!(index.fields[1].field_type, FieldType::Text));
1060        assert!(index.fields[1].indexed);
1061        assert!(!index.fields[1].stored);
1062    }
1063
1064    #[test]
1065    fn test_parse_all_field_types() {
1066        let sdl = r#"
1067            index test {
1068                field text_field: text [indexed, stored]
1069                field u64_field: u64 [indexed, stored]
1070                field i64_field: i64 [indexed, stored]
1071                field f64_field: f64 [indexed, stored]
1072                field bytes_field: bytes [stored]
1073            }
1074        "#;
1075
1076        let indexes = parse_sdl(sdl).unwrap();
1077        let index = &indexes[0];
1078
1079        assert!(matches!(index.fields[0].field_type, FieldType::Text));
1080        assert!(matches!(index.fields[1].field_type, FieldType::U64));
1081        assert!(matches!(index.fields[2].field_type, FieldType::I64));
1082        assert!(matches!(index.fields[3].field_type, FieldType::F64));
1083        assert!(matches!(index.fields[4].field_type, FieldType::Bytes));
1084    }
1085
1086    #[test]
1087    fn test_parse_with_comments() {
1088        let sdl = r#"
1089            # This is a comment
1090            index articles {
1091                # Title field
1092                field title: text [indexed, stored]
1093                field body: text [indexed] # inline comment not supported yet
1094            }
1095        "#;
1096
1097        let indexes = parse_sdl(sdl).unwrap();
1098        assert_eq!(indexes[0].fields.len(), 2);
1099    }
1100
1101    #[test]
1102    fn test_parse_type_aliases() {
1103        let sdl = r#"
1104            index test {
1105                field a: string [indexed]
1106                field b: int [indexed]
1107                field c: uint [indexed]
1108                field d: float [indexed]
1109                field e: binary [stored]
1110            }
1111        "#;
1112
1113        let indexes = parse_sdl(sdl).unwrap();
1114        let index = &indexes[0];
1115
1116        assert!(matches!(index.fields[0].field_type, FieldType::Text));
1117        assert!(matches!(index.fields[1].field_type, FieldType::I64));
1118        assert!(matches!(index.fields[2].field_type, FieldType::U64));
1119        assert!(matches!(index.fields[3].field_type, FieldType::F64));
1120        assert!(matches!(index.fields[4].field_type, FieldType::Bytes));
1121    }
1122
1123    #[test]
1124    fn test_to_schema() {
1125        let sdl = r#"
1126            index articles {
1127                field title: text [indexed, stored]
1128                field views: u64 [indexed, stored]
1129            }
1130        "#;
1131
1132        let indexes = parse_sdl(sdl).unwrap();
1133        let schema = indexes[0].to_schema();
1134
1135        assert!(schema.get_field("title").is_some());
1136        assert!(schema.get_field("views").is_some());
1137        assert!(schema.get_field("nonexistent").is_none());
1138    }
1139
1140    #[test]
1141    fn test_default_attributes() {
1142        let sdl = r#"
1143            index test {
1144                field title: text
1145            }
1146        "#;
1147
1148        let indexes = parse_sdl(sdl).unwrap();
1149        let field = &indexes[0].fields[0];
1150
1151        // Default should be indexed and stored
1152        assert!(field.indexed);
1153        assert!(field.stored);
1154    }
1155
1156    #[test]
1157    fn test_multiple_indexes() {
1158        let sdl = r#"
1159            index articles {
1160                field title: text [indexed, stored]
1161            }
1162
1163            index users {
1164                field name: text [indexed, stored]
1165                field email: text [indexed, stored]
1166            }
1167        "#;
1168
1169        let indexes = parse_sdl(sdl).unwrap();
1170        assert_eq!(indexes.len(), 2);
1171        assert_eq!(indexes[0].name, "articles");
1172        assert_eq!(indexes[1].name, "users");
1173    }
1174
1175    #[test]
1176    fn test_tokenizer_spec() {
1177        let sdl = r#"
1178            index articles {
1179                field title: text<en_stem> [indexed, stored]
1180                field body: text<simple> [indexed]
1181                field author: text [indexed, stored]
1182            }
1183        "#;
1184
1185        let indexes = parse_sdl(sdl).unwrap();
1186        let index = &indexes[0];
1187
1188        assert_eq!(index.fields[0].name, "title");
1189        assert_eq!(index.fields[0].tokenizer, Some("en_stem".to_string()));
1190
1191        assert_eq!(index.fields[1].name, "body");
1192        assert_eq!(index.fields[1].tokenizer, Some("simple".to_string()));
1193
1194        assert_eq!(index.fields[2].name, "author");
1195        assert_eq!(index.fields[2].tokenizer, None); // No tokenizer specified
1196    }
1197
1198    #[test]
1199    fn test_tokenizer_in_schema() {
1200        let sdl = r#"
1201            index articles {
1202                field title: text<german> [indexed, stored]
1203                field body: text<en_stem> [indexed]
1204            }
1205        "#;
1206
1207        let indexes = parse_sdl(sdl).unwrap();
1208        let schema = indexes[0].to_schema();
1209
1210        let title_field = schema.get_field("title").unwrap();
1211        let title_entry = schema.get_field_entry(title_field).unwrap();
1212        assert_eq!(title_entry.tokenizer, Some("german".to_string()));
1213
1214        let body_field = schema.get_field("body").unwrap();
1215        let body_entry = schema.get_field_entry(body_field).unwrap();
1216        assert_eq!(body_entry.tokenizer, Some("en_stem".to_string()));
1217    }
1218
1219    #[test]
1220    fn test_query_router_basic() {
1221        let sdl = r#"
1222            index documents {
1223                field title: text [indexed, stored]
1224                field uri: text [indexed, stored]
1225
1226                query_router {
1227                    pattern: "10\\.\\d{4,}/[^\\s]+"
1228                    substitution: "doi://{0}"
1229                    target_field: uris
1230                    mode: exclusive
1231                }
1232            }
1233        "#;
1234
1235        let indexes = parse_sdl(sdl).unwrap();
1236        let index = &indexes[0];
1237
1238        assert_eq!(index.query_routers.len(), 1);
1239        let router = &index.query_routers[0];
1240        assert_eq!(router.pattern, r"10\.\d{4,}/[^\s]+");
1241        assert_eq!(router.substitution, "doi://{0}");
1242        assert_eq!(router.target_field, "uris");
1243        assert_eq!(router.mode, RoutingMode::Exclusive);
1244    }
1245
1246    #[test]
1247    fn test_query_router_raw_string() {
1248        let sdl = r#"
1249            index documents {
1250                field uris: text [indexed, stored]
1251
1252                query_router {
1253                    pattern: r"^pmid:(\d+)$"
1254                    substitution: "pubmed://{1}"
1255                    target_field: uris
1256                    mode: additional
1257                }
1258            }
1259        "#;
1260
1261        let indexes = parse_sdl(sdl).unwrap();
1262        let router = &indexes[0].query_routers[0];
1263
1264        assert_eq!(router.pattern, r"^pmid:(\d+)$");
1265        assert_eq!(router.substitution, "pubmed://{1}");
1266        assert_eq!(router.mode, RoutingMode::Additional);
1267    }
1268
1269    #[test]
1270    fn test_multiple_query_routers() {
1271        let sdl = r#"
1272            index documents {
1273                field uris: text [indexed, stored]
1274
1275                query_router {
1276                    pattern: r"^doi:(10\.\d{4,}/[^\s]+)$"
1277                    substitution: "doi://{1}"
1278                    target_field: uris
1279                    mode: exclusive
1280                }
1281
1282                query_router {
1283                    pattern: r"^pmid:(\d+)$"
1284                    substitution: "pubmed://{1}"
1285                    target_field: uris
1286                    mode: exclusive
1287                }
1288
1289                query_router {
1290                    pattern: r"^arxiv:(\d+\.\d+)$"
1291                    substitution: "arxiv://{1}"
1292                    target_field: uris
1293                    mode: additional
1294                }
1295            }
1296        "#;
1297
1298        let indexes = parse_sdl(sdl).unwrap();
1299        assert_eq!(indexes[0].query_routers.len(), 3);
1300    }
1301
1302    #[test]
1303    fn test_query_router_default_mode() {
1304        let sdl = r#"
1305            index documents {
1306                field uris: text [indexed, stored]
1307
1308                query_router {
1309                    pattern: r"test"
1310                    substitution: "{0}"
1311                    target_field: uris
1312                }
1313            }
1314        "#;
1315
1316        let indexes = parse_sdl(sdl).unwrap();
1317        // Default mode should be Additional
1318        assert_eq!(indexes[0].query_routers[0].mode, RoutingMode::Additional);
1319    }
1320
1321    #[test]
1322    fn test_multi_attribute() {
1323        let sdl = r#"
1324            index documents {
1325                field uris: text [indexed, stored<multi>]
1326                field title: text [indexed, stored]
1327            }
1328        "#;
1329
1330        let indexes = parse_sdl(sdl).unwrap();
1331        assert_eq!(indexes.len(), 1);
1332
1333        let fields = &indexes[0].fields;
1334        assert_eq!(fields.len(), 2);
1335
1336        // uris should have multi=true
1337        assert_eq!(fields[0].name, "uris");
1338        assert!(fields[0].multi, "uris field should have multi=true");
1339
1340        // title should have multi=false
1341        assert_eq!(fields[1].name, "title");
1342        assert!(!fields[1].multi, "title field should have multi=false");
1343
1344        // Verify schema conversion preserves multi attribute
1345        let schema = indexes[0].to_schema();
1346        let uris_field = schema.get_field("uris").unwrap();
1347        let title_field = schema.get_field("title").unwrap();
1348
1349        assert!(schema.get_field_entry(uris_field).unwrap().multi);
1350        assert!(!schema.get_field_entry(title_field).unwrap().multi);
1351    }
1352
1353    #[test]
1354    fn test_sparse_vector_field() {
1355        let sdl = r#"
1356            index documents {
1357                field embedding: sparse_vector [indexed, stored]
1358            }
1359        "#;
1360
1361        let indexes = parse_sdl(sdl).unwrap();
1362        assert_eq!(indexes.len(), 1);
1363        assert_eq!(indexes[0].fields.len(), 1);
1364        assert_eq!(indexes[0].fields[0].name, "embedding");
1365        assert_eq!(indexes[0].fields[0].field_type, FieldType::SparseVector);
1366        assert!(indexes[0].fields[0].sparse_vector_config.is_none());
1367    }
1368
1369    #[test]
1370    fn test_sparse_vector_with_config() {
1371        let sdl = r#"
1372            index documents {
1373                field embedding: sparse_vector<u16> [indexed<quantization: uint8>, stored]
1374                field dense: sparse_vector<u32> [indexed<quantization: float32>]
1375            }
1376        "#;
1377
1378        let indexes = parse_sdl(sdl).unwrap();
1379        assert_eq!(indexes[0].fields.len(), 2);
1380
1381        // First field: u16 indices, uint8 quantization
1382        let f1 = &indexes[0].fields[0];
1383        assert_eq!(f1.name, "embedding");
1384        let config1 = f1.sparse_vector_config.as_ref().unwrap();
1385        assert_eq!(config1.index_size, IndexSize::U16);
1386        assert_eq!(config1.weight_quantization, WeightQuantization::UInt8);
1387
1388        // Second field: u32 indices, float32 quantization
1389        let f2 = &indexes[0].fields[1];
1390        assert_eq!(f2.name, "dense");
1391        let config2 = f2.sparse_vector_config.as_ref().unwrap();
1392        assert_eq!(config2.index_size, IndexSize::U32);
1393        assert_eq!(config2.weight_quantization, WeightQuantization::Float32);
1394    }
1395
1396    #[test]
1397    fn test_sparse_vector_with_weight_threshold() {
1398        let sdl = r#"
1399            index documents {
1400                field embedding: sparse_vector<u16> [indexed<quantization: uint8, weight_threshold: 0.1>, stored]
1401                field embedding2: sparse_vector<u32> [indexed<quantization: float16, weight_threshold: 0.05>]
1402            }
1403        "#;
1404
1405        let indexes = parse_sdl(sdl).unwrap();
1406        assert_eq!(indexes[0].fields.len(), 2);
1407
1408        // First field: u16 indices, uint8 quantization, threshold 0.1
1409        let f1 = &indexes[0].fields[0];
1410        assert_eq!(f1.name, "embedding");
1411        let config1 = f1.sparse_vector_config.as_ref().unwrap();
1412        assert_eq!(config1.index_size, IndexSize::U16);
1413        assert_eq!(config1.weight_quantization, WeightQuantization::UInt8);
1414        assert!((config1.weight_threshold - 0.1).abs() < 0.001);
1415
1416        // Second field: u32 indices, float16 quantization, threshold 0.05
1417        let f2 = &indexes[0].fields[1];
1418        assert_eq!(f2.name, "embedding2");
1419        let config2 = f2.sparse_vector_config.as_ref().unwrap();
1420        assert_eq!(config2.index_size, IndexSize::U32);
1421        assert_eq!(config2.weight_quantization, WeightQuantization::Float16);
1422        assert!((config2.weight_threshold - 0.05).abs() < 0.001);
1423    }
1424
1425    #[test]
1426    fn test_sparse_vector_with_pruning() {
1427        let sdl = r#"
1428            index documents {
1429                field embedding: sparse_vector [indexed<quantization: uint8, pruning: 0.1>, stored]
1430            }
1431        "#;
1432
1433        let indexes = parse_sdl(sdl).unwrap();
1434        let f = &indexes[0].fields[0];
1435        assert_eq!(f.name, "embedding");
1436        let config = f.sparse_vector_config.as_ref().unwrap();
1437        assert_eq!(config.weight_quantization, WeightQuantization::UInt8);
1438        assert_eq!(config.pruning, Some(0.1));
1439    }
1440
1441    #[test]
1442    fn test_dense_vector_field() {
1443        let sdl = r#"
1444            index documents {
1445                field embedding: dense_vector<768> [indexed, stored]
1446            }
1447        "#;
1448
1449        let indexes = parse_sdl(sdl).unwrap();
1450        assert_eq!(indexes.len(), 1);
1451        assert_eq!(indexes[0].fields.len(), 1);
1452
1453        let f = &indexes[0].fields[0];
1454        assert_eq!(f.name, "embedding");
1455        assert_eq!(f.field_type, FieldType::DenseVector);
1456
1457        let config = f.dense_vector_config.as_ref().unwrap();
1458        assert_eq!(config.dim, 768);
1459    }
1460
1461    #[test]
1462    fn test_dense_vector_alias() {
1463        let sdl = r#"
1464            index documents {
1465                field embedding: vector<1536> [indexed]
1466            }
1467        "#;
1468
1469        let indexes = parse_sdl(sdl).unwrap();
1470        assert_eq!(indexes[0].fields[0].field_type, FieldType::DenseVector);
1471        assert_eq!(
1472            indexes[0].fields[0]
1473                .dense_vector_config
1474                .as_ref()
1475                .unwrap()
1476                .dim,
1477            1536
1478        );
1479    }
1480
1481    #[test]
1482    fn test_dense_vector_with_num_clusters() {
1483        let sdl = r#"
1484            index documents {
1485                field embedding: dense_vector<768> [indexed<ivf_rabitq, num_clusters: 256>, stored]
1486            }
1487        "#;
1488
1489        let indexes = parse_sdl(sdl).unwrap();
1490        assert_eq!(indexes.len(), 1);
1491
1492        let f = &indexes[0].fields[0];
1493        assert_eq!(f.name, "embedding");
1494        assert_eq!(f.field_type, FieldType::DenseVector);
1495
1496        let config = f.dense_vector_config.as_ref().unwrap();
1497        assert_eq!(config.dim, 768);
1498        assert_eq!(config.num_clusters, Some(256));
1499        assert_eq!(config.nprobe, 32); // default
1500    }
1501
1502    #[test]
1503    fn test_dense_vector_with_num_clusters_and_nprobe() {
1504        let sdl = r#"
1505            index documents {
1506                field embedding: dense_vector<1536> [indexed<ivf_rabitq, num_clusters: 512, nprobe: 64>]
1507            }
1508        "#;
1509
1510        let indexes = parse_sdl(sdl).unwrap();
1511        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1512
1513        assert_eq!(config.dim, 1536);
1514        assert_eq!(config.num_clusters, Some(512));
1515        assert_eq!(config.nprobe, 64);
1516    }
1517
1518    #[test]
1519    fn test_dense_vector_keyword_syntax() {
1520        let sdl = r#"
1521            index documents {
1522                field embedding: dense_vector<dims: 1536> [indexed, stored]
1523            }
1524        "#;
1525
1526        let indexes = parse_sdl(sdl).unwrap();
1527        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1528
1529        assert_eq!(config.dim, 1536);
1530        assert!(config.num_clusters.is_none());
1531    }
1532
1533    #[test]
1534    fn test_dense_vector_keyword_syntax_full() {
1535        let sdl = r#"
1536            index documents {
1537                field embedding: dense_vector<dims: 1536> [indexed<ivf_rabitq, num_clusters: 256, nprobe: 64>]
1538            }
1539        "#;
1540
1541        let indexes = parse_sdl(sdl).unwrap();
1542        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1543
1544        assert_eq!(config.dim, 1536);
1545        assert_eq!(config.num_clusters, Some(256));
1546        assert_eq!(config.nprobe, 64);
1547    }
1548
1549    #[test]
1550    fn test_dense_vector_keyword_syntax_partial() {
1551        let sdl = r#"
1552            index documents {
1553                field embedding: dense_vector<dims: 768> [indexed<ivf_rabitq, num_clusters: 128>]
1554            }
1555        "#;
1556
1557        let indexes = parse_sdl(sdl).unwrap();
1558        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1559
1560        assert_eq!(config.dim, 768);
1561        assert_eq!(config.num_clusters, Some(128));
1562        assert_eq!(config.nprobe, 32); // default
1563    }
1564
1565    #[test]
1566    fn test_dense_vector_scann_index() {
1567        use crate::dsl::schema::VectorIndexType;
1568
1569        let sdl = r#"
1570            index documents {
1571                field embedding: dense_vector<dims: 768> [indexed<scann, num_clusters: 256, nprobe: 64>]
1572            }
1573        "#;
1574
1575        let indexes = parse_sdl(sdl).unwrap();
1576        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1577
1578        assert_eq!(config.dim, 768);
1579        assert_eq!(config.index_type, VectorIndexType::ScaNN);
1580        assert_eq!(config.num_clusters, Some(256));
1581        assert_eq!(config.nprobe, 64);
1582    }
1583
1584    #[test]
1585    fn test_dense_vector_ivf_rabitq_index() {
1586        use crate::dsl::schema::VectorIndexType;
1587
1588        let sdl = r#"
1589            index documents {
1590                field embedding: dense_vector<dims: 1536> [indexed<ivf_rabitq, num_clusters: 512>]
1591            }
1592        "#;
1593
1594        let indexes = parse_sdl(sdl).unwrap();
1595        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1596
1597        assert_eq!(config.dim, 1536);
1598        assert_eq!(config.index_type, VectorIndexType::IvfRaBitQ);
1599        assert_eq!(config.num_clusters, Some(512));
1600    }
1601
1602    #[test]
1603    fn test_dense_vector_rabitq_no_clusters() {
1604        use crate::dsl::schema::VectorIndexType;
1605
1606        let sdl = r#"
1607            index documents {
1608                field embedding: dense_vector<dims: 768> [indexed<rabitq>]
1609            }
1610        "#;
1611
1612        let indexes = parse_sdl(sdl).unwrap();
1613        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1614
1615        assert_eq!(config.dim, 768);
1616        assert_eq!(config.index_type, VectorIndexType::RaBitQ);
1617        assert!(config.num_clusters.is_none());
1618    }
1619
1620    #[test]
1621    fn test_dense_vector_flat_index() {
1622        use crate::dsl::schema::VectorIndexType;
1623
1624        let sdl = r#"
1625            index documents {
1626                field embedding: dense_vector<dims: 768> [indexed<flat>]
1627            }
1628        "#;
1629
1630        let indexes = parse_sdl(sdl).unwrap();
1631        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1632
1633        assert_eq!(config.dim, 768);
1634        assert_eq!(config.index_type, VectorIndexType::Flat);
1635    }
1636
1637    #[test]
1638    fn test_dense_vector_default_index_type() {
1639        use crate::dsl::schema::VectorIndexType;
1640
1641        // When no index type specified, should default to RaBitQ (basic)
1642        let sdl = r#"
1643            index documents {
1644                field embedding: dense_vector<dims: 768> [indexed]
1645            }
1646        "#;
1647
1648        let indexes = parse_sdl(sdl).unwrap();
1649        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1650
1651        assert_eq!(config.dim, 768);
1652        assert_eq!(config.index_type, VectorIndexType::RaBitQ);
1653    }
1654
1655    #[test]
1656    fn test_dense_vector_f16_quantization() {
1657        use crate::dsl::schema::{DenseVectorQuantization, VectorIndexType};
1658
1659        let sdl = r#"
1660            index documents {
1661                field embedding: dense_vector<768, f16> [indexed]
1662            }
1663        "#;
1664
1665        let indexes = parse_sdl(sdl).unwrap();
1666        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1667
1668        assert_eq!(config.dim, 768);
1669        assert_eq!(config.quantization, DenseVectorQuantization::F16);
1670        assert_eq!(config.index_type, VectorIndexType::RaBitQ);
1671    }
1672
1673    #[test]
1674    fn test_dense_vector_uint8_quantization() {
1675        use crate::dsl::schema::DenseVectorQuantization;
1676
1677        let sdl = r#"
1678            index documents {
1679                field embedding: dense_vector<1024, uint8> [indexed<ivf_rabitq>]
1680            }
1681        "#;
1682
1683        let indexes = parse_sdl(sdl).unwrap();
1684        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1685
1686        assert_eq!(config.dim, 1024);
1687        assert_eq!(config.quantization, DenseVectorQuantization::UInt8);
1688    }
1689
1690    #[test]
1691    fn test_dense_vector_u8_alias() {
1692        use crate::dsl::schema::DenseVectorQuantization;
1693
1694        let sdl = r#"
1695            index documents {
1696                field embedding: dense_vector<512, u8> [indexed]
1697            }
1698        "#;
1699
1700        let indexes = parse_sdl(sdl).unwrap();
1701        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1702
1703        assert_eq!(config.dim, 512);
1704        assert_eq!(config.quantization, DenseVectorQuantization::UInt8);
1705    }
1706
1707    #[test]
1708    fn test_dense_vector_default_f32_quantization() {
1709        use crate::dsl::schema::DenseVectorQuantization;
1710
1711        // No quantization type → default f32
1712        let sdl = r#"
1713            index documents {
1714                field embedding: dense_vector<768> [indexed]
1715            }
1716        "#;
1717
1718        let indexes = parse_sdl(sdl).unwrap();
1719        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1720
1721        assert_eq!(config.dim, 768);
1722        assert_eq!(config.quantization, DenseVectorQuantization::F32);
1723    }
1724
1725    #[test]
1726    fn test_dense_vector_keyword_with_quantization() {
1727        use crate::dsl::schema::DenseVectorQuantization;
1728
1729        let sdl = r#"
1730            index documents {
1731                field embedding: dense_vector<dims: 768, f16> [indexed]
1732            }
1733        "#;
1734
1735        let indexes = parse_sdl(sdl).unwrap();
1736        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1737
1738        assert_eq!(config.dim, 768);
1739        assert_eq!(config.quantization, DenseVectorQuantization::F16);
1740    }
1741
1742    #[test]
1743    fn test_json_field_type() {
1744        let sdl = r#"
1745            index documents {
1746                field title: text [indexed, stored]
1747                field metadata: json [stored]
1748                field extra: json
1749            }
1750        "#;
1751
1752        let indexes = parse_sdl(sdl).unwrap();
1753        let index = &indexes[0];
1754
1755        assert_eq!(index.fields.len(), 3);
1756
1757        // Check JSON field
1758        assert_eq!(index.fields[1].name, "metadata");
1759        assert!(matches!(index.fields[1].field_type, FieldType::Json));
1760        assert!(index.fields[1].stored);
1761        // JSON fields should not be indexed (enforced by add_json_field)
1762
1763        // Check default attributes for JSON field
1764        assert_eq!(index.fields[2].name, "extra");
1765        assert!(matches!(index.fields[2].field_type, FieldType::Json));
1766
1767        // Verify schema conversion
1768        let schema = index.to_schema();
1769        let metadata_field = schema.get_field("metadata").unwrap();
1770        let entry = schema.get_field_entry(metadata_field).unwrap();
1771        assert_eq!(entry.field_type, FieldType::Json);
1772        assert!(!entry.indexed); // JSON fields are never indexed
1773        assert!(entry.stored);
1774    }
1775
1776    #[test]
1777    fn test_sparse_vector_query_config() {
1778        use crate::structures::QueryWeighting;
1779
1780        let sdl = r#"
1781            index documents {
1782                field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<tokenizer: "Alibaba-NLP/gte-Qwen2-1.5B-instruct", weighting: idf>>]
1783            }
1784        "#;
1785
1786        let indexes = parse_sdl(sdl).unwrap();
1787        let index = &indexes[0];
1788
1789        assert_eq!(index.fields.len(), 1);
1790        assert_eq!(index.fields[0].name, "embedding");
1791        assert!(matches!(
1792            index.fields[0].field_type,
1793            FieldType::SparseVector
1794        ));
1795
1796        let config = index.fields[0].sparse_vector_config.as_ref().unwrap();
1797        assert_eq!(config.index_size, IndexSize::U16);
1798        assert_eq!(config.weight_quantization, WeightQuantization::UInt8);
1799
1800        // Check query config
1801        let query_config = config.query_config.as_ref().unwrap();
1802        assert_eq!(
1803            query_config.tokenizer.as_deref(),
1804            Some("Alibaba-NLP/gte-Qwen2-1.5B-instruct")
1805        );
1806        assert_eq!(query_config.weighting, QueryWeighting::Idf);
1807
1808        // Verify schema conversion preserves query config
1809        let schema = index.to_schema();
1810        let embedding_field = schema.get_field("embedding").unwrap();
1811        let entry = schema.get_field_entry(embedding_field).unwrap();
1812        let sv_config = entry.sparse_vector_config.as_ref().unwrap();
1813        let qc = sv_config.query_config.as_ref().unwrap();
1814        assert_eq!(
1815            qc.tokenizer.as_deref(),
1816            Some("Alibaba-NLP/gte-Qwen2-1.5B-instruct")
1817        );
1818        assert_eq!(qc.weighting, QueryWeighting::Idf);
1819    }
1820
1821    #[test]
1822    fn test_sparse_vector_query_config_weighting_one() {
1823        use crate::structures::QueryWeighting;
1824
1825        let sdl = r#"
1826            index documents {
1827                field embedding: sparse_vector [indexed<query<weighting: one>>]
1828            }
1829        "#;
1830
1831        let indexes = parse_sdl(sdl).unwrap();
1832        let config = indexes[0].fields[0].sparse_vector_config.as_ref().unwrap();
1833
1834        let query_config = config.query_config.as_ref().unwrap();
1835        assert!(query_config.tokenizer.is_none());
1836        assert_eq!(query_config.weighting, QueryWeighting::One);
1837    }
1838
1839    #[test]
1840    fn test_sparse_vector_query_config_weighting_idf_file() {
1841        use crate::structures::QueryWeighting;
1842
1843        let sdl = r#"
1844            index documents {
1845                field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<tokenizer: "opensearch-neural-sparse-encoding-v1", weighting: idf_file>>]
1846            }
1847        "#;
1848
1849        let indexes = parse_sdl(sdl).unwrap();
1850        let config = indexes[0].fields[0].sparse_vector_config.as_ref().unwrap();
1851
1852        let query_config = config.query_config.as_ref().unwrap();
1853        assert_eq!(
1854            query_config.tokenizer.as_deref(),
1855            Some("opensearch-neural-sparse-encoding-v1")
1856        );
1857        assert_eq!(query_config.weighting, QueryWeighting::IdfFile);
1858
1859        // Verify schema conversion preserves idf_file
1860        let schema = indexes[0].to_schema();
1861        let field = schema.get_field("embedding").unwrap();
1862        let entry = schema.get_field_entry(field).unwrap();
1863        let sc = entry.sparse_vector_config.as_ref().unwrap();
1864        let qc = sc.query_config.as_ref().unwrap();
1865        assert_eq!(qc.weighting, QueryWeighting::IdfFile);
1866    }
1867
1868    #[test]
1869    fn test_sparse_vector_query_config_pruning_params() {
1870        let sdl = r#"
1871            index documents {
1872                field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<weighting: idf, weight_threshold: 0.03, max_dims: 25, pruning: 0.2>>]
1873            }
1874        "#;
1875
1876        let indexes = parse_sdl(sdl).unwrap();
1877        let config = indexes[0].fields[0].sparse_vector_config.as_ref().unwrap();
1878
1879        let qc = config.query_config.as_ref().unwrap();
1880        assert_eq!(qc.weighting, QueryWeighting::Idf);
1881        assert!((qc.weight_threshold - 0.03).abs() < 0.001);
1882        assert_eq!(qc.max_query_dims, Some(25));
1883        assert!((qc.pruning.unwrap() - 0.2).abs() < 0.001);
1884
1885        // Verify schema roundtrip
1886        let schema = indexes[0].to_schema();
1887        let field = schema.get_field("embedding").unwrap();
1888        let entry = schema.get_field_entry(field).unwrap();
1889        let sc = entry.sparse_vector_config.as_ref().unwrap();
1890        let rqc = sc.query_config.as_ref().unwrap();
1891        assert!((rqc.weight_threshold - 0.03).abs() < 0.001);
1892        assert_eq!(rqc.max_query_dims, Some(25));
1893        assert!((rqc.pruning.unwrap() - 0.2).abs() < 0.001);
1894    }
1895
1896    #[test]
1897    fn test_sparse_vector_format_maxscore() {
1898        let sdl = r#"
1899            index documents {
1900                field embedding: sparse_vector<u16> [indexed<format: maxscore, quantization: uint8>]
1901            }
1902        "#;
1903
1904        let indexes = parse_sdl(sdl).unwrap();
1905        let config = indexes[0].fields[0].sparse_vector_config.as_ref().unwrap();
1906        assert_eq!(config.format, SparseFormat::MaxScore);
1907        assert_eq!(config.weight_quantization, WeightQuantization::UInt8);
1908
1909        // Verify schema roundtrip
1910        let schema = indexes[0].to_schema();
1911        let field = schema.get_field("embedding").unwrap();
1912        let entry = schema.get_field_entry(field).unwrap();
1913        let sc = entry.sparse_vector_config.as_ref().unwrap();
1914        assert_eq!(sc.format, SparseFormat::MaxScore);
1915    }
1916
1917    #[test]
1918    fn test_sparse_vector_format_bmp() {
1919        let sdl = r#"
1920            index documents {
1921                field embedding: sparse_vector<u16> [indexed<format: bmp, quantization: uint8>]
1922            }
1923        "#;
1924
1925        let indexes = parse_sdl(sdl).unwrap();
1926        let config = indexes[0].fields[0].sparse_vector_config.as_ref().unwrap();
1927        assert_eq!(config.format, SparseFormat::Bmp);
1928    }
1929
1930    #[test]
1931    fn test_fast_attribute() {
1932        let sdl = r#"
1933            index products {
1934                field name: text [indexed, stored]
1935                field price: f64 [indexed, fast]
1936                field category: text [indexed, stored, fast]
1937                field count: u64 [fast]
1938                field score: i64 [indexed, stored, fast]
1939            }
1940        "#;
1941
1942        let indexes = parse_sdl(sdl).unwrap();
1943        assert_eq!(indexes.len(), 1);
1944        let index = &indexes[0];
1945        assert_eq!(index.fields.len(), 5);
1946
1947        // name: no fast
1948        assert!(!index.fields[0].fast);
1949        // price: fast
1950        assert!(index.fields[1].fast);
1951        assert!(matches!(index.fields[1].field_type, FieldType::F64));
1952        // category: fast text
1953        assert!(index.fields[2].fast);
1954        assert!(matches!(index.fields[2].field_type, FieldType::Text));
1955        // count: fast only
1956        assert!(index.fields[3].fast);
1957        assert!(matches!(index.fields[3].field_type, FieldType::U64));
1958        // score: fast i64
1959        assert!(index.fields[4].fast);
1960        assert!(matches!(index.fields[4].field_type, FieldType::I64));
1961
1962        // Verify schema roundtrip preserves fast flag
1963        let schema = index.to_schema();
1964        let price_field = schema.get_field("price").unwrap();
1965        assert!(schema.get_field_entry(price_field).unwrap().fast);
1966
1967        let category_field = schema.get_field("category").unwrap();
1968        assert!(schema.get_field_entry(category_field).unwrap().fast);
1969
1970        let name_field = schema.get_field("name").unwrap();
1971        assert!(!schema.get_field_entry(name_field).unwrap().fast);
1972    }
1973
1974    #[test]
1975    fn test_primary_attribute() {
1976        let sdl = r#"
1977            index documents {
1978                field id: text [primary, stored]
1979                field title: text [indexed, stored]
1980            }
1981        "#;
1982
1983        let indexes = parse_sdl(sdl).unwrap();
1984        assert_eq!(indexes.len(), 1);
1985        let index = &indexes[0];
1986        assert_eq!(index.fields.len(), 2);
1987
1988        // id should be primary, and auto-set fast + indexed
1989        let id_field = &index.fields[0];
1990        assert!(id_field.primary, "id should be primary");
1991        assert!(id_field.fast, "primary implies fast");
1992        assert!(id_field.indexed, "primary implies indexed");
1993
1994        // title should NOT be primary
1995        assert!(!index.fields[1].primary);
1996
1997        // Verify schema conversion preserves primary_key
1998        let schema = index.to_schema();
1999        let id = schema.get_field("id").unwrap();
2000        let id_entry = schema.get_field_entry(id).unwrap();
2001        assert!(id_entry.primary_key);
2002        assert!(id_entry.fast);
2003        assert!(id_entry.indexed);
2004
2005        let title = schema.get_field("title").unwrap();
2006        assert!(!schema.get_field_entry(title).unwrap().primary_key);
2007
2008        // primary_field() should return the primary field
2009        assert_eq!(schema.primary_field(), Some(id));
2010    }
2011
2012    #[test]
2013    fn test_primary_with_other_attributes() {
2014        let sdl = r#"
2015            index documents {
2016                field id: text<simple> [primary, indexed, stored]
2017                field body: text [indexed]
2018            }
2019        "#;
2020
2021        let indexes = parse_sdl(sdl).unwrap();
2022        let id_field = &indexes[0].fields[0];
2023        assert!(id_field.primary);
2024        assert!(id_field.indexed);
2025        assert!(id_field.stored);
2026        assert!(id_field.fast);
2027        assert_eq!(id_field.tokenizer, Some("simple".to_string()));
2028    }
2029
2030    #[test]
2031    fn test_primary_only_one_allowed() {
2032        let sdl = r#"
2033            index documents {
2034                field id: text [primary]
2035                field alt_id: text [primary]
2036            }
2037        "#;
2038
2039        let result = parse_sdl(sdl);
2040        assert!(result.is_err());
2041        let err = result.unwrap_err().to_string();
2042        assert!(
2043            err.contains("primary key"),
2044            "Error should mention primary key: {}",
2045            err
2046        );
2047    }
2048
2049    #[test]
2050    fn test_primary_must_be_text() {
2051        let sdl = r#"
2052            index documents {
2053                field id: u64 [primary]
2054            }
2055        "#;
2056
2057        let result = parse_sdl(sdl);
2058        assert!(result.is_err());
2059        let err = result.unwrap_err().to_string();
2060        assert!(
2061            err.contains("text"),
2062            "Error should mention text type: {}",
2063            err
2064        );
2065    }
2066
2067    #[test]
2068    fn test_primary_cannot_be_multi() {
2069        let sdl = r#"
2070            index documents {
2071                field id: text [primary, stored<multi>]
2072            }
2073        "#;
2074
2075        let result = parse_sdl(sdl);
2076        assert!(result.is_err());
2077        let err = result.unwrap_err().to_string();
2078        assert!(err.contains("multi"), "Error should mention multi: {}", err);
2079    }
2080
2081    #[test]
2082    fn test_no_primary_field() {
2083        // Schema without primary field should work fine
2084        let sdl = r#"
2085            index documents {
2086                field title: text [indexed, stored]
2087            }
2088        "#;
2089
2090        let indexes = parse_sdl(sdl).unwrap();
2091        let schema = indexes[0].to_schema();
2092        assert!(schema.primary_field().is_none());
2093    }
2094}