Skip to main content

hermes_core/dsl/sdl/
mod.rs

1//! Schema Definition Language (SDL) for Hermes
2//!
3//! A simple, readable format for defining index schemas using pest parser.
4//!
5//! # Example SDL
6//!
7//! ```text
8//! # Article index schema
9//! index articles {
10//!     # Primary text field for full-text search
11//!     field title: text [indexed, stored]
12//!
13//!     # Body content - indexed but not stored (save space)
14//!     field body: text [indexed]
15//!
16//!     # Author name
17//!     field author: text [indexed, stored]
18//!
19//!     # Publication timestamp
20//!     field published_at: i64 [indexed, stored]
21//!
22//!     # View count
23//!     field views: u64 [indexed, stored]
24//!
25//!     # Rating score
26//!     field rating: f64 [indexed, stored]
27//!
28//!     # Raw content hash (not indexed, just stored)
29//!     field content_hash: bytes [stored]
30//!
31//!     # Dense vector with IVF-RaBitQ index
32//!     field embedding: dense_vector<768> [indexed<rabitq, centroids: "centroids.bin", nprobe: 32>]
33//!
34//! }
35//! ```
36//!
37//! # Dense Vector Index Configuration
38//!
39//! Index-related parameters for dense vectors are specified in `indexed<...>`:
//! - `flat`, `rabitq`, `ivf_rabitq` or `scann` - index type
41//! - `centroids: "path"` - path to pre-trained centroids file
42//! - `codebook: "path"` - path to PQ codebook (ScaNN only)
43//! - `nprobe: N` - number of clusters to probe (default: 32)
44
45use pest::Parser;
46use pest_derive::Parser;
47
48use super::query_field_router::{QueryRouterRule, RoutingMode};
49use super::schema::{DenseVectorQuantization, FieldType, Schema, SchemaBuilder};
50use crate::Result;
51use crate::error::Error;
52
/// Pest parser for SDL documents.
///
/// The parsing code and the `Rule` enum used throughout this module are
/// derived from the grammar file `dsl/sdl/sdl.pest`.
#[derive(Parser)]
#[grammar = "dsl/sdl/sdl.pest"]
pub struct SdlParser;
56
57use super::schema::{BinaryDenseVectorConfig, DenseVectorConfig};
58use crate::structures::{
59    IndexSize, QueryWeighting, SparseFormat, SparseQueryConfig, SparseVectorConfig,
60    WeightQuantization,
61};
62
/// Parsed field definition
///
/// One `field name: type [...]` entry of an SDL index, after attributes and
/// vector configuration have been resolved.
#[derive(Debug, Clone)]
pub struct FieldDef {
    /// Field name as written in the SDL
    pub name: String,
    /// Resolved field type
    pub field_type: FieldType,
    /// Whether the field is indexed
    pub indexed: bool,
    /// Whether the field value is stored
    pub stored: bool,
    /// Tokenizer name for text fields (e.g., "simple", "en_stem", "german")
    pub tokenizer: Option<String>,
    /// Whether this field can have multiple values (serialized as array in JSON)
    pub multi: bool,
    /// Position tracking mode for phrase queries and multi-field element tracking
    pub positions: Option<super::schema::PositionMode>,
    /// Configuration for sparse vector fields
    pub sparse_vector_config: Option<SparseVectorConfig>,
    /// Configuration for dense vector fields
    pub dense_vector_config: Option<DenseVectorConfig>,
    /// Configuration for binary dense vector fields
    pub binary_dense_vector_config: Option<BinaryDenseVectorConfig>,
    /// Whether this field has columnar fast-field storage
    pub fast: bool,
    /// Whether this field is a primary key (unique constraint)
    pub primary: bool,
    /// Whether build-time document reordering (BP) is enabled for BMP fields
    pub reorder: bool,
}
89
/// Parsed index definition
///
/// Holds everything read from one `index name { ... }` block; use
/// `to_schema` to turn it into a `Schema`.
#[derive(Debug, Clone)]
pub struct IndexDef {
    /// Index name as written in the SDL
    pub name: String,
    /// Field definitions of the index
    pub fields: Vec<FieldDef>,
    /// Names of the default fields; left untouched on the schema when empty
    pub default_fields: Vec<String>,
    /// Query router rules for routing queries to specific fields
    pub query_routers: Vec<QueryRouterRule>,
}
99
100impl IndexDef {
101    /// Convert to a Schema
102    pub fn to_schema(&self) -> Schema {
103        let mut builder = SchemaBuilder::default();
104
105        for field in &self.fields {
106            let f = match field.field_type {
107                FieldType::Text => {
108                    let tokenizer = field.tokenizer.as_deref().unwrap_or("simple");
109                    builder.add_text_field_with_tokenizer(
110                        &field.name,
111                        field.indexed,
112                        field.stored,
113                        tokenizer,
114                    )
115                }
116                FieldType::U64 => builder.add_u64_field(&field.name, field.indexed, field.stored),
117                FieldType::I64 => builder.add_i64_field(&field.name, field.indexed, field.stored),
118                FieldType::F64 => builder.add_f64_field(&field.name, field.indexed, field.stored),
119                FieldType::Bytes => builder.add_bytes_field(&field.name, field.stored),
120                FieldType::Json => builder.add_json_field(&field.name, field.stored),
121                FieldType::SparseVector => {
122                    if let Some(config) = &field.sparse_vector_config {
123                        builder.add_sparse_vector_field_with_config(
124                            &field.name,
125                            field.indexed,
126                            field.stored,
127                            config.clone(),
128                        )
129                    } else {
130                        builder.add_sparse_vector_field(&field.name, field.indexed, field.stored)
131                    }
132                }
133                FieldType::DenseVector => {
134                    // Dense vector dimension must be specified via config
135                    let config = field
136                        .dense_vector_config
137                        .as_ref()
138                        .expect("DenseVector field requires dimension to be specified");
139                    builder.add_dense_vector_field_with_config(
140                        &field.name,
141                        field.indexed,
142                        field.stored,
143                        config.clone(),
144                    )
145                }
146                FieldType::BinaryDenseVector => {
147                    let config = field
148                        .binary_dense_vector_config
149                        .as_ref()
150                        .expect("BinaryDenseVector field requires dimension to be specified");
151                    builder.add_binary_dense_vector_field_with_config(
152                        &field.name,
153                        field.indexed,
154                        field.stored,
155                        config.clone(),
156                    )
157                }
158            };
159            if field.multi {
160                builder.set_multi(f, true);
161            }
162            if field.fast {
163                builder.set_fast(f, true);
164            }
165            if field.primary {
166                builder.set_primary_key(f);
167            }
168            if field.reorder {
169                builder.set_reorder(f, true);
170            }
171            // Set positions: explicit > auto (ordinal for multi vectors)
172            let positions = field.positions.or({
173                // Auto-set ordinal positions for multi-valued vector fields
174                if field.multi
175                    && matches!(
176                        field.field_type,
177                        FieldType::SparseVector
178                            | FieldType::DenseVector
179                            | FieldType::BinaryDenseVector
180                    )
181                {
182                    Some(super::schema::PositionMode::Ordinal)
183                } else {
184                    None
185                }
186            });
187            if let Some(mode) = positions {
188                builder.set_positions(f, mode);
189            }
190        }
191
192        // Set default fields if specified
193        if !self.default_fields.is_empty() {
194            builder.set_default_fields(self.default_fields.clone());
195        }
196
197        // Set query routers if specified
198        if !self.query_routers.is_empty() {
199            builder.set_query_routers(self.query_routers.clone());
200        }
201
202        builder.build()
203    }
204
205    /// Create a QueryFieldRouter from the query router rules
206    ///
207    /// Returns None if there are no query router rules defined.
208    /// Returns Err if any regex pattern is invalid.
209    pub fn to_query_router(&self) -> Result<Option<super::query_field_router::QueryFieldRouter>> {
210        if self.query_routers.is_empty() {
211            return Ok(None);
212        }
213
214        super::query_field_router::QueryFieldRouter::from_rules(&self.query_routers)
215            .map(Some)
216            .map_err(Error::Schema)
217    }
218}
219
220/// Parse field type from string
221fn parse_field_type(type_str: &str) -> Result<FieldType> {
222    match type_str {
223        "text" | "string" | "str" => Ok(FieldType::Text),
224        "u64" | "uint" | "unsigned" => Ok(FieldType::U64),
225        "i64" | "int" | "integer" => Ok(FieldType::I64),
226        "f64" | "float" | "double" => Ok(FieldType::F64),
227        "bytes" | "binary" | "blob" => Ok(FieldType::Bytes),
228        "json" => Ok(FieldType::Json),
229        "sparse_vector" => Ok(FieldType::SparseVector),
230        "dense_vector" | "vector" => Ok(FieldType::DenseVector),
231        "binary_dense_vector" | "binary_vector" => Ok(FieldType::BinaryDenseVector),
232        _ => Err(Error::Schema(format!("Unknown field type: {}", type_str))),
233    }
234}
235
/// Index configuration parsed from indexed<...> attribute
///
/// Every field is optional: `None` means the parameter was not given in the
/// SDL, so the target vector config keeps its own value.
#[derive(Debug, Clone, Default)]
struct IndexConfig {
    // Dense vector index params
    // Index type, given positionally or as `index: <type>`
    index_type: Option<super::schema::VectorIndexType>,
    // `num_clusters: N`
    num_clusters: Option<usize>,
    // `nprobe: N`
    nprobe: Option<usize>,
    // `build_threshold: N`
    build_threshold: Option<usize>,
    // Sparse vector index params
    // `format: bmp | maxscore`
    sparse_format: Option<SparseFormat>,
    // `quantization: ...` (weight quantization)
    quantization: Option<WeightQuantization>,
    // `weight_threshold: F`
    weight_threshold: Option<f32>,
    // `block_size: N`
    block_size: Option<usize>,
    // `pruning: F`
    pruning: Option<f32>,
    // Parsed from the `min_terms_kwarg` grammar rule
    min_terms: Option<usize>,
    // Sparse vector query-time config
    // All `query_*` values come from the nested `query<...>` block
    query_tokenizer: Option<String>,
    query_weighting: Option<QueryWeighting>,
    query_weight_threshold: Option<f32>,
    query_max_dims: Option<usize>,
    query_pruning: Option<f32>,
    query_min_query_dims: Option<usize>,
    // BMP fixed dims (vocabulary size) and max weight scale
    dims: Option<u32>,
    max_weight: Option<f32>,
    // Position tracking mode for phrase queries
    positions: Option<super::schema::PositionMode>,
}
263
/// Parsed attributes from SDL field definition
///
/// Result of reading one `[...]` attribute list; flags that do not appear in
/// the list stay `false`.
struct ParsedAttributes {
    // Set by `indexed` and by `indexed<...>`
    indexed: bool,
    // Set by `stored` and by `stored<...>`
    stored: bool,
    // Set only by the `stored<...>` form
    multi: bool,
    // Set by `fast`
    fast: bool,
    // Set by `primary`
    primary: bool,
    // Set by `reorder`
    reorder: bool,
    // Parameters of `indexed<...>`, when that form was used
    index_config: Option<IndexConfig>,
}
274
275/// Parse attributes from pest pair
276fn parse_attributes(pair: pest::iterators::Pair<Rule>) -> ParsedAttributes {
277    let mut attrs = ParsedAttributes {
278        indexed: false,
279        stored: false,
280        multi: false,
281        fast: false,
282        primary: false,
283        reorder: false,
284        index_config: None,
285    };
286
287    for attr in pair.into_inner() {
288        if attr.as_rule() == Rule::attribute {
289            let mut found_config = false;
290            for inner in attr.clone().into_inner() {
291                match inner.as_rule() {
292                    Rule::indexed_with_config => {
293                        attrs.indexed = true;
294                        attrs.index_config = Some(parse_index_config(inner));
295                        found_config = true;
296                        break;
297                    }
298                    Rule::stored_with_config => {
299                        attrs.stored = true;
300                        attrs.multi = true; // stored<multi>
301                        found_config = true;
302                        break;
303                    }
304                    _ => {}
305                }
306            }
307            if !found_config {
308                match attr.as_str() {
309                    "indexed" => attrs.indexed = true,
310                    "stored" => attrs.stored = true,
311                    "fast" => attrs.fast = true,
312                    "primary" => attrs.primary = true,
313                    "reorder" => attrs.reorder = true,
314                    _ => {}
315                }
316            }
317        }
318    }
319
320    attrs
321}
322
323/// Parse index configuration from indexed<...> attribute
324fn parse_index_config(pair: pest::iterators::Pair<Rule>) -> IndexConfig {
325    let mut config = IndexConfig::default();
326
327    // indexed_with_config = { "indexed" ~ "<" ~ index_config_params ~ ">" }
328    // index_config_params = { index_config_param ~ ("," ~ index_config_param)* }
329    // index_config_param = { index_type_kwarg | centroids_kwarg | codebook_kwarg | nprobe_kwarg | index_type_spec }
330
331    for inner in pair.into_inner() {
332        if inner.as_rule() == Rule::index_config_params {
333            for param in inner.into_inner() {
334                if param.as_rule() == Rule::index_config_param {
335                    for p in param.into_inner() {
336                        parse_single_index_config_param(&mut config, p);
337                    }
338                }
339            }
340        }
341    }
342
343    config
344}
345
346/// Parse a single index config parameter
347fn parse_single_index_config_param(config: &mut IndexConfig, p: pest::iterators::Pair<Rule>) {
348    use super::schema::VectorIndexType;
349
350    match p.as_rule() {
351        Rule::index_type_spec => {
352            config.index_type = Some(match p.as_str() {
353                "flat" => VectorIndexType::Flat,
354                "rabitq" => VectorIndexType::RaBitQ,
355                "ivf_rabitq" => VectorIndexType::IvfRaBitQ,
356                "scann" => VectorIndexType::ScaNN,
357                _ => VectorIndexType::RaBitQ,
358            });
359        }
360        Rule::index_type_kwarg => {
361            // index_type_kwarg = { "index" ~ ":" ~ index_type_spec }
362            if let Some(t) = p.into_inner().next() {
363                config.index_type = Some(match t.as_str() {
364                    "flat" => VectorIndexType::Flat,
365                    "rabitq" => VectorIndexType::RaBitQ,
366                    "ivf_rabitq" => VectorIndexType::IvfRaBitQ,
367                    "scann" => VectorIndexType::ScaNN,
368                    _ => VectorIndexType::RaBitQ,
369                });
370            }
371        }
372        Rule::num_clusters_kwarg => {
373            // num_clusters_kwarg = { "num_clusters" ~ ":" ~ num_clusters_spec }
374            if let Some(n) = p.into_inner().next() {
375                config.num_clusters = Some(n.as_str().parse().unwrap_or_else(|_| {
376                    log::warn!(
377                        "Invalid num_clusters value '{}', using default 256",
378                        n.as_str()
379                    );
380                    256
381                }));
382            }
383        }
384        Rule::build_threshold_kwarg => {
385            // build_threshold_kwarg = { "build_threshold" ~ ":" ~ build_threshold_spec }
386            if let Some(n) = p.into_inner().next() {
387                config.build_threshold = Some(n.as_str().parse().unwrap_or_else(|_| {
388                    log::warn!(
389                        "Invalid build_threshold value '{}', using default 10000",
390                        n.as_str()
391                    );
392                    10000
393                }));
394            }
395        }
396        Rule::nprobe_kwarg => {
397            // nprobe_kwarg = { "nprobe" ~ ":" ~ nprobe_spec }
398            if let Some(n) = p.into_inner().next() {
399                config.nprobe = Some(n.as_str().parse().unwrap_or_else(|_| {
400                    log::warn!("Invalid nprobe value '{}', using default 32", n.as_str());
401                    32
402                }));
403            }
404        }
405        Rule::quantization_kwarg => {
406            // quantization_kwarg = { "quantization" ~ ":" ~ quantization_spec }
407            if let Some(q) = p.into_inner().next() {
408                config.quantization = Some(match q.as_str() {
409                    "float32" | "f32" => WeightQuantization::Float32,
410                    "float16" | "f16" => WeightQuantization::Float16,
411                    "uint8" | "u8" => WeightQuantization::UInt8,
412                    "uint4" | "u4" => WeightQuantization::UInt4,
413                    _ => WeightQuantization::default(),
414                });
415            }
416        }
417        Rule::weight_threshold_kwarg => {
418            // weight_threshold_kwarg = { "weight_threshold" ~ ":" ~ weight_threshold_spec }
419            if let Some(t) = p.into_inner().next() {
420                config.weight_threshold = Some(t.as_str().parse().unwrap_or_else(|_| {
421                    log::warn!(
422                        "Invalid weight_threshold value '{}', using default 0.0",
423                        t.as_str()
424                    );
425                    0.0
426                }));
427            }
428        }
429        Rule::block_size_kwarg => {
430            // block_size_kwarg = { "block_size" ~ ":" ~ block_size_spec }
431            if let Some(n) = p.into_inner().next() {
432                config.block_size = Some(n.as_str().parse().unwrap_or_else(|_| {
433                    log::warn!(
434                        "Invalid block_size value '{}', using default 128",
435                        n.as_str()
436                    );
437                    128
438                }));
439            }
440        }
441        Rule::pruning_kwarg => {
442            // pruning_kwarg = { "pruning" ~ ":" ~ pruning_spec }
443            if let Some(f) = p.into_inner().next() {
444                config.pruning = Some(f.as_str().parse().unwrap_or_else(|_| {
445                    log::warn!("Invalid pruning value '{}', using default 1.0", f.as_str());
446                    1.0
447                }));
448            }
449        }
450        Rule::min_terms_kwarg => {
451            if let Some(n) = p.into_inner().next() {
452                config.min_terms = Some(n.as_str().parse().unwrap_or_else(|_| {
453                    log::warn!("Invalid min_terms value '{}', using default 4", n.as_str());
454                    4
455                }));
456            }
457        }
458        Rule::sparse_format_kwarg => {
459            // sparse_format_kwarg = { "format" ~ ":" ~ sparse_format_spec }
460            if let Some(f) = p.into_inner().next() {
461                config.sparse_format = Some(match f.as_str() {
462                    "bmp" => SparseFormat::Bmp,
463                    "maxscore" => SparseFormat::MaxScore,
464                    _ => SparseFormat::default(),
465                });
466            }
467        }
468        Rule::sparse_dims_kwarg => {
469            if let Some(n) = p.into_inner().next() {
470                config.dims = Some(n.as_str().parse().unwrap_or_else(|_| {
471                    log::warn!("Invalid dims value '{}', using default 105879", n.as_str());
472                    105879
473                }));
474            }
475        }
476        Rule::sparse_max_weight_kwarg => {
477            if let Some(f) = p.into_inner().next() {
478                config.max_weight = Some(f.as_str().parse().unwrap_or_else(|_| {
479                    log::warn!(
480                        "Invalid max_weight value '{}', using default 5.0",
481                        f.as_str()
482                    );
483                    5.0
484                }));
485            }
486        }
487        Rule::query_config_block => {
488            // query_config_block = { "query" ~ "<" ~ query_config_params ~ ">" }
489            parse_query_config_block(config, p);
490        }
491        Rule::positions_kwarg => {
492            // positions_kwarg = { "positions" | "ordinal" | "token_position" }
493            use super::schema::PositionMode;
494            config.positions = Some(match p.as_str() {
495                "ordinal" => PositionMode::Ordinal,
496                "token_position" => PositionMode::TokenPosition,
497                _ => PositionMode::Full, // "positions" or any other value defaults to Full
498            });
499        }
500        _ => {}
501    }
502}
503
504/// Parse query configuration block: query<tokenizer: "...", weighting: idf>
505fn parse_query_config_block(config: &mut IndexConfig, pair: pest::iterators::Pair<Rule>) {
506    for inner in pair.into_inner() {
507        if inner.as_rule() == Rule::query_config_params {
508            for param in inner.into_inner() {
509                if param.as_rule() == Rule::query_config_param {
510                    for p in param.into_inner() {
511                        match p.as_rule() {
512                            Rule::query_tokenizer_kwarg => {
513                                // query_tokenizer_kwarg = { "tokenizer" ~ ":" ~ tokenizer_path }
514                                if let Some(path) = p.into_inner().next()
515                                    && let Some(inner_path) = path.into_inner().next()
516                                {
517                                    config.query_tokenizer = Some(inner_path.as_str().to_string());
518                                }
519                            }
520                            Rule::query_weighting_kwarg => {
521                                // query_weighting_kwarg = { "weighting" ~ ":" ~ weighting_spec }
522                                if let Some(w) = p.into_inner().next() {
523                                    config.query_weighting = Some(match w.as_str() {
524                                        "one" => QueryWeighting::One,
525                                        "idf" => QueryWeighting::Idf,
526                                        "idf_file" => QueryWeighting::IdfFile,
527                                        _ => QueryWeighting::One,
528                                    });
529                                }
530                            }
531                            Rule::query_weight_threshold_kwarg => {
532                                if let Some(t) = p.into_inner().next() {
533                                    config.query_weight_threshold =
534                                        Some(t.as_str().parse().unwrap_or_else(|_| {
535                                            log::warn!(
536                                                "Invalid query weight_threshold '{}', using 0.0",
537                                                t.as_str()
538                                            );
539                                            0.0
540                                        }));
541                                }
542                            }
543                            Rule::query_max_dims_kwarg => {
544                                if let Some(t) = p.into_inner().next() {
545                                    config.query_max_dims =
546                                        Some(t.as_str().parse().unwrap_or_else(|_| {
547                                            log::warn!(
548                                                "Invalid query max_dims '{}', using 0",
549                                                t.as_str()
550                                            );
551                                            0
552                                        }));
553                                }
554                            }
555                            Rule::query_pruning_kwarg => {
556                                if let Some(t) = p.into_inner().next() {
557                                    config.query_pruning =
558                                        Some(t.as_str().parse().unwrap_or_else(|_| {
559                                            log::warn!(
560                                                "Invalid query pruning '{}', using 1.0",
561                                                t.as_str()
562                                            );
563                                            1.0
564                                        }));
565                                }
566                            }
567                            Rule::query_min_query_dims_kwarg => {
568                                if let Some(t) = p.into_inner().next() {
569                                    config.query_min_query_dims =
570                                        Some(t.as_str().parse().unwrap_or_else(|_| {
571                                            log::warn!(
572                                                "Invalid query min_query_dims '{}', using 4",
573                                                t.as_str()
574                                            );
575                                            4
576                                        }));
577                                }
578                            }
579                            _ => {}
580                        }
581                    }
582                }
583            }
584        }
585    }
586}
587
588/// Parse a field definition from pest pair
589fn parse_field_def(pair: pest::iterators::Pair<Rule>) -> Result<FieldDef> {
590    let mut inner = pair.into_inner();
591
592    let name = inner
593        .next()
594        .ok_or_else(|| Error::Schema("Missing field name".to_string()))?
595        .as_str()
596        .to_string();
597
598    let field_type_str = inner
599        .next()
600        .ok_or_else(|| Error::Schema("Missing field type".to_string()))?
601        .as_str();
602
603    let field_type = parse_field_type(field_type_str)?;
604
605    // Parse optional tokenizer spec, sparse_vector_config, dense_vector_config, and attributes
606    let mut tokenizer = None;
607    let mut sparse_vector_config = None;
608    let mut dense_vector_config = None;
609    let mut binary_dense_vector_config = None;
610    let mut indexed = true;
611    let mut stored = true;
612    let mut multi = false;
613    let mut fast = false;
614    let mut primary = false;
615    let mut reorder = false;
616    let mut index_config: Option<IndexConfig> = None;
617
618    for item in inner {
619        match item.as_rule() {
620            Rule::tokenizer_spec => {
621                // Extract tokenizer name from <name>
622                if let Some(tok_name) = item.into_inner().next() {
623                    tokenizer = Some(tok_name.as_str().to_string());
624                }
625            }
626            Rule::sparse_vector_config => {
627                // Parse named parameters: <index_size: u16, quantization: uint8, weight_threshold: 0.1>
628                sparse_vector_config = Some(parse_sparse_vector_config(item));
629            }
630            Rule::dense_vector_config => {
631                // Parse dense_vector_params (keyword or positional) - only dims
632                dense_vector_config = Some(parse_dense_vector_config(item));
633            }
634            Rule::binary_dense_vector_config => {
635                // Parse binary dense vector config - just dimension (number of bits)
636                let dim: usize = item
637                    .into_inner()
638                    .next()
639                    .map(|d| d.as_str().parse().unwrap_or(0))
640                    .unwrap_or(0);
641                if dim == 0 || !dim.is_multiple_of(8) {
642                    return Err(Error::Schema(format!(
643                        "BinaryDenseVector dimension must be a positive multiple of 8, got {dim}"
644                    )));
645                }
646                binary_dense_vector_config = Some(BinaryDenseVectorConfig::new(dim));
647            }
648            Rule::attributes => {
649                let attrs = parse_attributes(item);
650                indexed = attrs.indexed;
651                stored = attrs.stored;
652                multi = attrs.multi;
653                fast = attrs.fast;
654                primary = attrs.primary;
655                reorder = attrs.reorder;
656                index_config = attrs.index_config;
657            }
658            _ => {}
659        }
660    }
661
662    // PEG grammar ambiguity: both dense_vector_config and binary_dense_vector_config
663    // match `<N>`, and dense_vector_config comes first in the ordered choice. When the
664    // field_type is BinaryDenseVector, remap the matched dense_vector_config.
665    if field_type == FieldType::BinaryDenseVector
666        && binary_dense_vector_config.is_none()
667        && let Some(ref dv_config) = dense_vector_config
668    {
669        let dim = dv_config.dim;
670        if dim == 0 || !dim.is_multiple_of(8) {
671            return Err(Error::Schema(format!(
672                "BinaryDenseVector dimension must be a positive multiple of 8, got {dim}"
673            )));
674        }
675        binary_dense_vector_config = Some(BinaryDenseVectorConfig::new(dim));
676        dense_vector_config = None;
677    }
678
679    // Primary key implies fast + indexed (needed for dedup lookups)
680    if primary {
681        fast = true;
682        indexed = true;
683    }
684
685    // Merge index config into vector configs if both exist
686    let mut positions = None;
687    if let Some(idx_cfg) = index_config {
688        positions = idx_cfg.positions;
689        if let Some(ref mut dv_config) = dense_vector_config {
690            apply_index_config_to_dense_vector(dv_config, idx_cfg);
691        } else if field_type == FieldType::SparseVector {
692            // For sparse vectors, create default config if not present and apply index params
693            let sv_config = sparse_vector_config.get_or_insert(SparseVectorConfig::default());
694            apply_index_config_to_sparse_vector(sv_config, idx_cfg);
695        }
696    }
697
698    Ok(FieldDef {
699        name,
700        field_type,
701        indexed,
702        stored,
703        tokenizer,
704        multi,
705        positions,
706        sparse_vector_config,
707        dense_vector_config,
708        binary_dense_vector_config,
709        fast,
710        primary,
711        reorder,
712    })
713}
714
715/// Apply index configuration from indexed<...> to DenseVectorConfig
716fn apply_index_config_to_dense_vector(config: &mut DenseVectorConfig, idx_cfg: IndexConfig) {
717    // Apply index type if specified
718    if let Some(index_type) = idx_cfg.index_type {
719        config.index_type = index_type;
720    }
721
722    // Apply num_clusters for IVF-based indexes
723    if idx_cfg.num_clusters.is_some() {
724        config.num_clusters = idx_cfg.num_clusters;
725    }
726
727    // Apply nprobe if specified
728    if let Some(nprobe) = idx_cfg.nprobe {
729        config.nprobe = nprobe;
730    }
731
732    // Apply build_threshold if specified
733    if idx_cfg.build_threshold.is_some() {
734        config.build_threshold = idx_cfg.build_threshold;
735    }
736}
737
738/// Parse sparse_vector_config - only index_size (positional)
739/// Example: <u16> or <u32>
740fn parse_sparse_vector_config(pair: pest::iterators::Pair<Rule>) -> SparseVectorConfig {
741    let mut index_size = IndexSize::default();
742
743    // Parse positional index_size_spec
744    for inner in pair.into_inner() {
745        if inner.as_rule() == Rule::index_size_spec {
746            index_size = match inner.as_str() {
747                "u16" => IndexSize::U16,
748                "u32" => IndexSize::U32,
749                _ => IndexSize::default(),
750            };
751        }
752    }
753
754    SparseVectorConfig {
755        format: SparseFormat::default(),
756        index_size,
757        weight_quantization: WeightQuantization::default(),
758        weight_threshold: 0.0,
759        block_size: 128,
760        bmp_block_size: 64,
761        max_bmp_grid_bytes: 0,
762        bmp_superblock_size: 64,
763        pruning: None,
764        query_config: None,
765        dims: None,
766        max_weight: None,
767        min_terms: 4,
768    }
769}
770
771/// Apply index configuration from indexed<...> to SparseVectorConfig
772fn apply_index_config_to_sparse_vector(config: &mut SparseVectorConfig, idx_cfg: IndexConfig) {
773    if let Some(f) = idx_cfg.sparse_format {
774        config.format = f;
775    }
776    if let Some(q) = idx_cfg.quantization {
777        config.weight_quantization = q;
778    }
779    if let Some(t) = idx_cfg.weight_threshold {
780        config.weight_threshold = t;
781    }
782    if let Some(bs) = idx_cfg.block_size {
783        let adjusted = bs.next_power_of_two();
784        if adjusted != bs {
785            log::warn!(
786                "block_size {} adjusted to next power of two: {}",
787                bs,
788                adjusted
789            );
790        }
791        config.block_size = adjusted;
792    }
793    if let Some(p) = idx_cfg.pruning {
794        let clamped = p.clamp(0.0, 1.0);
795        if (clamped - p).abs() > f32::EPSILON {
796            log::warn!(
797                "pruning {} clamped to valid range [0.0, 1.0]: {}",
798                p,
799                clamped
800            );
801        }
802        config.pruning = Some(clamped);
803    }
804    if let Some(mt) = idx_cfg.min_terms {
805        config.min_terms = mt;
806    }
807    if let Some(d) = idx_cfg.dims {
808        config.dims = Some(d);
809    }
810    if let Some(mw) = idx_cfg.max_weight {
811        config.max_weight = Some(mw);
812    }
813    // Apply query-time configuration if present
814    if idx_cfg.query_tokenizer.is_some()
815        || idx_cfg.query_weighting.is_some()
816        || idx_cfg.query_weight_threshold.is_some()
817        || idx_cfg.query_max_dims.is_some()
818        || idx_cfg.query_pruning.is_some()
819        || idx_cfg.query_min_query_dims.is_some()
820    {
821        let query_config = config
822            .query_config
823            .get_or_insert(SparseQueryConfig::default());
824        if let Some(tokenizer) = idx_cfg.query_tokenizer {
825            query_config.tokenizer = Some(tokenizer);
826        }
827        if let Some(weighting) = idx_cfg.query_weighting {
828            query_config.weighting = weighting;
829        }
830        if let Some(t) = idx_cfg.query_weight_threshold {
831            query_config.weight_threshold = t;
832        }
833        if let Some(d) = idx_cfg.query_max_dims {
834            query_config.max_query_dims = Some(d);
835        }
836        if let Some(p) = idx_cfg.query_pruning {
837            query_config.pruning = Some(p);
838        }
839        if let Some(m) = idx_cfg.query_min_query_dims {
840            query_config.min_query_dims = m;
841        }
842    }
843}
844
845/// Parse dense_vector_config - dims and optional quantization type
846/// All index-related params are in indexed<...> attribute
847fn parse_dense_vector_config(pair: pest::iterators::Pair<Rule>) -> DenseVectorConfig {
848    let mut dim: usize = 0;
849    let mut quantization = DenseVectorQuantization::F32;
850
851    // Navigate to dense_vector_params
852    for params in pair.into_inner() {
853        if params.as_rule() == Rule::dense_vector_params {
854            for inner in params.into_inner() {
855                match inner.as_rule() {
856                    Rule::dense_vector_keyword_params => {
857                        for kwarg in inner.into_inner() {
858                            match kwarg.as_rule() {
859                                Rule::dims_kwarg => {
860                                    if let Some(d) = kwarg.into_inner().next() {
861                                        dim = d.as_str().parse().unwrap_or(0);
862                                    }
863                                }
864                                Rule::quant_type_spec => {
865                                    quantization = parse_quant_type(kwarg.as_str());
866                                }
867                                _ => {}
868                            }
869                        }
870                    }
871                    Rule::dense_vector_positional_params => {
872                        for item in inner.into_inner() {
873                            match item.as_rule() {
874                                Rule::dimension_spec => {
875                                    dim = item.as_str().parse().unwrap_or(0);
876                                }
877                                Rule::quant_type_spec => {
878                                    quantization = parse_quant_type(item.as_str());
879                                }
880                                _ => {}
881                            }
882                        }
883                    }
884                    _ => {}
885                }
886            }
887        }
888    }
889
890    DenseVectorConfig::new(dim).with_quantization(quantization)
891}
892
893fn parse_quant_type(s: &str) -> DenseVectorQuantization {
894    match s.trim() {
895        "f16" => DenseVectorQuantization::F16,
896        "uint8" | "u8" => DenseVectorQuantization::UInt8,
897        _ => DenseVectorQuantization::F32,
898    }
899}
900
901/// Parse default_fields definition
902fn parse_default_fields_def(pair: pest::iterators::Pair<Rule>) -> Vec<String> {
903    pair.into_inner().map(|p| p.as_str().to_string()).collect()
904}
905
906/// Parse a query router definition
907fn parse_query_router_def(pair: pest::iterators::Pair<Rule>) -> Result<QueryRouterRule> {
908    let mut pattern = String::new();
909    let mut substitution = String::new();
910    let mut target_field = String::new();
911    let mut mode = RoutingMode::Additional;
912
913    for prop in pair.into_inner() {
914        if prop.as_rule() != Rule::query_router_prop {
915            continue;
916        }
917
918        for inner in prop.into_inner() {
919            match inner.as_rule() {
920                Rule::query_router_pattern => {
921                    if let Some(regex_str) = inner.into_inner().next() {
922                        pattern = parse_string_value(regex_str);
923                    }
924                }
925                Rule::query_router_substitution => {
926                    if let Some(quoted) = inner.into_inner().next() {
927                        substitution = parse_string_value(quoted);
928                    }
929                }
930                Rule::query_router_target => {
931                    if let Some(ident) = inner.into_inner().next() {
932                        target_field = ident.as_str().to_string();
933                    }
934                }
935                Rule::query_router_mode => {
936                    if let Some(mode_val) = inner.into_inner().next() {
937                        mode = match mode_val.as_str() {
938                            "exclusive" => RoutingMode::Exclusive,
939                            "additional" => RoutingMode::Additional,
940                            _ => RoutingMode::Additional,
941                        };
942                    }
943                }
944                _ => {}
945            }
946        }
947    }
948
949    if pattern.is_empty() {
950        return Err(Error::Schema("query_router missing 'pattern'".to_string()));
951    }
952    if substitution.is_empty() {
953        return Err(Error::Schema(
954            "query_router missing 'substitution'".to_string(),
955        ));
956    }
957    if target_field.is_empty() {
958        return Err(Error::Schema(
959            "query_router missing 'target_field'".to_string(),
960        ));
961    }
962
963    Ok(QueryRouterRule {
964        pattern,
965        substitution,
966        target_field,
967        mode,
968    })
969}
970
971/// Parse a string value from quoted_string, raw_string, or regex_string
972fn parse_string_value(pair: pest::iterators::Pair<Rule>) -> String {
973    let s = pair.as_str();
974    match pair.as_rule() {
975        Rule::regex_string => {
976            // regex_string contains either raw_string or quoted_string
977            if let Some(inner) = pair.into_inner().next() {
978                parse_string_value(inner)
979            } else {
980                s.to_string()
981            }
982        }
983        Rule::raw_string => {
984            // r"..." - strip r" prefix and " suffix
985            s[2..s.len() - 1].to_string()
986        }
987        Rule::quoted_string => {
988            // "..." - strip quotes and handle escapes
989            let inner = &s[1..s.len() - 1];
990            // Simple escape handling
991            inner
992                .replace("\\n", "\n")
993                .replace("\\t", "\t")
994                .replace("\\\"", "\"")
995                .replace("\\\\", "\\")
996        }
997        _ => s.to_string(),
998    }
999}
1000
1001/// Parse an index definition from pest pair
1002fn parse_index_def(pair: pest::iterators::Pair<Rule>) -> Result<IndexDef> {
1003    let mut inner = pair.into_inner();
1004
1005    let name = inner
1006        .next()
1007        .ok_or_else(|| Error::Schema("Missing index name".to_string()))?
1008        .as_str()
1009        .to_string();
1010
1011    let mut fields = Vec::new();
1012    let mut default_fields = Vec::new();
1013    let mut query_routers = Vec::new();
1014
1015    for item in inner {
1016        match item.as_rule() {
1017            Rule::field_def => {
1018                fields.push(parse_field_def(item)?);
1019            }
1020            Rule::default_fields_def => {
1021                default_fields = parse_default_fields_def(item);
1022            }
1023            Rule::query_router_def => {
1024                query_routers.push(parse_query_router_def(item)?);
1025            }
1026            _ => {}
1027        }
1028    }
1029
1030    // Validate primary key constraints
1031    let primary_fields: Vec<&FieldDef> = fields.iter().filter(|f| f.primary).collect();
1032    if primary_fields.len() > 1 {
1033        return Err(Error::Schema(format!(
1034            "Index '{}' has {} primary key fields, but at most one is allowed",
1035            name,
1036            primary_fields.len()
1037        )));
1038    }
1039    if let Some(pk) = primary_fields.first() {
1040        if pk.field_type != FieldType::Text {
1041            return Err(Error::Schema(format!(
1042                "Primary key field '{}' must be of type text, got {:?}",
1043                pk.name, pk.field_type
1044            )));
1045        }
1046        if pk.multi {
1047            return Err(Error::Schema(format!(
1048                "Primary key field '{}' cannot be multi-valued",
1049                pk.name
1050            )));
1051        }
1052    }
1053
1054    Ok(IndexDef {
1055        name,
1056        fields,
1057        default_fields,
1058        query_routers,
1059    })
1060}
1061
1062/// Parse SDL from a string
1063pub fn parse_sdl(input: &str) -> Result<Vec<IndexDef>> {
1064    let pairs = SdlParser::parse(Rule::file, input)
1065        .map_err(|e| Error::Schema(format!("Parse error: {}", e)))?;
1066
1067    let mut indexes = Vec::new();
1068
1069    for pair in pairs {
1070        if pair.as_rule() == Rule::file {
1071            for inner in pair.into_inner() {
1072                if inner.as_rule() == Rule::index_def {
1073                    indexes.push(parse_index_def(inner)?);
1074                }
1075            }
1076        }
1077    }
1078
1079    Ok(indexes)
1080}
1081
1082/// Parse SDL and return a single index definition
1083pub fn parse_single_index(input: &str) -> Result<IndexDef> {
1084    let indexes = parse_sdl(input)?;
1085
1086    if indexes.is_empty() {
1087        return Err(Error::Schema("No index definition found".to_string()));
1088    }
1089
1090    if indexes.len() > 1 {
1091        return Err(Error::Schema(
1092            "Multiple index definitions found, expected one".to_string(),
1093        ));
1094    }
1095
1096    Ok(indexes.into_iter().next().unwrap())
1097}
1098
1099#[cfg(test)]
1100mod tests {
1101    use super::*;
1102
1103    #[test]
1104    fn test_parse_simple_schema() {
1105        let sdl = r#"
1106            index articles {
1107                field title: text [indexed, stored]
1108                field body: text [indexed]
1109            }
1110        "#;
1111
1112        let indexes = parse_sdl(sdl).unwrap();
1113        assert_eq!(indexes.len(), 1);
1114
1115        let index = &indexes[0];
1116        assert_eq!(index.name, "articles");
1117        assert_eq!(index.fields.len(), 2);
1118
1119        assert_eq!(index.fields[0].name, "title");
1120        assert!(matches!(index.fields[0].field_type, FieldType::Text));
1121        assert!(index.fields[0].indexed);
1122        assert!(index.fields[0].stored);
1123
1124        assert_eq!(index.fields[1].name, "body");
1125        assert!(matches!(index.fields[1].field_type, FieldType::Text));
1126        assert!(index.fields[1].indexed);
1127        assert!(!index.fields[1].stored);
1128    }
1129
1130    #[test]
1131    fn test_parse_all_field_types() {
1132        let sdl = r#"
1133            index test {
1134                field text_field: text [indexed, stored]
1135                field u64_field: u64 [indexed, stored]
1136                field i64_field: i64 [indexed, stored]
1137                field f64_field: f64 [indexed, stored]
1138                field bytes_field: bytes [stored]
1139            }
1140        "#;
1141
1142        let indexes = parse_sdl(sdl).unwrap();
1143        let index = &indexes[0];
1144
1145        assert!(matches!(index.fields[0].field_type, FieldType::Text));
1146        assert!(matches!(index.fields[1].field_type, FieldType::U64));
1147        assert!(matches!(index.fields[2].field_type, FieldType::I64));
1148        assert!(matches!(index.fields[3].field_type, FieldType::F64));
1149        assert!(matches!(index.fields[4].field_type, FieldType::Bytes));
1150    }
1151
1152    #[test]
1153    fn test_parse_with_comments() {
1154        let sdl = r#"
1155            # This is a comment
1156            index articles {
1157                # Title field
1158                field title: text [indexed, stored]
1159                field body: text [indexed] # inline comment not supported yet
1160            }
1161        "#;
1162
1163        let indexes = parse_sdl(sdl).unwrap();
1164        assert_eq!(indexes[0].fields.len(), 2);
1165    }
1166
1167    #[test]
1168    fn test_parse_type_aliases() {
1169        let sdl = r#"
1170            index test {
1171                field a: string [indexed]
1172                field b: int [indexed]
1173                field c: uint [indexed]
1174                field d: float [indexed]
1175                field e: binary [stored]
1176            }
1177        "#;
1178
1179        let indexes = parse_sdl(sdl).unwrap();
1180        let index = &indexes[0];
1181
1182        assert!(matches!(index.fields[0].field_type, FieldType::Text));
1183        assert!(matches!(index.fields[1].field_type, FieldType::I64));
1184        assert!(matches!(index.fields[2].field_type, FieldType::U64));
1185        assert!(matches!(index.fields[3].field_type, FieldType::F64));
1186        assert!(matches!(index.fields[4].field_type, FieldType::Bytes));
1187    }
1188
1189    #[test]
1190    fn test_to_schema() {
1191        let sdl = r#"
1192            index articles {
1193                field title: text [indexed, stored]
1194                field views: u64 [indexed, stored]
1195            }
1196        "#;
1197
1198        let indexes = parse_sdl(sdl).unwrap();
1199        let schema = indexes[0].to_schema();
1200
1201        assert!(schema.get_field("title").is_some());
1202        assert!(schema.get_field("views").is_some());
1203        assert!(schema.get_field("nonexistent").is_none());
1204    }
1205
1206    #[test]
1207    fn test_default_attributes() {
1208        let sdl = r#"
1209            index test {
1210                field title: text
1211            }
1212        "#;
1213
1214        let indexes = parse_sdl(sdl).unwrap();
1215        let field = &indexes[0].fields[0];
1216
1217        // Default should be indexed and stored
1218        assert!(field.indexed);
1219        assert!(field.stored);
1220    }
1221
1222    #[test]
1223    fn test_multiple_indexes() {
1224        let sdl = r#"
1225            index articles {
1226                field title: text [indexed, stored]
1227            }
1228
1229            index users {
1230                field name: text [indexed, stored]
1231                field email: text [indexed, stored]
1232            }
1233        "#;
1234
1235        let indexes = parse_sdl(sdl).unwrap();
1236        assert_eq!(indexes.len(), 2);
1237        assert_eq!(indexes[0].name, "articles");
1238        assert_eq!(indexes[1].name, "users");
1239    }
1240
1241    #[test]
1242    fn test_tokenizer_spec() {
1243        let sdl = r#"
1244            index articles {
1245                field title: text<en_stem> [indexed, stored]
1246                field body: text<simple> [indexed]
1247                field author: text [indexed, stored]
1248            }
1249        "#;
1250
1251        let indexes = parse_sdl(sdl).unwrap();
1252        let index = &indexes[0];
1253
1254        assert_eq!(index.fields[0].name, "title");
1255        assert_eq!(index.fields[0].tokenizer, Some("en_stem".to_string()));
1256
1257        assert_eq!(index.fields[1].name, "body");
1258        assert_eq!(index.fields[1].tokenizer, Some("simple".to_string()));
1259
1260        assert_eq!(index.fields[2].name, "author");
1261        assert_eq!(index.fields[2].tokenizer, None); // No tokenizer specified
1262    }
1263
1264    #[test]
1265    fn test_tokenizer_in_schema() {
1266        let sdl = r#"
1267            index articles {
1268                field title: text<german> [indexed, stored]
1269                field body: text<en_stem> [indexed]
1270            }
1271        "#;
1272
1273        let indexes = parse_sdl(sdl).unwrap();
1274        let schema = indexes[0].to_schema();
1275
1276        let title_field = schema.get_field("title").unwrap();
1277        let title_entry = schema.get_field_entry(title_field).unwrap();
1278        assert_eq!(title_entry.tokenizer, Some("german".to_string()));
1279
1280        let body_field = schema.get_field("body").unwrap();
1281        let body_entry = schema.get_field_entry(body_field).unwrap();
1282        assert_eq!(body_entry.tokenizer, Some("en_stem".to_string()));
1283    }
1284
1285    #[test]
1286    fn test_query_router_basic() {
1287        let sdl = r#"
1288            index documents {
1289                field title: text [indexed, stored]
1290                field uri: text [indexed, stored]
1291
1292                query_router {
1293                    pattern: "10\\.\\d{4,}/[^\\s]+"
1294                    substitution: "doi://{0}"
1295                    target_field: uris
1296                    mode: exclusive
1297                }
1298            }
1299        "#;
1300
1301        let indexes = parse_sdl(sdl).unwrap();
1302        let index = &indexes[0];
1303
1304        assert_eq!(index.query_routers.len(), 1);
1305        let router = &index.query_routers[0];
1306        assert_eq!(router.pattern, r"10\.\d{4,}/[^\s]+");
1307        assert_eq!(router.substitution, "doi://{0}");
1308        assert_eq!(router.target_field, "uris");
1309        assert_eq!(router.mode, RoutingMode::Exclusive);
1310    }
1311
1312    #[test]
1313    fn test_query_router_raw_string() {
1314        let sdl = r#"
1315            index documents {
1316                field uris: text [indexed, stored]
1317
1318                query_router {
1319                    pattern: r"^pmid:(\d+)$"
1320                    substitution: "pubmed://{1}"
1321                    target_field: uris
1322                    mode: additional
1323                }
1324            }
1325        "#;
1326
1327        let indexes = parse_sdl(sdl).unwrap();
1328        let router = &indexes[0].query_routers[0];
1329
1330        assert_eq!(router.pattern, r"^pmid:(\d+)$");
1331        assert_eq!(router.substitution, "pubmed://{1}");
1332        assert_eq!(router.mode, RoutingMode::Additional);
1333    }
1334
1335    #[test]
1336    fn test_multiple_query_routers() {
1337        let sdl = r#"
1338            index documents {
1339                field uris: text [indexed, stored]
1340
1341                query_router {
1342                    pattern: r"^doi:(10\.\d{4,}/[^\s]+)$"
1343                    substitution: "doi://{1}"
1344                    target_field: uris
1345                    mode: exclusive
1346                }
1347
1348                query_router {
1349                    pattern: r"^pmid:(\d+)$"
1350                    substitution: "pubmed://{1}"
1351                    target_field: uris
1352                    mode: exclusive
1353                }
1354
1355                query_router {
1356                    pattern: r"^arxiv:(\d+\.\d+)$"
1357                    substitution: "arxiv://{1}"
1358                    target_field: uris
1359                    mode: additional
1360                }
1361            }
1362        "#;
1363
1364        let indexes = parse_sdl(sdl).unwrap();
1365        assert_eq!(indexes[0].query_routers.len(), 3);
1366    }
1367
1368    #[test]
1369    fn test_query_router_default_mode() {
1370        let sdl = r#"
1371            index documents {
1372                field uris: text [indexed, stored]
1373
1374                query_router {
1375                    pattern: r"test"
1376                    substitution: "{0}"
1377                    target_field: uris
1378                }
1379            }
1380        "#;
1381
1382        let indexes = parse_sdl(sdl).unwrap();
1383        // Default mode should be Additional
1384        assert_eq!(indexes[0].query_routers[0].mode, RoutingMode::Additional);
1385    }
1386
1387    #[test]
1388    fn test_multi_attribute() {
1389        let sdl = r#"
1390            index documents {
1391                field uris: text [indexed, stored<multi>]
1392                field title: text [indexed, stored]
1393            }
1394        "#;
1395
1396        let indexes = parse_sdl(sdl).unwrap();
1397        assert_eq!(indexes.len(), 1);
1398
1399        let fields = &indexes[0].fields;
1400        assert_eq!(fields.len(), 2);
1401
1402        // uris should have multi=true
1403        assert_eq!(fields[0].name, "uris");
1404        assert!(fields[0].multi, "uris field should have multi=true");
1405
1406        // title should have multi=false
1407        assert_eq!(fields[1].name, "title");
1408        assert!(!fields[1].multi, "title field should have multi=false");
1409
1410        // Verify schema conversion preserves multi attribute
1411        let schema = indexes[0].to_schema();
1412        let uris_field = schema.get_field("uris").unwrap();
1413        let title_field = schema.get_field("title").unwrap();
1414
1415        assert!(schema.get_field_entry(uris_field).unwrap().multi);
1416        assert!(!schema.get_field_entry(title_field).unwrap().multi);
1417    }
1418
1419    #[test]
1420    fn test_sparse_vector_field() {
1421        let sdl = r#"
1422            index documents {
1423                field embedding: sparse_vector [indexed, stored]
1424            }
1425        "#;
1426
1427        let indexes = parse_sdl(sdl).unwrap();
1428        assert_eq!(indexes.len(), 1);
1429        assert_eq!(indexes[0].fields.len(), 1);
1430        assert_eq!(indexes[0].fields[0].name, "embedding");
1431        assert_eq!(indexes[0].fields[0].field_type, FieldType::SparseVector);
1432        assert!(indexes[0].fields[0].sparse_vector_config.is_none());
1433    }
1434
1435    #[test]
1436    fn test_sparse_vector_with_config() {
1437        let sdl = r#"
1438            index documents {
1439                field embedding: sparse_vector<u16> [indexed<quantization: uint8>, stored]
1440                field dense: sparse_vector<u32> [indexed<quantization: float32>]
1441            }
1442        "#;
1443
1444        let indexes = parse_sdl(sdl).unwrap();
1445        assert_eq!(indexes[0].fields.len(), 2);
1446
1447        // First field: u16 indices, uint8 quantization
1448        let f1 = &indexes[0].fields[0];
1449        assert_eq!(f1.name, "embedding");
1450        let config1 = f1.sparse_vector_config.as_ref().unwrap();
1451        assert_eq!(config1.index_size, IndexSize::U16);
1452        assert_eq!(config1.weight_quantization, WeightQuantization::UInt8);
1453
1454        // Second field: u32 indices, float32 quantization
1455        let f2 = &indexes[0].fields[1];
1456        assert_eq!(f2.name, "dense");
1457        let config2 = f2.sparse_vector_config.as_ref().unwrap();
1458        assert_eq!(config2.index_size, IndexSize::U32);
1459        assert_eq!(config2.weight_quantization, WeightQuantization::Float32);
1460    }
1461
1462    #[test]
1463    fn test_sparse_vector_with_weight_threshold() {
1464        let sdl = r#"
1465            index documents {
1466                field embedding: sparse_vector<u16> [indexed<quantization: uint8, weight_threshold: 0.1>, stored]
1467                field embedding2: sparse_vector<u32> [indexed<quantization: float16, weight_threshold: 0.05>]
1468            }
1469        "#;
1470
1471        let indexes = parse_sdl(sdl).unwrap();
1472        assert_eq!(indexes[0].fields.len(), 2);
1473
1474        // First field: u16 indices, uint8 quantization, threshold 0.1
1475        let f1 = &indexes[0].fields[0];
1476        assert_eq!(f1.name, "embedding");
1477        let config1 = f1.sparse_vector_config.as_ref().unwrap();
1478        assert_eq!(config1.index_size, IndexSize::U16);
1479        assert_eq!(config1.weight_quantization, WeightQuantization::UInt8);
1480        assert!((config1.weight_threshold - 0.1).abs() < 0.001);
1481
1482        // Second field: u32 indices, float16 quantization, threshold 0.05
1483        let f2 = &indexes[0].fields[1];
1484        assert_eq!(f2.name, "embedding2");
1485        let config2 = f2.sparse_vector_config.as_ref().unwrap();
1486        assert_eq!(config2.index_size, IndexSize::U32);
1487        assert_eq!(config2.weight_quantization, WeightQuantization::Float16);
1488        assert!((config2.weight_threshold - 0.05).abs() < 0.001);
1489    }
1490
1491    #[test]
1492    fn test_sparse_vector_with_pruning() {
1493        let sdl = r#"
1494            index documents {
1495                field embedding: sparse_vector [indexed<quantization: uint8, pruning: 0.1>, stored]
1496            }
1497        "#;
1498
1499        let indexes = parse_sdl(sdl).unwrap();
1500        let f = &indexes[0].fields[0];
1501        assert_eq!(f.name, "embedding");
1502        let config = f.sparse_vector_config.as_ref().unwrap();
1503        assert_eq!(config.weight_quantization, WeightQuantization::UInt8);
1504        assert_eq!(config.pruning, Some(0.1));
1505    }
1506
1507    #[test]
1508    fn test_dense_vector_field() {
1509        let sdl = r#"
1510            index documents {
1511                field embedding: dense_vector<768> [indexed, stored]
1512            }
1513        "#;
1514
1515        let indexes = parse_sdl(sdl).unwrap();
1516        assert_eq!(indexes.len(), 1);
1517        assert_eq!(indexes[0].fields.len(), 1);
1518
1519        let f = &indexes[0].fields[0];
1520        assert_eq!(f.name, "embedding");
1521        assert_eq!(f.field_type, FieldType::DenseVector);
1522
1523        let config = f.dense_vector_config.as_ref().unwrap();
1524        assert_eq!(config.dim, 768);
1525    }
1526
1527    #[test]
1528    fn test_dense_vector_alias() {
1529        let sdl = r#"
1530            index documents {
1531                field embedding: vector<1536> [indexed]
1532            }
1533        "#;
1534
1535        let indexes = parse_sdl(sdl).unwrap();
1536        assert_eq!(indexes[0].fields[0].field_type, FieldType::DenseVector);
1537        assert_eq!(
1538            indexes[0].fields[0]
1539                .dense_vector_config
1540                .as_ref()
1541                .unwrap()
1542                .dim,
1543            1536
1544        );
1545    }
1546
1547    #[test]
1548    fn test_dense_vector_with_num_clusters() {
1549        let sdl = r#"
1550            index documents {
1551                field embedding: dense_vector<768> [indexed<ivf_rabitq, num_clusters: 256>, stored]
1552            }
1553        "#;
1554
1555        let indexes = parse_sdl(sdl).unwrap();
1556        assert_eq!(indexes.len(), 1);
1557
1558        let f = &indexes[0].fields[0];
1559        assert_eq!(f.name, "embedding");
1560        assert_eq!(f.field_type, FieldType::DenseVector);
1561
1562        let config = f.dense_vector_config.as_ref().unwrap();
1563        assert_eq!(config.dim, 768);
1564        assert_eq!(config.num_clusters, Some(256));
1565        assert_eq!(config.nprobe, 32); // default
1566    }
1567
1568    #[test]
1569    fn test_dense_vector_with_num_clusters_and_nprobe() {
1570        let sdl = r#"
1571            index documents {
1572                field embedding: dense_vector<1536> [indexed<ivf_rabitq, num_clusters: 512, nprobe: 64>]
1573            }
1574        "#;
1575
1576        let indexes = parse_sdl(sdl).unwrap();
1577        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1578
1579        assert_eq!(config.dim, 1536);
1580        assert_eq!(config.num_clusters, Some(512));
1581        assert_eq!(config.nprobe, 64);
1582    }
1583
1584    #[test]
1585    fn test_dense_vector_keyword_syntax() {
1586        let sdl = r#"
1587            index documents {
1588                field embedding: dense_vector<dims: 1536> [indexed, stored]
1589            }
1590        "#;
1591
1592        let indexes = parse_sdl(sdl).unwrap();
1593        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1594
1595        assert_eq!(config.dim, 1536);
1596        assert!(config.num_clusters.is_none());
1597    }
1598
1599    #[test]
1600    fn test_dense_vector_keyword_syntax_full() {
1601        let sdl = r#"
1602            index documents {
1603                field embedding: dense_vector<dims: 1536> [indexed<ivf_rabitq, num_clusters: 256, nprobe: 64>]
1604            }
1605        "#;
1606
1607        let indexes = parse_sdl(sdl).unwrap();
1608        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1609
1610        assert_eq!(config.dim, 1536);
1611        assert_eq!(config.num_clusters, Some(256));
1612        assert_eq!(config.nprobe, 64);
1613    }
1614
1615    #[test]
1616    fn test_dense_vector_keyword_syntax_partial() {
1617        let sdl = r#"
1618            index documents {
1619                field embedding: dense_vector<dims: 768> [indexed<ivf_rabitq, num_clusters: 128>]
1620            }
1621        "#;
1622
1623        let indexes = parse_sdl(sdl).unwrap();
1624        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1625
1626        assert_eq!(config.dim, 768);
1627        assert_eq!(config.num_clusters, Some(128));
1628        assert_eq!(config.nprobe, 32); // default
1629    }
1630
1631    #[test]
1632    fn test_dense_vector_scann_index() {
1633        use crate::dsl::schema::VectorIndexType;
1634
1635        let sdl = r#"
1636            index documents {
1637                field embedding: dense_vector<dims: 768> [indexed<scann, num_clusters: 256, nprobe: 64>]
1638            }
1639        "#;
1640
1641        let indexes = parse_sdl(sdl).unwrap();
1642        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1643
1644        assert_eq!(config.dim, 768);
1645        assert_eq!(config.index_type, VectorIndexType::ScaNN);
1646        assert_eq!(config.num_clusters, Some(256));
1647        assert_eq!(config.nprobe, 64);
1648    }
1649
1650    #[test]
1651    fn test_dense_vector_ivf_rabitq_index() {
1652        use crate::dsl::schema::VectorIndexType;
1653
1654        let sdl = r#"
1655            index documents {
1656                field embedding: dense_vector<dims: 1536> [indexed<ivf_rabitq, num_clusters: 512>]
1657            }
1658        "#;
1659
1660        let indexes = parse_sdl(sdl).unwrap();
1661        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1662
1663        assert_eq!(config.dim, 1536);
1664        assert_eq!(config.index_type, VectorIndexType::IvfRaBitQ);
1665        assert_eq!(config.num_clusters, Some(512));
1666    }
1667
1668    #[test]
1669    fn test_dense_vector_rabitq_no_clusters() {
1670        use crate::dsl::schema::VectorIndexType;
1671
1672        let sdl = r#"
1673            index documents {
1674                field embedding: dense_vector<dims: 768> [indexed<rabitq>]
1675            }
1676        "#;
1677
1678        let indexes = parse_sdl(sdl).unwrap();
1679        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1680
1681        assert_eq!(config.dim, 768);
1682        assert_eq!(config.index_type, VectorIndexType::RaBitQ);
1683        assert!(config.num_clusters.is_none());
1684    }
1685
1686    #[test]
1687    fn test_dense_vector_flat_index() {
1688        use crate::dsl::schema::VectorIndexType;
1689
1690        let sdl = r#"
1691            index documents {
1692                field embedding: dense_vector<dims: 768> [indexed<flat>]
1693            }
1694        "#;
1695
1696        let indexes = parse_sdl(sdl).unwrap();
1697        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1698
1699        assert_eq!(config.dim, 768);
1700        assert_eq!(config.index_type, VectorIndexType::Flat);
1701    }
1702
1703    #[test]
1704    fn test_dense_vector_default_index_type() {
1705        use crate::dsl::schema::VectorIndexType;
1706
1707        // When no index type specified, should default to RaBitQ (basic)
1708        let sdl = r#"
1709            index documents {
1710                field embedding: dense_vector<dims: 768> [indexed]
1711            }
1712        "#;
1713
1714        let indexes = parse_sdl(sdl).unwrap();
1715        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1716
1717        assert_eq!(config.dim, 768);
1718        assert_eq!(config.index_type, VectorIndexType::RaBitQ);
1719    }
1720
1721    #[test]
1722    fn test_dense_vector_f16_quantization() {
1723        use crate::dsl::schema::{DenseVectorQuantization, VectorIndexType};
1724
1725        let sdl = r#"
1726            index documents {
1727                field embedding: dense_vector<768, f16> [indexed]
1728            }
1729        "#;
1730
1731        let indexes = parse_sdl(sdl).unwrap();
1732        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1733
1734        assert_eq!(config.dim, 768);
1735        assert_eq!(config.quantization, DenseVectorQuantization::F16);
1736        assert_eq!(config.index_type, VectorIndexType::RaBitQ);
1737    }
1738
1739    #[test]
1740    fn test_dense_vector_uint8_quantization() {
1741        use crate::dsl::schema::DenseVectorQuantization;
1742
1743        let sdl = r#"
1744            index documents {
1745                field embedding: dense_vector<1024, uint8> [indexed<ivf_rabitq>]
1746            }
1747        "#;
1748
1749        let indexes = parse_sdl(sdl).unwrap();
1750        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1751
1752        assert_eq!(config.dim, 1024);
1753        assert_eq!(config.quantization, DenseVectorQuantization::UInt8);
1754    }
1755
1756    #[test]
1757    fn test_dense_vector_u8_alias() {
1758        use crate::dsl::schema::DenseVectorQuantization;
1759
1760        let sdl = r#"
1761            index documents {
1762                field embedding: dense_vector<512, u8> [indexed]
1763            }
1764        "#;
1765
1766        let indexes = parse_sdl(sdl).unwrap();
1767        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1768
1769        assert_eq!(config.dim, 512);
1770        assert_eq!(config.quantization, DenseVectorQuantization::UInt8);
1771    }
1772
1773    #[test]
1774    fn test_dense_vector_default_f32_quantization() {
1775        use crate::dsl::schema::DenseVectorQuantization;
1776
1777        // No quantization type → default f32
1778        let sdl = r#"
1779            index documents {
1780                field embedding: dense_vector<768> [indexed]
1781            }
1782        "#;
1783
1784        let indexes = parse_sdl(sdl).unwrap();
1785        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1786
1787        assert_eq!(config.dim, 768);
1788        assert_eq!(config.quantization, DenseVectorQuantization::F32);
1789    }
1790
1791    #[test]
1792    fn test_dense_vector_keyword_with_quantization() {
1793        use crate::dsl::schema::DenseVectorQuantization;
1794
1795        let sdl = r#"
1796            index documents {
1797                field embedding: dense_vector<dims: 768, f16> [indexed]
1798            }
1799        "#;
1800
1801        let indexes = parse_sdl(sdl).unwrap();
1802        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1803
1804        assert_eq!(config.dim, 768);
1805        assert_eq!(config.quantization, DenseVectorQuantization::F16);
1806    }
1807
1808    #[test]
1809    fn test_json_field_type() {
1810        let sdl = r#"
1811            index documents {
1812                field title: text [indexed, stored]
1813                field metadata: json [stored]
1814                field extra: json
1815            }
1816        "#;
1817
1818        let indexes = parse_sdl(sdl).unwrap();
1819        let index = &indexes[0];
1820
1821        assert_eq!(index.fields.len(), 3);
1822
1823        // Check JSON field
1824        assert_eq!(index.fields[1].name, "metadata");
1825        assert!(matches!(index.fields[1].field_type, FieldType::Json));
1826        assert!(index.fields[1].stored);
1827        // JSON fields should not be indexed (enforced by add_json_field)
1828
1829        // Check default attributes for JSON field
1830        assert_eq!(index.fields[2].name, "extra");
1831        assert!(matches!(index.fields[2].field_type, FieldType::Json));
1832
1833        // Verify schema conversion
1834        let schema = index.to_schema();
1835        let metadata_field = schema.get_field("metadata").unwrap();
1836        let entry = schema.get_field_entry(metadata_field).unwrap();
1837        assert_eq!(entry.field_type, FieldType::Json);
1838        assert!(!entry.indexed); // JSON fields are never indexed
1839        assert!(entry.stored);
1840    }
1841
1842    #[test]
1843    fn test_sparse_vector_query_config() {
1844        use crate::structures::QueryWeighting;
1845
1846        let sdl = r#"
1847            index documents {
1848                field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<tokenizer: "Alibaba-NLP/gte-Qwen2-1.5B-instruct", weighting: idf>>]
1849            }
1850        "#;
1851
1852        let indexes = parse_sdl(sdl).unwrap();
1853        let index = &indexes[0];
1854
1855        assert_eq!(index.fields.len(), 1);
1856        assert_eq!(index.fields[0].name, "embedding");
1857        assert!(matches!(
1858            index.fields[0].field_type,
1859            FieldType::SparseVector
1860        ));
1861
1862        let config = index.fields[0].sparse_vector_config.as_ref().unwrap();
1863        assert_eq!(config.index_size, IndexSize::U16);
1864        assert_eq!(config.weight_quantization, WeightQuantization::UInt8);
1865
1866        // Check query config
1867        let query_config = config.query_config.as_ref().unwrap();
1868        assert_eq!(
1869            query_config.tokenizer.as_deref(),
1870            Some("Alibaba-NLP/gte-Qwen2-1.5B-instruct")
1871        );
1872        assert_eq!(query_config.weighting, QueryWeighting::Idf);
1873
1874        // Verify schema conversion preserves query config
1875        let schema = index.to_schema();
1876        let embedding_field = schema.get_field("embedding").unwrap();
1877        let entry = schema.get_field_entry(embedding_field).unwrap();
1878        let sv_config = entry.sparse_vector_config.as_ref().unwrap();
1879        let qc = sv_config.query_config.as_ref().unwrap();
1880        assert_eq!(
1881            qc.tokenizer.as_deref(),
1882            Some("Alibaba-NLP/gte-Qwen2-1.5B-instruct")
1883        );
1884        assert_eq!(qc.weighting, QueryWeighting::Idf);
1885    }
1886
1887    #[test]
1888    fn test_sparse_vector_query_config_weighting_one() {
1889        use crate::structures::QueryWeighting;
1890
1891        let sdl = r#"
1892            index documents {
1893                field embedding: sparse_vector [indexed<query<weighting: one>>]
1894            }
1895        "#;
1896
1897        let indexes = parse_sdl(sdl).unwrap();
1898        let config = indexes[0].fields[0].sparse_vector_config.as_ref().unwrap();
1899
1900        let query_config = config.query_config.as_ref().unwrap();
1901        assert!(query_config.tokenizer.is_none());
1902        assert_eq!(query_config.weighting, QueryWeighting::One);
1903    }
1904
1905    #[test]
1906    fn test_sparse_vector_query_config_weighting_idf_file() {
1907        use crate::structures::QueryWeighting;
1908
1909        let sdl = r#"
1910            index documents {
1911                field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<tokenizer: "opensearch-neural-sparse-encoding-v1", weighting: idf_file>>]
1912            }
1913        "#;
1914
1915        let indexes = parse_sdl(sdl).unwrap();
1916        let config = indexes[0].fields[0].sparse_vector_config.as_ref().unwrap();
1917
1918        let query_config = config.query_config.as_ref().unwrap();
1919        assert_eq!(
1920            query_config.tokenizer.as_deref(),
1921            Some("opensearch-neural-sparse-encoding-v1")
1922        );
1923        assert_eq!(query_config.weighting, QueryWeighting::IdfFile);
1924
1925        // Verify schema conversion preserves idf_file
1926        let schema = indexes[0].to_schema();
1927        let field = schema.get_field("embedding").unwrap();
1928        let entry = schema.get_field_entry(field).unwrap();
1929        let sc = entry.sparse_vector_config.as_ref().unwrap();
1930        let qc = sc.query_config.as_ref().unwrap();
1931        assert_eq!(qc.weighting, QueryWeighting::IdfFile);
1932    }
1933
1934    #[test]
1935    fn test_sparse_vector_query_config_pruning_params() {
1936        let sdl = r#"
1937            index documents {
1938                field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<weighting: idf, weight_threshold: 0.03, max_dims: 25, pruning: 0.2>>]
1939            }
1940        "#;
1941
1942        let indexes = parse_sdl(sdl).unwrap();
1943        let config = indexes[0].fields[0].sparse_vector_config.as_ref().unwrap();
1944
1945        let qc = config.query_config.as_ref().unwrap();
1946        assert_eq!(qc.weighting, QueryWeighting::Idf);
1947        assert!((qc.weight_threshold - 0.03).abs() < 0.001);
1948        assert_eq!(qc.max_query_dims, Some(25));
1949        assert!((qc.pruning.unwrap() - 0.2).abs() < 0.001);
1950
1951        // Verify schema roundtrip
1952        let schema = indexes[0].to_schema();
1953        let field = schema.get_field("embedding").unwrap();
1954        let entry = schema.get_field_entry(field).unwrap();
1955        let sc = entry.sparse_vector_config.as_ref().unwrap();
1956        let rqc = sc.query_config.as_ref().unwrap();
1957        assert!((rqc.weight_threshold - 0.03).abs() < 0.001);
1958        assert_eq!(rqc.max_query_dims, Some(25));
1959        assert!((rqc.pruning.unwrap() - 0.2).abs() < 0.001);
1960    }
1961
1962    #[test]
1963    fn test_sparse_vector_format_maxscore() {
1964        let sdl = r#"
1965            index documents {
1966                field embedding: sparse_vector<u16> [indexed<format: maxscore, quantization: uint8>]
1967            }
1968        "#;
1969
1970        let indexes = parse_sdl(sdl).unwrap();
1971        let config = indexes[0].fields[0].sparse_vector_config.as_ref().unwrap();
1972        assert_eq!(config.format, SparseFormat::MaxScore);
1973        assert_eq!(config.weight_quantization, WeightQuantization::UInt8);
1974
1975        // Verify schema roundtrip
1976        let schema = indexes[0].to_schema();
1977        let field = schema.get_field("embedding").unwrap();
1978        let entry = schema.get_field_entry(field).unwrap();
1979        let sc = entry.sparse_vector_config.as_ref().unwrap();
1980        assert_eq!(sc.format, SparseFormat::MaxScore);
1981    }
1982
1983    #[test]
1984    fn test_sparse_vector_format_bmp() {
1985        let sdl = r#"
1986            index documents {
1987                field embedding: sparse_vector<u16> [indexed<format: bmp, quantization: uint8>]
1988            }
1989        "#;
1990
1991        let indexes = parse_sdl(sdl).unwrap();
1992        let config = indexes[0].fields[0].sparse_vector_config.as_ref().unwrap();
1993        assert_eq!(config.format, SparseFormat::Bmp);
1994    }
1995
1996    #[test]
1997    fn test_fast_attribute() {
1998        let sdl = r#"
1999            index products {
2000                field name: text [indexed, stored]
2001                field price: f64 [indexed, fast]
2002                field category: text [indexed, stored, fast]
2003                field count: u64 [fast]
2004                field score: i64 [indexed, stored, fast]
2005            }
2006        "#;
2007
2008        let indexes = parse_sdl(sdl).unwrap();
2009        assert_eq!(indexes.len(), 1);
2010        let index = &indexes[0];
2011        assert_eq!(index.fields.len(), 5);
2012
2013        // name: no fast
2014        assert!(!index.fields[0].fast);
2015        // price: fast
2016        assert!(index.fields[1].fast);
2017        assert!(matches!(index.fields[1].field_type, FieldType::F64));
2018        // category: fast text
2019        assert!(index.fields[2].fast);
2020        assert!(matches!(index.fields[2].field_type, FieldType::Text));
2021        // count: fast only
2022        assert!(index.fields[3].fast);
2023        assert!(matches!(index.fields[3].field_type, FieldType::U64));
2024        // score: fast i64
2025        assert!(index.fields[4].fast);
2026        assert!(matches!(index.fields[4].field_type, FieldType::I64));
2027
2028        // Verify schema roundtrip preserves fast flag
2029        let schema = index.to_schema();
2030        let price_field = schema.get_field("price").unwrap();
2031        assert!(schema.get_field_entry(price_field).unwrap().fast);
2032
2033        let category_field = schema.get_field("category").unwrap();
2034        assert!(schema.get_field_entry(category_field).unwrap().fast);
2035
2036        let name_field = schema.get_field("name").unwrap();
2037        assert!(!schema.get_field_entry(name_field).unwrap().fast);
2038    }
2039
2040    #[test]
2041    fn test_primary_attribute() {
2042        let sdl = r#"
2043            index documents {
2044                field id: text [primary, stored]
2045                field title: text [indexed, stored]
2046            }
2047        "#;
2048
2049        let indexes = parse_sdl(sdl).unwrap();
2050        assert_eq!(indexes.len(), 1);
2051        let index = &indexes[0];
2052        assert_eq!(index.fields.len(), 2);
2053
2054        // id should be primary, and auto-set fast + indexed
2055        let id_field = &index.fields[0];
2056        assert!(id_field.primary, "id should be primary");
2057        assert!(id_field.fast, "primary implies fast");
2058        assert!(id_field.indexed, "primary implies indexed");
2059
2060        // title should NOT be primary
2061        assert!(!index.fields[1].primary);
2062
2063        // Verify schema conversion preserves primary_key
2064        let schema = index.to_schema();
2065        let id = schema.get_field("id").unwrap();
2066        let id_entry = schema.get_field_entry(id).unwrap();
2067        assert!(id_entry.primary_key);
2068        assert!(id_entry.fast);
2069        assert!(id_entry.indexed);
2070
2071        let title = schema.get_field("title").unwrap();
2072        assert!(!schema.get_field_entry(title).unwrap().primary_key);
2073
2074        // primary_field() should return the primary field
2075        assert_eq!(schema.primary_field(), Some(id));
2076    }
2077
2078    #[test]
2079    fn test_primary_with_other_attributes() {
2080        let sdl = r#"
2081            index documents {
2082                field id: text<simple> [primary, indexed, stored]
2083                field body: text [indexed]
2084            }
2085        "#;
2086
2087        let indexes = parse_sdl(sdl).unwrap();
2088        let id_field = &indexes[0].fields[0];
2089        assert!(id_field.primary);
2090        assert!(id_field.indexed);
2091        assert!(id_field.stored);
2092        assert!(id_field.fast);
2093        assert_eq!(id_field.tokenizer, Some("simple".to_string()));
2094    }
2095
2096    #[test]
2097    fn test_primary_only_one_allowed() {
2098        let sdl = r#"
2099            index documents {
2100                field id: text [primary]
2101                field alt_id: text [primary]
2102            }
2103        "#;
2104
2105        let result = parse_sdl(sdl);
2106        assert!(result.is_err());
2107        let err = result.unwrap_err().to_string();
2108        assert!(
2109            err.contains("primary key"),
2110            "Error should mention primary key: {}",
2111            err
2112        );
2113    }
2114
2115    #[test]
2116    fn test_primary_must_be_text() {
2117        let sdl = r#"
2118            index documents {
2119                field id: u64 [primary]
2120            }
2121        "#;
2122
2123        let result = parse_sdl(sdl);
2124        assert!(result.is_err());
2125        let err = result.unwrap_err().to_string();
2126        assert!(
2127            err.contains("text"),
2128            "Error should mention text type: {}",
2129            err
2130        );
2131    }
2132
2133    #[test]
2134    fn test_primary_cannot_be_multi() {
2135        let sdl = r#"
2136            index documents {
2137                field id: text [primary, stored<multi>]
2138            }
2139        "#;
2140
2141        let result = parse_sdl(sdl);
2142        assert!(result.is_err());
2143        let err = result.unwrap_err().to_string();
2144        assert!(err.contains("multi"), "Error should mention multi: {}", err);
2145    }
2146
2147    #[test]
2148    fn test_no_primary_field() {
2149        // Schema without primary field should work fine
2150        let sdl = r#"
2151            index documents {
2152                field title: text [indexed, stored]
2153            }
2154        "#;
2155
2156        let indexes = parse_sdl(sdl).unwrap();
2157        let schema = indexes[0].to_schema();
2158        assert!(schema.primary_field().is_none());
2159    }
2160
2161    #[test]
2162    fn test_reorder_attribute() {
2163        let sdl = r#"
2164            index documents {
2165                field embedding: sparse_vector<u16> [indexed<format: bmp, quantization: uint8>, reorder]
2166                field embedding2: sparse_vector [indexed<format: bmp>]
2167            }
2168        "#;
2169
2170        let indexes = parse_sdl(sdl).unwrap();
2171        assert_eq!(indexes[0].fields.len(), 2);
2172
2173        // First field should have reorder=true
2174        assert!(indexes[0].fields[0].reorder);
2175        // Second field should have reorder=false
2176        assert!(!indexes[0].fields[1].reorder);
2177
2178        // Verify schema roundtrip
2179        let schema = indexes[0].to_schema();
2180        let f1 = schema.get_field("embedding").unwrap();
2181        assert!(schema.get_field_entry(f1).unwrap().reorder);
2182
2183        let f2 = schema.get_field("embedding2").unwrap();
2184        assert!(!schema.get_field_entry(f2).unwrap().reorder);
2185    }
2186}