Skip to main content

hermes_core/dsl/sdl/
mod.rs

1//! Schema Definition Language (SDL) for Hermes
2//!
3//! A simple, readable format for defining index schemas using pest parser.
4//!
5//! # Example SDL
6//!
7//! ```text
8//! # Article index schema
9//! index articles {
10//!     # Primary text field for full-text search
11//!     field title: text [indexed, stored]
12//!
13//!     # Body content - indexed but not stored (save space)
14//!     field body: text [indexed]
15//!
16//!     # Author name
17//!     field author: text [indexed, stored]
18//!
19//!     # Publication timestamp
20//!     field published_at: i64 [indexed, stored]
21//!
22//!     # View count
23//!     field views: u64 [indexed, stored]
24//!
25//!     # Rating score
26//!     field rating: f64 [indexed, stored]
27//!
28//!     # Raw content hash (not indexed, just stored)
29//!     field content_hash: bytes [stored]
30//!
31//!     # Dense vector with IVF-RaBitQ index
32//!     field embedding: dense_vector<768> [indexed<rabitq, centroids: "centroids.bin", nprobe: 32>]
33//!
34//! }
35//! ```
36//!
37//! # Dense Vector Index Configuration
38//!
39//! Index-related parameters for dense vectors are specified in `indexed<...>`:
40//! - `flat`, `rabitq`, `ivf_rabitq` or `scann` - index type
41//! - `centroids: "path"` - path to pre-trained centroids file
42//! - `codebook: "path"` - path to PQ codebook (ScaNN only)
43//! - `nprobe: N` - number of clusters to probe (default: 32)
44
45use pest::Parser;
46use pest_derive::Parser;
47
48use super::query_field_router::{QueryRouterRule, RoutingMode};
49use super::schema::{DenseVectorQuantization, FieldType, Schema, SchemaBuilder};
50use crate::Result;
51use crate::error::Error;
52
53#[derive(Parser)]
54#[grammar = "dsl/sdl/sdl.pest"]
55pub struct SdlParser;
56
57use super::schema::DenseVectorConfig;
58use crate::structures::{
59    IndexSize, QueryWeighting, SparseFormat, SparseQueryConfig, SparseVectorConfig,
60    WeightQuantization,
61};
62
63/// Parsed field definition
64#[derive(Debug, Clone)]
65pub struct FieldDef {
66    pub name: String,
67    pub field_type: FieldType,
68    pub indexed: bool,
69    pub stored: bool,
70    /// Tokenizer name for text fields (e.g., "simple", "en_stem", "german")
71    pub tokenizer: Option<String>,
72    /// Whether this field can have multiple values (serialized as array in JSON)
73    pub multi: bool,
74    /// Position tracking mode for phrase queries and multi-field element tracking
75    pub positions: Option<super::schema::PositionMode>,
76    /// Configuration for sparse vector fields
77    pub sparse_vector_config: Option<SparseVectorConfig>,
78    /// Configuration for dense vector fields
79    pub dense_vector_config: Option<DenseVectorConfig>,
80    /// Whether this field has columnar fast-field storage
81    pub fast: bool,
82    /// Whether this field is a primary key (unique constraint)
83    pub primary: bool,
84    /// Whether this sparse_vector field has auto-computed SimHash for BMP block reordering
85    pub simhash: bool,
86}
87
88/// Parsed index definition
89#[derive(Debug, Clone)]
90pub struct IndexDef {
91    pub name: String,
92    pub fields: Vec<FieldDef>,
93    pub default_fields: Vec<String>,
94    /// Query router rules for routing queries to specific fields
95    pub query_routers: Vec<QueryRouterRule>,
96}
97
98impl IndexDef {
99    /// Convert to a Schema
100    pub fn to_schema(&self) -> Schema {
101        let mut builder = SchemaBuilder::default();
102
103        for field in &self.fields {
104            let f = match field.field_type {
105                FieldType::Text => {
106                    let tokenizer = field.tokenizer.as_deref().unwrap_or("simple");
107                    builder.add_text_field_with_tokenizer(
108                        &field.name,
109                        field.indexed,
110                        field.stored,
111                        tokenizer,
112                    )
113                }
114                FieldType::U64 => builder.add_u64_field(&field.name, field.indexed, field.stored),
115                FieldType::I64 => builder.add_i64_field(&field.name, field.indexed, field.stored),
116                FieldType::F64 => builder.add_f64_field(&field.name, field.indexed, field.stored),
117                FieldType::Bytes => builder.add_bytes_field(&field.name, field.stored),
118                FieldType::Json => builder.add_json_field(&field.name, field.stored),
119                FieldType::SparseVector => {
120                    if let Some(config) = &field.sparse_vector_config {
121                        builder.add_sparse_vector_field_with_config(
122                            &field.name,
123                            field.indexed,
124                            field.stored,
125                            config.clone(),
126                        )
127                    } else {
128                        builder.add_sparse_vector_field(&field.name, field.indexed, field.stored)
129                    }
130                }
131                FieldType::DenseVector => {
132                    // Dense vector dimension must be specified via config
133                    let config = field
134                        .dense_vector_config
135                        .as_ref()
136                        .expect("DenseVector field requires dimension to be specified");
137                    builder.add_dense_vector_field_with_config(
138                        &field.name,
139                        field.indexed,
140                        field.stored,
141                        config.clone(),
142                    )
143                }
144            };
145            if field.multi {
146                builder.set_multi(f, true);
147            }
148            if field.fast {
149                builder.set_fast(f, true);
150            }
151            if field.primary {
152                builder.set_primary_key(f);
153            }
154            if field.simhash {
155                builder.set_simhash(f, true);
156            }
157            // Set positions: explicit > auto (ordinal for multi vectors)
158            let positions = field.positions.or({
159                // Auto-set ordinal positions for multi-valued vector fields
160                if field.multi
161                    && matches!(
162                        field.field_type,
163                        FieldType::SparseVector | FieldType::DenseVector
164                    )
165                {
166                    Some(super::schema::PositionMode::Ordinal)
167                } else {
168                    None
169                }
170            });
171            if let Some(mode) = positions {
172                builder.set_positions(f, mode);
173            }
174        }
175
176        // Set default fields if specified
177        if !self.default_fields.is_empty() {
178            builder.set_default_fields(self.default_fields.clone());
179        }
180
181        // Set query routers if specified
182        if !self.query_routers.is_empty() {
183            builder.set_query_routers(self.query_routers.clone());
184        }
185
186        builder.build()
187    }
188
189    /// Create a QueryFieldRouter from the query router rules
190    ///
191    /// Returns None if there are no query router rules defined.
192    /// Returns Err if any regex pattern is invalid.
193    pub fn to_query_router(&self) -> Result<Option<super::query_field_router::QueryFieldRouter>> {
194        if self.query_routers.is_empty() {
195            return Ok(None);
196        }
197
198        super::query_field_router::QueryFieldRouter::from_rules(&self.query_routers)
199            .map(Some)
200            .map_err(Error::Schema)
201    }
202}
203
204/// Parse field type from string
205fn parse_field_type(type_str: &str) -> Result<FieldType> {
206    match type_str {
207        "text" | "string" | "str" => Ok(FieldType::Text),
208        "u64" | "uint" | "unsigned" => Ok(FieldType::U64),
209        "i64" | "int" | "integer" => Ok(FieldType::I64),
210        "f64" | "float" | "double" => Ok(FieldType::F64),
211        "bytes" | "binary" | "blob" => Ok(FieldType::Bytes),
212        "json" => Ok(FieldType::Json),
213        "sparse_vector" => Ok(FieldType::SparseVector),
214        "dense_vector" | "vector" => Ok(FieldType::DenseVector),
215        _ => Err(Error::Schema(format!("Unknown field type: {}", type_str))),
216    }
217}
218
219/// Index configuration parsed from indexed<...> attribute
220#[derive(Debug, Clone, Default)]
221struct IndexConfig {
222    index_type: Option<super::schema::VectorIndexType>,
223    num_clusters: Option<usize>,
224    nprobe: Option<usize>,
225    build_threshold: Option<usize>,
226    // Sparse vector index params
227    sparse_format: Option<SparseFormat>,
228    quantization: Option<WeightQuantization>,
229    weight_threshold: Option<f32>,
230    block_size: Option<usize>,
231    pruning: Option<f32>,
232    min_terms: Option<usize>,
233    // Sparse vector query-time config
234    query_tokenizer: Option<String>,
235    query_weighting: Option<QueryWeighting>,
236    query_weight_threshold: Option<f32>,
237    query_max_dims: Option<usize>,
238    query_pruning: Option<f32>,
239    query_min_query_dims: Option<usize>,
240    // BMP fixed dims (vocabulary size) and max weight scale
241    dims: Option<u32>,
242    max_weight: Option<f32>,
243    // Position tracking mode for phrase queries
244    positions: Option<super::schema::PositionMode>,
245}
246
247/// Parse attributes from pest pair
248/// Returns (indexed, stored, multi, fast, primary, simhash, index_config)
249/// positions is now inside index_config (via indexed<positions> or indexed<ordinal> etc.)
250/// multi is now inside stored<multi>
251fn parse_attributes(
252    pair: pest::iterators::Pair<Rule>,
253) -> (bool, bool, bool, bool, bool, bool, Option<IndexConfig>) {
254    let mut indexed = false;
255    let mut stored = false;
256    let mut multi = false;
257    let mut fast = false;
258    let mut primary = false;
259    let mut simhash = false;
260    let mut index_config = None;
261
262    for attr in pair.into_inner() {
263        if attr.as_rule() == Rule::attribute {
264            // attribute = { indexed_with_config | "indexed" | stored_with_config | "stored" | "fast" | "primary" | "simhash" }
265            let mut found_config = false;
266            for inner in attr.clone().into_inner() {
267                match inner.as_rule() {
268                    Rule::indexed_with_config => {
269                        indexed = true;
270                        index_config = Some(parse_index_config(inner));
271                        found_config = true;
272                        break;
273                    }
274                    Rule::stored_with_config => {
275                        stored = true;
276                        multi = true; // stored<multi>
277                        found_config = true;
278                        break;
279                    }
280                    _ => {}
281                }
282            }
283            if !found_config {
284                // Simple attribute
285                match attr.as_str() {
286                    "indexed" => indexed = true,
287                    "stored" => stored = true,
288                    "fast" => fast = true,
289                    "primary" => primary = true,
290                    "simhash" => {
291                        simhash = true;
292                    }
293                    _ => {}
294                }
295            }
296        }
297    }
298
299    (indexed, stored, multi, fast, primary, simhash, index_config)
300}
301
302/// Parse index configuration from indexed<...> attribute
303fn parse_index_config(pair: pest::iterators::Pair<Rule>) -> IndexConfig {
304    let mut config = IndexConfig::default();
305
306    // indexed_with_config = { "indexed" ~ "<" ~ index_config_params ~ ">" }
307    // index_config_params = { index_config_param ~ ("," ~ index_config_param)* }
308    // index_config_param = { index_type_kwarg | centroids_kwarg | codebook_kwarg | nprobe_kwarg | index_type_spec }
309
310    for inner in pair.into_inner() {
311        if inner.as_rule() == Rule::index_config_params {
312            for param in inner.into_inner() {
313                if param.as_rule() == Rule::index_config_param {
314                    for p in param.into_inner() {
315                        parse_single_index_config_param(&mut config, p);
316                    }
317                }
318            }
319        }
320    }
321
322    config
323}
324
325/// Parse a single index config parameter
326fn parse_single_index_config_param(config: &mut IndexConfig, p: pest::iterators::Pair<Rule>) {
327    use super::schema::VectorIndexType;
328
329    match p.as_rule() {
330        Rule::index_type_spec => {
331            config.index_type = Some(match p.as_str() {
332                "flat" => VectorIndexType::Flat,
333                "rabitq" => VectorIndexType::RaBitQ,
334                "ivf_rabitq" => VectorIndexType::IvfRaBitQ,
335                "scann" => VectorIndexType::ScaNN,
336                _ => VectorIndexType::RaBitQ,
337            });
338        }
339        Rule::index_type_kwarg => {
340            // index_type_kwarg = { "index" ~ ":" ~ index_type_spec }
341            if let Some(t) = p.into_inner().next() {
342                config.index_type = Some(match t.as_str() {
343                    "flat" => VectorIndexType::Flat,
344                    "rabitq" => VectorIndexType::RaBitQ,
345                    "ivf_rabitq" => VectorIndexType::IvfRaBitQ,
346                    "scann" => VectorIndexType::ScaNN,
347                    _ => VectorIndexType::RaBitQ,
348                });
349            }
350        }
351        Rule::num_clusters_kwarg => {
352            // num_clusters_kwarg = { "num_clusters" ~ ":" ~ num_clusters_spec }
353            if let Some(n) = p.into_inner().next() {
354                config.num_clusters = Some(n.as_str().parse().unwrap_or_else(|_| {
355                    log::warn!(
356                        "Invalid num_clusters value '{}', using default 256",
357                        n.as_str()
358                    );
359                    256
360                }));
361            }
362        }
363        Rule::build_threshold_kwarg => {
364            // build_threshold_kwarg = { "build_threshold" ~ ":" ~ build_threshold_spec }
365            if let Some(n) = p.into_inner().next() {
366                config.build_threshold = Some(n.as_str().parse().unwrap_or_else(|_| {
367                    log::warn!(
368                        "Invalid build_threshold value '{}', using default 10000",
369                        n.as_str()
370                    );
371                    10000
372                }));
373            }
374        }
375        Rule::nprobe_kwarg => {
376            // nprobe_kwarg = { "nprobe" ~ ":" ~ nprobe_spec }
377            if let Some(n) = p.into_inner().next() {
378                config.nprobe = Some(n.as_str().parse().unwrap_or_else(|_| {
379                    log::warn!("Invalid nprobe value '{}', using default 32", n.as_str());
380                    32
381                }));
382            }
383        }
384        Rule::quantization_kwarg => {
385            // quantization_kwarg = { "quantization" ~ ":" ~ quantization_spec }
386            if let Some(q) = p.into_inner().next() {
387                config.quantization = Some(match q.as_str() {
388                    "float32" | "f32" => WeightQuantization::Float32,
389                    "float16" | "f16" => WeightQuantization::Float16,
390                    "uint8" | "u8" => WeightQuantization::UInt8,
391                    "uint4" | "u4" => WeightQuantization::UInt4,
392                    _ => WeightQuantization::default(),
393                });
394            }
395        }
396        Rule::weight_threshold_kwarg => {
397            // weight_threshold_kwarg = { "weight_threshold" ~ ":" ~ weight_threshold_spec }
398            if let Some(t) = p.into_inner().next() {
399                config.weight_threshold = Some(t.as_str().parse().unwrap_or_else(|_| {
400                    log::warn!(
401                        "Invalid weight_threshold value '{}', using default 0.0",
402                        t.as_str()
403                    );
404                    0.0
405                }));
406            }
407        }
408        Rule::block_size_kwarg => {
409            // block_size_kwarg = { "block_size" ~ ":" ~ block_size_spec }
410            if let Some(n) = p.into_inner().next() {
411                config.block_size = Some(n.as_str().parse().unwrap_or_else(|_| {
412                    log::warn!(
413                        "Invalid block_size value '{}', using default 128",
414                        n.as_str()
415                    );
416                    128
417                }));
418            }
419        }
420        Rule::pruning_kwarg => {
421            // pruning_kwarg = { "pruning" ~ ":" ~ pruning_spec }
422            if let Some(f) = p.into_inner().next() {
423                config.pruning = Some(f.as_str().parse().unwrap_or_else(|_| {
424                    log::warn!("Invalid pruning value '{}', using default 1.0", f.as_str());
425                    1.0
426                }));
427            }
428        }
429        Rule::min_terms_kwarg => {
430            if let Some(n) = p.into_inner().next() {
431                config.min_terms = Some(n.as_str().parse().unwrap_or_else(|_| {
432                    log::warn!("Invalid min_terms value '{}', using default 4", n.as_str());
433                    4
434                }));
435            }
436        }
437        Rule::sparse_format_kwarg => {
438            // sparse_format_kwarg = { "format" ~ ":" ~ sparse_format_spec }
439            if let Some(f) = p.into_inner().next() {
440                config.sparse_format = Some(match f.as_str() {
441                    "bmp" => SparseFormat::Bmp,
442                    "maxscore" => SparseFormat::MaxScore,
443                    _ => SparseFormat::default(),
444                });
445            }
446        }
447        Rule::sparse_dims_kwarg => {
448            if let Some(n) = p.into_inner().next() {
449                config.dims = Some(n.as_str().parse().unwrap_or_else(|_| {
450                    log::warn!("Invalid dims value '{}', using default 105879", n.as_str());
451                    105879
452                }));
453            }
454        }
455        Rule::sparse_max_weight_kwarg => {
456            if let Some(f) = p.into_inner().next() {
457                config.max_weight = Some(f.as_str().parse().unwrap_or_else(|_| {
458                    log::warn!(
459                        "Invalid max_weight value '{}', using default 5.0",
460                        f.as_str()
461                    );
462                    5.0
463                }));
464            }
465        }
466        Rule::query_config_block => {
467            // query_config_block = { "query" ~ "<" ~ query_config_params ~ ">" }
468            parse_query_config_block(config, p);
469        }
470        Rule::positions_kwarg => {
471            // positions_kwarg = { "positions" | "ordinal" | "token_position" }
472            use super::schema::PositionMode;
473            config.positions = Some(match p.as_str() {
474                "ordinal" => PositionMode::Ordinal,
475                "token_position" => PositionMode::TokenPosition,
476                _ => PositionMode::Full, // "positions" or any other value defaults to Full
477            });
478        }
479        _ => {}
480    }
481}
482
483/// Parse query configuration block: query<tokenizer: "...", weighting: idf>
484fn parse_query_config_block(config: &mut IndexConfig, pair: pest::iterators::Pair<Rule>) {
485    for inner in pair.into_inner() {
486        if inner.as_rule() == Rule::query_config_params {
487            for param in inner.into_inner() {
488                if param.as_rule() == Rule::query_config_param {
489                    for p in param.into_inner() {
490                        match p.as_rule() {
491                            Rule::query_tokenizer_kwarg => {
492                                // query_tokenizer_kwarg = { "tokenizer" ~ ":" ~ tokenizer_path }
493                                if let Some(path) = p.into_inner().next()
494                                    && let Some(inner_path) = path.into_inner().next()
495                                {
496                                    config.query_tokenizer = Some(inner_path.as_str().to_string());
497                                }
498                            }
499                            Rule::query_weighting_kwarg => {
500                                // query_weighting_kwarg = { "weighting" ~ ":" ~ weighting_spec }
501                                if let Some(w) = p.into_inner().next() {
502                                    config.query_weighting = Some(match w.as_str() {
503                                        "one" => QueryWeighting::One,
504                                        "idf" => QueryWeighting::Idf,
505                                        "idf_file" => QueryWeighting::IdfFile,
506                                        _ => QueryWeighting::One,
507                                    });
508                                }
509                            }
510                            Rule::query_weight_threshold_kwarg => {
511                                if let Some(t) = p.into_inner().next() {
512                                    config.query_weight_threshold =
513                                        Some(t.as_str().parse().unwrap_or_else(|_| {
514                                            log::warn!(
515                                                "Invalid query weight_threshold '{}', using 0.0",
516                                                t.as_str()
517                                            );
518                                            0.0
519                                        }));
520                                }
521                            }
522                            Rule::query_max_dims_kwarg => {
523                                if let Some(t) = p.into_inner().next() {
524                                    config.query_max_dims =
525                                        Some(t.as_str().parse().unwrap_or_else(|_| {
526                                            log::warn!(
527                                                "Invalid query max_dims '{}', using 0",
528                                                t.as_str()
529                                            );
530                                            0
531                                        }));
532                                }
533                            }
534                            Rule::query_pruning_kwarg => {
535                                if let Some(t) = p.into_inner().next() {
536                                    config.query_pruning =
537                                        Some(t.as_str().parse().unwrap_or_else(|_| {
538                                            log::warn!(
539                                                "Invalid query pruning '{}', using 1.0",
540                                                t.as_str()
541                                            );
542                                            1.0
543                                        }));
544                                }
545                            }
546                            Rule::query_min_query_dims_kwarg => {
547                                if let Some(t) = p.into_inner().next() {
548                                    config.query_min_query_dims =
549                                        Some(t.as_str().parse().unwrap_or_else(|_| {
550                                            log::warn!(
551                                                "Invalid query min_query_dims '{}', using 4",
552                                                t.as_str()
553                                            );
554                                            4
555                                        }));
556                                }
557                            }
558                            _ => {}
559                        }
560                    }
561                }
562            }
563        }
564    }
565}
566
567/// Parse a field definition from pest pair
568fn parse_field_def(pair: pest::iterators::Pair<Rule>) -> Result<FieldDef> {
569    let mut inner = pair.into_inner();
570
571    let name = inner
572        .next()
573        .ok_or_else(|| Error::Schema("Missing field name".to_string()))?
574        .as_str()
575        .to_string();
576
577    let field_type_str = inner
578        .next()
579        .ok_or_else(|| Error::Schema("Missing field type".to_string()))?
580        .as_str();
581
582    let field_type = parse_field_type(field_type_str)?;
583
584    // Parse optional tokenizer spec, sparse_vector_config, dense_vector_config, and attributes
585    let mut tokenizer = None;
586    let mut sparse_vector_config = None;
587    let mut dense_vector_config = None;
588    let mut indexed = true;
589    let mut stored = true;
590    let mut multi = false;
591    let mut fast = false;
592    let mut primary = false;
593    let mut simhash = false;
594    let mut index_config: Option<IndexConfig> = None;
595
596    for item in inner {
597        match item.as_rule() {
598            Rule::tokenizer_spec => {
599                // Extract tokenizer name from <name>
600                if let Some(tok_name) = item.into_inner().next() {
601                    tokenizer = Some(tok_name.as_str().to_string());
602                }
603            }
604            Rule::sparse_vector_config => {
605                // Parse named parameters: <index_size: u16, quantization: uint8, weight_threshold: 0.1>
606                sparse_vector_config = Some(parse_sparse_vector_config(item));
607            }
608            Rule::dense_vector_config => {
609                // Parse dense_vector_params (keyword or positional) - only dims
610                dense_vector_config = Some(parse_dense_vector_config(item));
611            }
612            Rule::attributes => {
613                let (idx, sto, mul, fst, pri, sim, idx_cfg) = parse_attributes(item);
614                indexed = idx;
615                stored = sto;
616                multi = mul;
617                fast = fst;
618                primary = pri;
619                simhash = sim;
620                index_config = idx_cfg;
621            }
622            _ => {}
623        }
624    }
625
626    // Primary key implies fast + indexed (needed for dedup lookups)
627    if primary {
628        fast = true;
629        indexed = true;
630    }
631
632    // simhash requires sparse_vector — auto-computed from vector data during indexing
633    if simhash && field_type != FieldType::SparseVector {
634        return Err(Error::Schema(format!(
635            "simhash attribute on field '{}' requires type sparse_vector, got {:?}",
636            name, field_type
637        )));
638    }
639
640    // Merge index config into vector configs if both exist
641    let mut positions = None;
642    if let Some(idx_cfg) = index_config {
643        positions = idx_cfg.positions;
644        if let Some(ref mut dv_config) = dense_vector_config {
645            apply_index_config_to_dense_vector(dv_config, idx_cfg);
646        } else if field_type == FieldType::SparseVector {
647            // For sparse vectors, create default config if not present and apply index params
648            let sv_config = sparse_vector_config.get_or_insert(SparseVectorConfig::default());
649            apply_index_config_to_sparse_vector(sv_config, idx_cfg);
650        }
651    }
652
653    Ok(FieldDef {
654        name,
655        field_type,
656        indexed,
657        stored,
658        tokenizer,
659        multi,
660        positions,
661        sparse_vector_config,
662        dense_vector_config,
663        fast,
664        primary,
665        simhash,
666    })
667}
668
669/// Apply index configuration from indexed<...> to DenseVectorConfig
670fn apply_index_config_to_dense_vector(config: &mut DenseVectorConfig, idx_cfg: IndexConfig) {
671    // Apply index type if specified
672    if let Some(index_type) = idx_cfg.index_type {
673        config.index_type = index_type;
674    }
675
676    // Apply num_clusters for IVF-based indexes
677    if idx_cfg.num_clusters.is_some() {
678        config.num_clusters = idx_cfg.num_clusters;
679    }
680
681    // Apply nprobe if specified
682    if let Some(nprobe) = idx_cfg.nprobe {
683        config.nprobe = nprobe;
684    }
685
686    // Apply build_threshold if specified
687    if idx_cfg.build_threshold.is_some() {
688        config.build_threshold = idx_cfg.build_threshold;
689    }
690}
691
692/// Parse sparse_vector_config - only index_size (positional)
693/// Example: <u16> or <u32>
694fn parse_sparse_vector_config(pair: pest::iterators::Pair<Rule>) -> SparseVectorConfig {
695    let mut index_size = IndexSize::default();
696
697    // Parse positional index_size_spec
698    for inner in pair.into_inner() {
699        if inner.as_rule() == Rule::index_size_spec {
700            index_size = match inner.as_str() {
701                "u16" => IndexSize::U16,
702                "u32" => IndexSize::U32,
703                _ => IndexSize::default(),
704            };
705        }
706    }
707
708    SparseVectorConfig {
709        format: SparseFormat::default(),
710        index_size,
711        weight_quantization: WeightQuantization::default(),
712        weight_threshold: 0.0,
713        block_size: 128,
714        bmp_block_size: 64,
715        max_bmp_grid_bytes: 0,
716        bmp_superblock_size: 64,
717        pruning: None,
718        query_config: None,
719        dims: None,
720        max_weight: None,
721        min_terms: 4,
722    }
723}
724
725/// Apply index configuration from indexed<...> to SparseVectorConfig
726fn apply_index_config_to_sparse_vector(config: &mut SparseVectorConfig, idx_cfg: IndexConfig) {
727    if let Some(f) = idx_cfg.sparse_format {
728        config.format = f;
729    }
730    if let Some(q) = idx_cfg.quantization {
731        config.weight_quantization = q;
732    }
733    if let Some(t) = idx_cfg.weight_threshold {
734        config.weight_threshold = t;
735    }
736    if let Some(bs) = idx_cfg.block_size {
737        let adjusted = bs.next_power_of_two();
738        if adjusted != bs {
739            log::warn!(
740                "block_size {} adjusted to next power of two: {}",
741                bs,
742                adjusted
743            );
744        }
745        config.block_size = adjusted;
746    }
747    if let Some(p) = idx_cfg.pruning {
748        let clamped = p.clamp(0.0, 1.0);
749        if (clamped - p).abs() > f32::EPSILON {
750            log::warn!(
751                "pruning {} clamped to valid range [0.0, 1.0]: {}",
752                p,
753                clamped
754            );
755        }
756        config.pruning = Some(clamped);
757    }
758    if let Some(mt) = idx_cfg.min_terms {
759        config.min_terms = mt;
760    }
761    if let Some(d) = idx_cfg.dims {
762        config.dims = Some(d);
763    }
764    if let Some(mw) = idx_cfg.max_weight {
765        config.max_weight = Some(mw);
766    }
767    // Apply query-time configuration if present
768    if idx_cfg.query_tokenizer.is_some()
769        || idx_cfg.query_weighting.is_some()
770        || idx_cfg.query_weight_threshold.is_some()
771        || idx_cfg.query_max_dims.is_some()
772        || idx_cfg.query_pruning.is_some()
773        || idx_cfg.query_min_query_dims.is_some()
774    {
775        let query_config = config
776            .query_config
777            .get_or_insert(SparseQueryConfig::default());
778        if let Some(tokenizer) = idx_cfg.query_tokenizer {
779            query_config.tokenizer = Some(tokenizer);
780        }
781        if let Some(weighting) = idx_cfg.query_weighting {
782            query_config.weighting = weighting;
783        }
784        if let Some(t) = idx_cfg.query_weight_threshold {
785            query_config.weight_threshold = t;
786        }
787        if let Some(d) = idx_cfg.query_max_dims {
788            query_config.max_query_dims = Some(d);
789        }
790        if let Some(p) = idx_cfg.query_pruning {
791            query_config.pruning = Some(p);
792        }
793        if let Some(m) = idx_cfg.query_min_query_dims {
794            query_config.min_query_dims = m;
795        }
796    }
797}
798
799/// Parse dense_vector_config - dims and optional quantization type
800/// All index-related params are in indexed<...> attribute
801fn parse_dense_vector_config(pair: pest::iterators::Pair<Rule>) -> DenseVectorConfig {
802    let mut dim: usize = 0;
803    let mut quantization = DenseVectorQuantization::F32;
804
805    // Navigate to dense_vector_params
806    for params in pair.into_inner() {
807        if params.as_rule() == Rule::dense_vector_params {
808            for inner in params.into_inner() {
809                match inner.as_rule() {
810                    Rule::dense_vector_keyword_params => {
811                        for kwarg in inner.into_inner() {
812                            match kwarg.as_rule() {
813                                Rule::dims_kwarg => {
814                                    if let Some(d) = kwarg.into_inner().next() {
815                                        dim = d.as_str().parse().unwrap_or(0);
816                                    }
817                                }
818                                Rule::quant_type_spec => {
819                                    quantization = parse_quant_type(kwarg.as_str());
820                                }
821                                _ => {}
822                            }
823                        }
824                    }
825                    Rule::dense_vector_positional_params => {
826                        for item in inner.into_inner() {
827                            match item.as_rule() {
828                                Rule::dimension_spec => {
829                                    dim = item.as_str().parse().unwrap_or(0);
830                                }
831                                Rule::quant_type_spec => {
832                                    quantization = parse_quant_type(item.as_str());
833                                }
834                                _ => {}
835                            }
836                        }
837                    }
838                    _ => {}
839                }
840            }
841        }
842    }
843
844    DenseVectorConfig::new(dim).with_quantization(quantization)
845}
846
847fn parse_quant_type(s: &str) -> DenseVectorQuantization {
848    match s.trim() {
849        "f16" => DenseVectorQuantization::F16,
850        "uint8" | "u8" => DenseVectorQuantization::UInt8,
851        _ => DenseVectorQuantization::F32,
852    }
853}
854
855/// Parse default_fields definition
856fn parse_default_fields_def(pair: pest::iterators::Pair<Rule>) -> Vec<String> {
857    pair.into_inner().map(|p| p.as_str().to_string()).collect()
858}
859
860/// Parse a query router definition
861fn parse_query_router_def(pair: pest::iterators::Pair<Rule>) -> Result<QueryRouterRule> {
862    let mut pattern = String::new();
863    let mut substitution = String::new();
864    let mut target_field = String::new();
865    let mut mode = RoutingMode::Additional;
866
867    for prop in pair.into_inner() {
868        if prop.as_rule() != Rule::query_router_prop {
869            continue;
870        }
871
872        for inner in prop.into_inner() {
873            match inner.as_rule() {
874                Rule::query_router_pattern => {
875                    if let Some(regex_str) = inner.into_inner().next() {
876                        pattern = parse_string_value(regex_str);
877                    }
878                }
879                Rule::query_router_substitution => {
880                    if let Some(quoted) = inner.into_inner().next() {
881                        substitution = parse_string_value(quoted);
882                    }
883                }
884                Rule::query_router_target => {
885                    if let Some(ident) = inner.into_inner().next() {
886                        target_field = ident.as_str().to_string();
887                    }
888                }
889                Rule::query_router_mode => {
890                    if let Some(mode_val) = inner.into_inner().next() {
891                        mode = match mode_val.as_str() {
892                            "exclusive" => RoutingMode::Exclusive,
893                            "additional" => RoutingMode::Additional,
894                            _ => RoutingMode::Additional,
895                        };
896                    }
897                }
898                _ => {}
899            }
900        }
901    }
902
903    if pattern.is_empty() {
904        return Err(Error::Schema("query_router missing 'pattern'".to_string()));
905    }
906    if substitution.is_empty() {
907        return Err(Error::Schema(
908            "query_router missing 'substitution'".to_string(),
909        ));
910    }
911    if target_field.is_empty() {
912        return Err(Error::Schema(
913            "query_router missing 'target_field'".to_string(),
914        ));
915    }
916
917    Ok(QueryRouterRule {
918        pattern,
919        substitution,
920        target_field,
921        mode,
922    })
923}
924
925/// Parse a string value from quoted_string, raw_string, or regex_string
926fn parse_string_value(pair: pest::iterators::Pair<Rule>) -> String {
927    let s = pair.as_str();
928    match pair.as_rule() {
929        Rule::regex_string => {
930            // regex_string contains either raw_string or quoted_string
931            if let Some(inner) = pair.into_inner().next() {
932                parse_string_value(inner)
933            } else {
934                s.to_string()
935            }
936        }
937        Rule::raw_string => {
938            // r"..." - strip r" prefix and " suffix
939            s[2..s.len() - 1].to_string()
940        }
941        Rule::quoted_string => {
942            // "..." - strip quotes and handle escapes
943            let inner = &s[1..s.len() - 1];
944            // Simple escape handling
945            inner
946                .replace("\\n", "\n")
947                .replace("\\t", "\t")
948                .replace("\\\"", "\"")
949                .replace("\\\\", "\\")
950        }
951        _ => s.to_string(),
952    }
953}
954
955/// Parse an index definition from pest pair
956fn parse_index_def(pair: pest::iterators::Pair<Rule>) -> Result<IndexDef> {
957    let mut inner = pair.into_inner();
958
959    let name = inner
960        .next()
961        .ok_or_else(|| Error::Schema("Missing index name".to_string()))?
962        .as_str()
963        .to_string();
964
965    let mut fields = Vec::new();
966    let mut default_fields = Vec::new();
967    let mut query_routers = Vec::new();
968
969    for item in inner {
970        match item.as_rule() {
971            Rule::field_def => {
972                fields.push(parse_field_def(item)?);
973            }
974            Rule::default_fields_def => {
975                default_fields = parse_default_fields_def(item);
976            }
977            Rule::query_router_def => {
978                query_routers.push(parse_query_router_def(item)?);
979            }
980            _ => {}
981        }
982    }
983
984    // Validate primary key constraints
985    let primary_fields: Vec<&FieldDef> = fields.iter().filter(|f| f.primary).collect();
986    if primary_fields.len() > 1 {
987        return Err(Error::Schema(format!(
988            "Index '{}' has {} primary key fields, but at most one is allowed",
989            name,
990            primary_fields.len()
991        )));
992    }
993    if let Some(pk) = primary_fields.first() {
994        if pk.field_type != FieldType::Text {
995            return Err(Error::Schema(format!(
996                "Primary key field '{}' must be of type text, got {:?}",
997                pk.name, pk.field_type
998            )));
999        }
1000        if pk.multi {
1001            return Err(Error::Schema(format!(
1002                "Primary key field '{}' cannot be multi-valued",
1003                pk.name
1004            )));
1005        }
1006    }
1007
1008    Ok(IndexDef {
1009        name,
1010        fields,
1011        default_fields,
1012        query_routers,
1013    })
1014}
1015
1016/// Parse SDL from a string
1017pub fn parse_sdl(input: &str) -> Result<Vec<IndexDef>> {
1018    let pairs = SdlParser::parse(Rule::file, input)
1019        .map_err(|e| Error::Schema(format!("Parse error: {}", e)))?;
1020
1021    let mut indexes = Vec::new();
1022
1023    for pair in pairs {
1024        if pair.as_rule() == Rule::file {
1025            for inner in pair.into_inner() {
1026                if inner.as_rule() == Rule::index_def {
1027                    indexes.push(parse_index_def(inner)?);
1028                }
1029            }
1030        }
1031    }
1032
1033    Ok(indexes)
1034}
1035
1036/// Parse SDL and return a single index definition
1037pub fn parse_single_index(input: &str) -> Result<IndexDef> {
1038    let indexes = parse_sdl(input)?;
1039
1040    if indexes.is_empty() {
1041        return Err(Error::Schema("No index definition found".to_string()));
1042    }
1043
1044    if indexes.len() > 1 {
1045        return Err(Error::Schema(
1046            "Multiple index definitions found, expected one".to_string(),
1047        ));
1048    }
1049
1050    Ok(indexes.into_iter().next().unwrap())
1051}
1052
1053#[cfg(test)]
1054mod tests {
1055    use super::*;
1056
1057    #[test]
1058    fn test_parse_simple_schema() {
1059        let sdl = r#"
1060            index articles {
1061                field title: text [indexed, stored]
1062                field body: text [indexed]
1063            }
1064        "#;
1065
1066        let indexes = parse_sdl(sdl).unwrap();
1067        assert_eq!(indexes.len(), 1);
1068
1069        let index = &indexes[0];
1070        assert_eq!(index.name, "articles");
1071        assert_eq!(index.fields.len(), 2);
1072
1073        assert_eq!(index.fields[0].name, "title");
1074        assert!(matches!(index.fields[0].field_type, FieldType::Text));
1075        assert!(index.fields[0].indexed);
1076        assert!(index.fields[0].stored);
1077
1078        assert_eq!(index.fields[1].name, "body");
1079        assert!(matches!(index.fields[1].field_type, FieldType::Text));
1080        assert!(index.fields[1].indexed);
1081        assert!(!index.fields[1].stored);
1082    }
1083
1084    #[test]
1085    fn test_parse_all_field_types() {
1086        let sdl = r#"
1087            index test {
1088                field text_field: text [indexed, stored]
1089                field u64_field: u64 [indexed, stored]
1090                field i64_field: i64 [indexed, stored]
1091                field f64_field: f64 [indexed, stored]
1092                field bytes_field: bytes [stored]
1093            }
1094        "#;
1095
1096        let indexes = parse_sdl(sdl).unwrap();
1097        let index = &indexes[0];
1098
1099        assert!(matches!(index.fields[0].field_type, FieldType::Text));
1100        assert!(matches!(index.fields[1].field_type, FieldType::U64));
1101        assert!(matches!(index.fields[2].field_type, FieldType::I64));
1102        assert!(matches!(index.fields[3].field_type, FieldType::F64));
1103        assert!(matches!(index.fields[4].field_type, FieldType::Bytes));
1104    }
1105
1106    #[test]
1107    fn test_parse_with_comments() {
1108        let sdl = r#"
1109            # This is a comment
1110            index articles {
1111                # Title field
1112                field title: text [indexed, stored]
1113                field body: text [indexed] # inline comment not supported yet
1114            }
1115        "#;
1116
1117        let indexes = parse_sdl(sdl).unwrap();
1118        assert_eq!(indexes[0].fields.len(), 2);
1119    }
1120
1121    #[test]
1122    fn test_parse_type_aliases() {
1123        let sdl = r#"
1124            index test {
1125                field a: string [indexed]
1126                field b: int [indexed]
1127                field c: uint [indexed]
1128                field d: float [indexed]
1129                field e: binary [stored]
1130            }
1131        "#;
1132
1133        let indexes = parse_sdl(sdl).unwrap();
1134        let index = &indexes[0];
1135
1136        assert!(matches!(index.fields[0].field_type, FieldType::Text));
1137        assert!(matches!(index.fields[1].field_type, FieldType::I64));
1138        assert!(matches!(index.fields[2].field_type, FieldType::U64));
1139        assert!(matches!(index.fields[3].field_type, FieldType::F64));
1140        assert!(matches!(index.fields[4].field_type, FieldType::Bytes));
1141    }
1142
1143    #[test]
1144    fn test_to_schema() {
1145        let sdl = r#"
1146            index articles {
1147                field title: text [indexed, stored]
1148                field views: u64 [indexed, stored]
1149            }
1150        "#;
1151
1152        let indexes = parse_sdl(sdl).unwrap();
1153        let schema = indexes[0].to_schema();
1154
1155        assert!(schema.get_field("title").is_some());
1156        assert!(schema.get_field("views").is_some());
1157        assert!(schema.get_field("nonexistent").is_none());
1158    }
1159
1160    #[test]
1161    fn test_default_attributes() {
1162        let sdl = r#"
1163            index test {
1164                field title: text
1165            }
1166        "#;
1167
1168        let indexes = parse_sdl(sdl).unwrap();
1169        let field = &indexes[0].fields[0];
1170
1171        // Default should be indexed and stored
1172        assert!(field.indexed);
1173        assert!(field.stored);
1174    }
1175
1176    #[test]
1177    fn test_multiple_indexes() {
1178        let sdl = r#"
1179            index articles {
1180                field title: text [indexed, stored]
1181            }
1182
1183            index users {
1184                field name: text [indexed, stored]
1185                field email: text [indexed, stored]
1186            }
1187        "#;
1188
1189        let indexes = parse_sdl(sdl).unwrap();
1190        assert_eq!(indexes.len(), 2);
1191        assert_eq!(indexes[0].name, "articles");
1192        assert_eq!(indexes[1].name, "users");
1193    }
1194
1195    #[test]
1196    fn test_tokenizer_spec() {
1197        let sdl = r#"
1198            index articles {
1199                field title: text<en_stem> [indexed, stored]
1200                field body: text<simple> [indexed]
1201                field author: text [indexed, stored]
1202            }
1203        "#;
1204
1205        let indexes = parse_sdl(sdl).unwrap();
1206        let index = &indexes[0];
1207
1208        assert_eq!(index.fields[0].name, "title");
1209        assert_eq!(index.fields[0].tokenizer, Some("en_stem".to_string()));
1210
1211        assert_eq!(index.fields[1].name, "body");
1212        assert_eq!(index.fields[1].tokenizer, Some("simple".to_string()));
1213
1214        assert_eq!(index.fields[2].name, "author");
1215        assert_eq!(index.fields[2].tokenizer, None); // No tokenizer specified
1216    }
1217
1218    #[test]
1219    fn test_tokenizer_in_schema() {
1220        let sdl = r#"
1221            index articles {
1222                field title: text<german> [indexed, stored]
1223                field body: text<en_stem> [indexed]
1224            }
1225        "#;
1226
1227        let indexes = parse_sdl(sdl).unwrap();
1228        let schema = indexes[0].to_schema();
1229
1230        let title_field = schema.get_field("title").unwrap();
1231        let title_entry = schema.get_field_entry(title_field).unwrap();
1232        assert_eq!(title_entry.tokenizer, Some("german".to_string()));
1233
1234        let body_field = schema.get_field("body").unwrap();
1235        let body_entry = schema.get_field_entry(body_field).unwrap();
1236        assert_eq!(body_entry.tokenizer, Some("en_stem".to_string()));
1237    }
1238
1239    #[test]
1240    fn test_query_router_basic() {
1241        let sdl = r#"
1242            index documents {
1243                field title: text [indexed, stored]
1244                field uri: text [indexed, stored]
1245
1246                query_router {
1247                    pattern: "10\\.\\d{4,}/[^\\s]+"
1248                    substitution: "doi://{0}"
1249                    target_field: uris
1250                    mode: exclusive
1251                }
1252            }
1253        "#;
1254
1255        let indexes = parse_sdl(sdl).unwrap();
1256        let index = &indexes[0];
1257
1258        assert_eq!(index.query_routers.len(), 1);
1259        let router = &index.query_routers[0];
1260        assert_eq!(router.pattern, r"10\.\d{4,}/[^\s]+");
1261        assert_eq!(router.substitution, "doi://{0}");
1262        assert_eq!(router.target_field, "uris");
1263        assert_eq!(router.mode, RoutingMode::Exclusive);
1264    }
1265
1266    #[test]
1267    fn test_query_router_raw_string() {
1268        let sdl = r#"
1269            index documents {
1270                field uris: text [indexed, stored]
1271
1272                query_router {
1273                    pattern: r"^pmid:(\d+)$"
1274                    substitution: "pubmed://{1}"
1275                    target_field: uris
1276                    mode: additional
1277                }
1278            }
1279        "#;
1280
1281        let indexes = parse_sdl(sdl).unwrap();
1282        let router = &indexes[0].query_routers[0];
1283
1284        assert_eq!(router.pattern, r"^pmid:(\d+)$");
1285        assert_eq!(router.substitution, "pubmed://{1}");
1286        assert_eq!(router.mode, RoutingMode::Additional);
1287    }
1288
1289    #[test]
1290    fn test_multiple_query_routers() {
1291        let sdl = r#"
1292            index documents {
1293                field uris: text [indexed, stored]
1294
1295                query_router {
1296                    pattern: r"^doi:(10\.\d{4,}/[^\s]+)$"
1297                    substitution: "doi://{1}"
1298                    target_field: uris
1299                    mode: exclusive
1300                }
1301
1302                query_router {
1303                    pattern: r"^pmid:(\d+)$"
1304                    substitution: "pubmed://{1}"
1305                    target_field: uris
1306                    mode: exclusive
1307                }
1308
1309                query_router {
1310                    pattern: r"^arxiv:(\d+\.\d+)$"
1311                    substitution: "arxiv://{1}"
1312                    target_field: uris
1313                    mode: additional
1314                }
1315            }
1316        "#;
1317
1318        let indexes = parse_sdl(sdl).unwrap();
1319        assert_eq!(indexes[0].query_routers.len(), 3);
1320    }
1321
1322    #[test]
1323    fn test_query_router_default_mode() {
1324        let sdl = r#"
1325            index documents {
1326                field uris: text [indexed, stored]
1327
1328                query_router {
1329                    pattern: r"test"
1330                    substitution: "{0}"
1331                    target_field: uris
1332                }
1333            }
1334        "#;
1335
1336        let indexes = parse_sdl(sdl).unwrap();
1337        // Default mode should be Additional
1338        assert_eq!(indexes[0].query_routers[0].mode, RoutingMode::Additional);
1339    }
1340
1341    #[test]
1342    fn test_multi_attribute() {
1343        let sdl = r#"
1344            index documents {
1345                field uris: text [indexed, stored<multi>]
1346                field title: text [indexed, stored]
1347            }
1348        "#;
1349
1350        let indexes = parse_sdl(sdl).unwrap();
1351        assert_eq!(indexes.len(), 1);
1352
1353        let fields = &indexes[0].fields;
1354        assert_eq!(fields.len(), 2);
1355
1356        // uris should have multi=true
1357        assert_eq!(fields[0].name, "uris");
1358        assert!(fields[0].multi, "uris field should have multi=true");
1359
1360        // title should have multi=false
1361        assert_eq!(fields[1].name, "title");
1362        assert!(!fields[1].multi, "title field should have multi=false");
1363
1364        // Verify schema conversion preserves multi attribute
1365        let schema = indexes[0].to_schema();
1366        let uris_field = schema.get_field("uris").unwrap();
1367        let title_field = schema.get_field("title").unwrap();
1368
1369        assert!(schema.get_field_entry(uris_field).unwrap().multi);
1370        assert!(!schema.get_field_entry(title_field).unwrap().multi);
1371    }
1372
1373    #[test]
1374    fn test_sparse_vector_field() {
1375        let sdl = r#"
1376            index documents {
1377                field embedding: sparse_vector [indexed, stored]
1378            }
1379        "#;
1380
1381        let indexes = parse_sdl(sdl).unwrap();
1382        assert_eq!(indexes.len(), 1);
1383        assert_eq!(indexes[0].fields.len(), 1);
1384        assert_eq!(indexes[0].fields[0].name, "embedding");
1385        assert_eq!(indexes[0].fields[0].field_type, FieldType::SparseVector);
1386        assert!(indexes[0].fields[0].sparse_vector_config.is_none());
1387    }
1388
1389    #[test]
1390    fn test_sparse_vector_with_config() {
1391        let sdl = r#"
1392            index documents {
1393                field embedding: sparse_vector<u16> [indexed<quantization: uint8>, stored]
1394                field dense: sparse_vector<u32> [indexed<quantization: float32>]
1395            }
1396        "#;
1397
1398        let indexes = parse_sdl(sdl).unwrap();
1399        assert_eq!(indexes[0].fields.len(), 2);
1400
1401        // First field: u16 indices, uint8 quantization
1402        let f1 = &indexes[0].fields[0];
1403        assert_eq!(f1.name, "embedding");
1404        let config1 = f1.sparse_vector_config.as_ref().unwrap();
1405        assert_eq!(config1.index_size, IndexSize::U16);
1406        assert_eq!(config1.weight_quantization, WeightQuantization::UInt8);
1407
1408        // Second field: u32 indices, float32 quantization
1409        let f2 = &indexes[0].fields[1];
1410        assert_eq!(f2.name, "dense");
1411        let config2 = f2.sparse_vector_config.as_ref().unwrap();
1412        assert_eq!(config2.index_size, IndexSize::U32);
1413        assert_eq!(config2.weight_quantization, WeightQuantization::Float32);
1414    }
1415
1416    #[test]
1417    fn test_sparse_vector_with_weight_threshold() {
1418        let sdl = r#"
1419            index documents {
1420                field embedding: sparse_vector<u16> [indexed<quantization: uint8, weight_threshold: 0.1>, stored]
1421                field embedding2: sparse_vector<u32> [indexed<quantization: float16, weight_threshold: 0.05>]
1422            }
1423        "#;
1424
1425        let indexes = parse_sdl(sdl).unwrap();
1426        assert_eq!(indexes[0].fields.len(), 2);
1427
1428        // First field: u16 indices, uint8 quantization, threshold 0.1
1429        let f1 = &indexes[0].fields[0];
1430        assert_eq!(f1.name, "embedding");
1431        let config1 = f1.sparse_vector_config.as_ref().unwrap();
1432        assert_eq!(config1.index_size, IndexSize::U16);
1433        assert_eq!(config1.weight_quantization, WeightQuantization::UInt8);
1434        assert!((config1.weight_threshold - 0.1).abs() < 0.001);
1435
1436        // Second field: u32 indices, float16 quantization, threshold 0.05
1437        let f2 = &indexes[0].fields[1];
1438        assert_eq!(f2.name, "embedding2");
1439        let config2 = f2.sparse_vector_config.as_ref().unwrap();
1440        assert_eq!(config2.index_size, IndexSize::U32);
1441        assert_eq!(config2.weight_quantization, WeightQuantization::Float16);
1442        assert!((config2.weight_threshold - 0.05).abs() < 0.001);
1443    }
1444
1445    #[test]
1446    fn test_sparse_vector_with_pruning() {
1447        let sdl = r#"
1448            index documents {
1449                field embedding: sparse_vector [indexed<quantization: uint8, pruning: 0.1>, stored]
1450            }
1451        "#;
1452
1453        let indexes = parse_sdl(sdl).unwrap();
1454        let f = &indexes[0].fields[0];
1455        assert_eq!(f.name, "embedding");
1456        let config = f.sparse_vector_config.as_ref().unwrap();
1457        assert_eq!(config.weight_quantization, WeightQuantization::UInt8);
1458        assert_eq!(config.pruning, Some(0.1));
1459    }
1460
1461    #[test]
1462    fn test_dense_vector_field() {
1463        let sdl = r#"
1464            index documents {
1465                field embedding: dense_vector<768> [indexed, stored]
1466            }
1467        "#;
1468
1469        let indexes = parse_sdl(sdl).unwrap();
1470        assert_eq!(indexes.len(), 1);
1471        assert_eq!(indexes[0].fields.len(), 1);
1472
1473        let f = &indexes[0].fields[0];
1474        assert_eq!(f.name, "embedding");
1475        assert_eq!(f.field_type, FieldType::DenseVector);
1476
1477        let config = f.dense_vector_config.as_ref().unwrap();
1478        assert_eq!(config.dim, 768);
1479    }
1480
1481    #[test]
1482    fn test_dense_vector_alias() {
1483        let sdl = r#"
1484            index documents {
1485                field embedding: vector<1536> [indexed]
1486            }
1487        "#;
1488
1489        let indexes = parse_sdl(sdl).unwrap();
1490        assert_eq!(indexes[0].fields[0].field_type, FieldType::DenseVector);
1491        assert_eq!(
1492            indexes[0].fields[0]
1493                .dense_vector_config
1494                .as_ref()
1495                .unwrap()
1496                .dim,
1497            1536
1498        );
1499    }
1500
1501    #[test]
1502    fn test_dense_vector_with_num_clusters() {
1503        let sdl = r#"
1504            index documents {
1505                field embedding: dense_vector<768> [indexed<ivf_rabitq, num_clusters: 256>, stored]
1506            }
1507        "#;
1508
1509        let indexes = parse_sdl(sdl).unwrap();
1510        assert_eq!(indexes.len(), 1);
1511
1512        let f = &indexes[0].fields[0];
1513        assert_eq!(f.name, "embedding");
1514        assert_eq!(f.field_type, FieldType::DenseVector);
1515
1516        let config = f.dense_vector_config.as_ref().unwrap();
1517        assert_eq!(config.dim, 768);
1518        assert_eq!(config.num_clusters, Some(256));
1519        assert_eq!(config.nprobe, 32); // default
1520    }
1521
1522    #[test]
1523    fn test_dense_vector_with_num_clusters_and_nprobe() {
1524        let sdl = r#"
1525            index documents {
1526                field embedding: dense_vector<1536> [indexed<ivf_rabitq, num_clusters: 512, nprobe: 64>]
1527            }
1528        "#;
1529
1530        let indexes = parse_sdl(sdl).unwrap();
1531        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1532
1533        assert_eq!(config.dim, 1536);
1534        assert_eq!(config.num_clusters, Some(512));
1535        assert_eq!(config.nprobe, 64);
1536    }
1537
1538    #[test]
1539    fn test_dense_vector_keyword_syntax() {
1540        let sdl = r#"
1541            index documents {
1542                field embedding: dense_vector<dims: 1536> [indexed, stored]
1543            }
1544        "#;
1545
1546        let indexes = parse_sdl(sdl).unwrap();
1547        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1548
1549        assert_eq!(config.dim, 1536);
1550        assert!(config.num_clusters.is_none());
1551    }
1552
1553    #[test]
1554    fn test_dense_vector_keyword_syntax_full() {
1555        let sdl = r#"
1556            index documents {
1557                field embedding: dense_vector<dims: 1536> [indexed<ivf_rabitq, num_clusters: 256, nprobe: 64>]
1558            }
1559        "#;
1560
1561        let indexes = parse_sdl(sdl).unwrap();
1562        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1563
1564        assert_eq!(config.dim, 1536);
1565        assert_eq!(config.num_clusters, Some(256));
1566        assert_eq!(config.nprobe, 64);
1567    }
1568
1569    #[test]
1570    fn test_dense_vector_keyword_syntax_partial() {
1571        let sdl = r#"
1572            index documents {
1573                field embedding: dense_vector<dims: 768> [indexed<ivf_rabitq, num_clusters: 128>]
1574            }
1575        "#;
1576
1577        let indexes = parse_sdl(sdl).unwrap();
1578        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1579
1580        assert_eq!(config.dim, 768);
1581        assert_eq!(config.num_clusters, Some(128));
1582        assert_eq!(config.nprobe, 32); // default
1583    }
1584
1585    #[test]
1586    fn test_dense_vector_scann_index() {
1587        use crate::dsl::schema::VectorIndexType;
1588
1589        let sdl = r#"
1590            index documents {
1591                field embedding: dense_vector<dims: 768> [indexed<scann, num_clusters: 256, nprobe: 64>]
1592            }
1593        "#;
1594
1595        let indexes = parse_sdl(sdl).unwrap();
1596        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1597
1598        assert_eq!(config.dim, 768);
1599        assert_eq!(config.index_type, VectorIndexType::ScaNN);
1600        assert_eq!(config.num_clusters, Some(256));
1601        assert_eq!(config.nprobe, 64);
1602    }
1603
1604    #[test]
1605    fn test_dense_vector_ivf_rabitq_index() {
1606        use crate::dsl::schema::VectorIndexType;
1607
1608        let sdl = r#"
1609            index documents {
1610                field embedding: dense_vector<dims: 1536> [indexed<ivf_rabitq, num_clusters: 512>]
1611            }
1612        "#;
1613
1614        let indexes = parse_sdl(sdl).unwrap();
1615        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1616
1617        assert_eq!(config.dim, 1536);
1618        assert_eq!(config.index_type, VectorIndexType::IvfRaBitQ);
1619        assert_eq!(config.num_clusters, Some(512));
1620    }
1621
1622    #[test]
1623    fn test_dense_vector_rabitq_no_clusters() {
1624        use crate::dsl::schema::VectorIndexType;
1625
1626        let sdl = r#"
1627            index documents {
1628                field embedding: dense_vector<dims: 768> [indexed<rabitq>]
1629            }
1630        "#;
1631
1632        let indexes = parse_sdl(sdl).unwrap();
1633        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1634
1635        assert_eq!(config.dim, 768);
1636        assert_eq!(config.index_type, VectorIndexType::RaBitQ);
1637        assert!(config.num_clusters.is_none());
1638    }
1639
1640    #[test]
1641    fn test_dense_vector_flat_index() {
1642        use crate::dsl::schema::VectorIndexType;
1643
1644        let sdl = r#"
1645            index documents {
1646                field embedding: dense_vector<dims: 768> [indexed<flat>]
1647            }
1648        "#;
1649
1650        let indexes = parse_sdl(sdl).unwrap();
1651        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1652
1653        assert_eq!(config.dim, 768);
1654        assert_eq!(config.index_type, VectorIndexType::Flat);
1655    }
1656
1657    #[test]
1658    fn test_dense_vector_default_index_type() {
1659        use crate::dsl::schema::VectorIndexType;
1660
1661        // When no index type specified, should default to RaBitQ (basic)
1662        let sdl = r#"
1663            index documents {
1664                field embedding: dense_vector<dims: 768> [indexed]
1665            }
1666        "#;
1667
1668        let indexes = parse_sdl(sdl).unwrap();
1669        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1670
1671        assert_eq!(config.dim, 768);
1672        assert_eq!(config.index_type, VectorIndexType::RaBitQ);
1673    }
1674
1675    #[test]
1676    fn test_dense_vector_f16_quantization() {
1677        use crate::dsl::schema::{DenseVectorQuantization, VectorIndexType};
1678
1679        let sdl = r#"
1680            index documents {
1681                field embedding: dense_vector<768, f16> [indexed]
1682            }
1683        "#;
1684
1685        let indexes = parse_sdl(sdl).unwrap();
1686        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1687
1688        assert_eq!(config.dim, 768);
1689        assert_eq!(config.quantization, DenseVectorQuantization::F16);
1690        assert_eq!(config.index_type, VectorIndexType::RaBitQ);
1691    }
1692
1693    #[test]
1694    fn test_dense_vector_uint8_quantization() {
1695        use crate::dsl::schema::DenseVectorQuantization;
1696
1697        let sdl = r#"
1698            index documents {
1699                field embedding: dense_vector<1024, uint8> [indexed<ivf_rabitq>]
1700            }
1701        "#;
1702
1703        let indexes = parse_sdl(sdl).unwrap();
1704        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1705
1706        assert_eq!(config.dim, 1024);
1707        assert_eq!(config.quantization, DenseVectorQuantization::UInt8);
1708    }
1709
1710    #[test]
1711    fn test_dense_vector_u8_alias() {
1712        use crate::dsl::schema::DenseVectorQuantization;
1713
1714        let sdl = r#"
1715            index documents {
1716                field embedding: dense_vector<512, u8> [indexed]
1717            }
1718        "#;
1719
1720        let indexes = parse_sdl(sdl).unwrap();
1721        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1722
1723        assert_eq!(config.dim, 512);
1724        assert_eq!(config.quantization, DenseVectorQuantization::UInt8);
1725    }
1726
1727    #[test]
1728    fn test_dense_vector_default_f32_quantization() {
1729        use crate::dsl::schema::DenseVectorQuantization;
1730
1731        // No quantization type → default f32
1732        let sdl = r#"
1733            index documents {
1734                field embedding: dense_vector<768> [indexed]
1735            }
1736        "#;
1737
1738        let indexes = parse_sdl(sdl).unwrap();
1739        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1740
1741        assert_eq!(config.dim, 768);
1742        assert_eq!(config.quantization, DenseVectorQuantization::F32);
1743    }
1744
1745    #[test]
1746    fn test_dense_vector_keyword_with_quantization() {
1747        use crate::dsl::schema::DenseVectorQuantization;
1748
1749        let sdl = r#"
1750            index documents {
1751                field embedding: dense_vector<dims: 768, f16> [indexed]
1752            }
1753        "#;
1754
1755        let indexes = parse_sdl(sdl).unwrap();
1756        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1757
1758        assert_eq!(config.dim, 768);
1759        assert_eq!(config.quantization, DenseVectorQuantization::F16);
1760    }
1761
1762    #[test]
1763    fn test_json_field_type() {
1764        let sdl = r#"
1765            index documents {
1766                field title: text [indexed, stored]
1767                field metadata: json [stored]
1768                field extra: json
1769            }
1770        "#;
1771
1772        let indexes = parse_sdl(sdl).unwrap();
1773        let index = &indexes[0];
1774
1775        assert_eq!(index.fields.len(), 3);
1776
1777        // Check JSON field
1778        assert_eq!(index.fields[1].name, "metadata");
1779        assert!(matches!(index.fields[1].field_type, FieldType::Json));
1780        assert!(index.fields[1].stored);
1781        // JSON fields should not be indexed (enforced by add_json_field)
1782
1783        // Check default attributes for JSON field
1784        assert_eq!(index.fields[2].name, "extra");
1785        assert!(matches!(index.fields[2].field_type, FieldType::Json));
1786
1787        // Verify schema conversion
1788        let schema = index.to_schema();
1789        let metadata_field = schema.get_field("metadata").unwrap();
1790        let entry = schema.get_field_entry(metadata_field).unwrap();
1791        assert_eq!(entry.field_type, FieldType::Json);
1792        assert!(!entry.indexed); // JSON fields are never indexed
1793        assert!(entry.stored);
1794    }
1795
1796    #[test]
1797    fn test_sparse_vector_query_config() {
1798        use crate::structures::QueryWeighting;
1799
1800        let sdl = r#"
1801            index documents {
1802                field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<tokenizer: "Alibaba-NLP/gte-Qwen2-1.5B-instruct", weighting: idf>>]
1803            }
1804        "#;
1805
1806        let indexes = parse_sdl(sdl).unwrap();
1807        let index = &indexes[0];
1808
1809        assert_eq!(index.fields.len(), 1);
1810        assert_eq!(index.fields[0].name, "embedding");
1811        assert!(matches!(
1812            index.fields[0].field_type,
1813            FieldType::SparseVector
1814        ));
1815
1816        let config = index.fields[0].sparse_vector_config.as_ref().unwrap();
1817        assert_eq!(config.index_size, IndexSize::U16);
1818        assert_eq!(config.weight_quantization, WeightQuantization::UInt8);
1819
1820        // Check query config
1821        let query_config = config.query_config.as_ref().unwrap();
1822        assert_eq!(
1823            query_config.tokenizer.as_deref(),
1824            Some("Alibaba-NLP/gte-Qwen2-1.5B-instruct")
1825        );
1826        assert_eq!(query_config.weighting, QueryWeighting::Idf);
1827
1828        // Verify schema conversion preserves query config
1829        let schema = index.to_schema();
1830        let embedding_field = schema.get_field("embedding").unwrap();
1831        let entry = schema.get_field_entry(embedding_field).unwrap();
1832        let sv_config = entry.sparse_vector_config.as_ref().unwrap();
1833        let qc = sv_config.query_config.as_ref().unwrap();
1834        assert_eq!(
1835            qc.tokenizer.as_deref(),
1836            Some("Alibaba-NLP/gte-Qwen2-1.5B-instruct")
1837        );
1838        assert_eq!(qc.weighting, QueryWeighting::Idf);
1839    }
1840
1841    #[test]
1842    fn test_sparse_vector_query_config_weighting_one() {
1843        use crate::structures::QueryWeighting;
1844
1845        let sdl = r#"
1846            index documents {
1847                field embedding: sparse_vector [indexed<query<weighting: one>>]
1848            }
1849        "#;
1850
1851        let indexes = parse_sdl(sdl).unwrap();
1852        let config = indexes[0].fields[0].sparse_vector_config.as_ref().unwrap();
1853
1854        let query_config = config.query_config.as_ref().unwrap();
1855        assert!(query_config.tokenizer.is_none());
1856        assert_eq!(query_config.weighting, QueryWeighting::One);
1857    }
1858
1859    #[test]
1860    fn test_sparse_vector_query_config_weighting_idf_file() {
1861        use crate::structures::QueryWeighting;
1862
1863        let sdl = r#"
1864            index documents {
1865                field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<tokenizer: "opensearch-neural-sparse-encoding-v1", weighting: idf_file>>]
1866            }
1867        "#;
1868
1869        let indexes = parse_sdl(sdl).unwrap();
1870        let config = indexes[0].fields[0].sparse_vector_config.as_ref().unwrap();
1871
1872        let query_config = config.query_config.as_ref().unwrap();
1873        assert_eq!(
1874            query_config.tokenizer.as_deref(),
1875            Some("opensearch-neural-sparse-encoding-v1")
1876        );
1877        assert_eq!(query_config.weighting, QueryWeighting::IdfFile);
1878
1879        // Verify schema conversion preserves idf_file
1880        let schema = indexes[0].to_schema();
1881        let field = schema.get_field("embedding").unwrap();
1882        let entry = schema.get_field_entry(field).unwrap();
1883        let sc = entry.sparse_vector_config.as_ref().unwrap();
1884        let qc = sc.query_config.as_ref().unwrap();
1885        assert_eq!(qc.weighting, QueryWeighting::IdfFile);
1886    }
1887
1888    #[test]
1889    fn test_sparse_vector_query_config_pruning_params() {
1890        let sdl = r#"
1891            index documents {
1892                field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<weighting: idf, weight_threshold: 0.03, max_dims: 25, pruning: 0.2>>]
1893            }
1894        "#;
1895
1896        let indexes = parse_sdl(sdl).unwrap();
1897        let config = indexes[0].fields[0].sparse_vector_config.as_ref().unwrap();
1898
1899        let qc = config.query_config.as_ref().unwrap();
1900        assert_eq!(qc.weighting, QueryWeighting::Idf);
1901        assert!((qc.weight_threshold - 0.03).abs() < 0.001);
1902        assert_eq!(qc.max_query_dims, Some(25));
1903        assert!((qc.pruning.unwrap() - 0.2).abs() < 0.001);
1904
1905        // Verify schema roundtrip
1906        let schema = indexes[0].to_schema();
1907        let field = schema.get_field("embedding").unwrap();
1908        let entry = schema.get_field_entry(field).unwrap();
1909        let sc = entry.sparse_vector_config.as_ref().unwrap();
1910        let rqc = sc.query_config.as_ref().unwrap();
1911        assert!((rqc.weight_threshold - 0.03).abs() < 0.001);
1912        assert_eq!(rqc.max_query_dims, Some(25));
1913        assert!((rqc.pruning.unwrap() - 0.2).abs() < 0.001);
1914    }
1915
1916    #[test]
1917    fn test_sparse_vector_format_maxscore() {
1918        let sdl = r#"
1919            index documents {
1920                field embedding: sparse_vector<u16> [indexed<format: maxscore, quantization: uint8>]
1921            }
1922        "#;
1923
1924        let indexes = parse_sdl(sdl).unwrap();
1925        let config = indexes[0].fields[0].sparse_vector_config.as_ref().unwrap();
1926        assert_eq!(config.format, SparseFormat::MaxScore);
1927        assert_eq!(config.weight_quantization, WeightQuantization::UInt8);
1928
1929        // Verify schema roundtrip
1930        let schema = indexes[0].to_schema();
1931        let field = schema.get_field("embedding").unwrap();
1932        let entry = schema.get_field_entry(field).unwrap();
1933        let sc = entry.sparse_vector_config.as_ref().unwrap();
1934        assert_eq!(sc.format, SparseFormat::MaxScore);
1935    }
1936
1937    #[test]
1938    fn test_sparse_vector_format_bmp() {
1939        let sdl = r#"
1940            index documents {
1941                field embedding: sparse_vector<u16> [indexed<format: bmp, quantization: uint8>]
1942            }
1943        "#;
1944
1945        let indexes = parse_sdl(sdl).unwrap();
1946        let config = indexes[0].fields[0].sparse_vector_config.as_ref().unwrap();
1947        assert_eq!(config.format, SparseFormat::Bmp);
1948    }
1949
1950    #[test]
1951    fn test_fast_attribute() {
1952        let sdl = r#"
1953            index products {
1954                field name: text [indexed, stored]
1955                field price: f64 [indexed, fast]
1956                field category: text [indexed, stored, fast]
1957                field count: u64 [fast]
1958                field score: i64 [indexed, stored, fast]
1959            }
1960        "#;
1961
1962        let indexes = parse_sdl(sdl).unwrap();
1963        assert_eq!(indexes.len(), 1);
1964        let index = &indexes[0];
1965        assert_eq!(index.fields.len(), 5);
1966
1967        // name: no fast
1968        assert!(!index.fields[0].fast);
1969        // price: fast
1970        assert!(index.fields[1].fast);
1971        assert!(matches!(index.fields[1].field_type, FieldType::F64));
1972        // category: fast text
1973        assert!(index.fields[2].fast);
1974        assert!(matches!(index.fields[2].field_type, FieldType::Text));
1975        // count: fast only
1976        assert!(index.fields[3].fast);
1977        assert!(matches!(index.fields[3].field_type, FieldType::U64));
1978        // score: fast i64
1979        assert!(index.fields[4].fast);
1980        assert!(matches!(index.fields[4].field_type, FieldType::I64));
1981
1982        // Verify schema roundtrip preserves fast flag
1983        let schema = index.to_schema();
1984        let price_field = schema.get_field("price").unwrap();
1985        assert!(schema.get_field_entry(price_field).unwrap().fast);
1986
1987        let category_field = schema.get_field("category").unwrap();
1988        assert!(schema.get_field_entry(category_field).unwrap().fast);
1989
1990        let name_field = schema.get_field("name").unwrap();
1991        assert!(!schema.get_field_entry(name_field).unwrap().fast);
1992    }
1993
1994    #[test]
1995    fn test_primary_attribute() {
1996        let sdl = r#"
1997            index documents {
1998                field id: text [primary, stored]
1999                field title: text [indexed, stored]
2000            }
2001        "#;
2002
2003        let indexes = parse_sdl(sdl).unwrap();
2004        assert_eq!(indexes.len(), 1);
2005        let index = &indexes[0];
2006        assert_eq!(index.fields.len(), 2);
2007
2008        // id should be primary, and auto-set fast + indexed
2009        let id_field = &index.fields[0];
2010        assert!(id_field.primary, "id should be primary");
2011        assert!(id_field.fast, "primary implies fast");
2012        assert!(id_field.indexed, "primary implies indexed");
2013
2014        // title should NOT be primary
2015        assert!(!index.fields[1].primary);
2016
2017        // Verify schema conversion preserves primary_key
2018        let schema = index.to_schema();
2019        let id = schema.get_field("id").unwrap();
2020        let id_entry = schema.get_field_entry(id).unwrap();
2021        assert!(id_entry.primary_key);
2022        assert!(id_entry.fast);
2023        assert!(id_entry.indexed);
2024
2025        let title = schema.get_field("title").unwrap();
2026        assert!(!schema.get_field_entry(title).unwrap().primary_key);
2027
2028        // primary_field() should return the primary field
2029        assert_eq!(schema.primary_field(), Some(id));
2030    }
2031
2032    #[test]
2033    fn test_primary_with_other_attributes() {
2034        let sdl = r#"
2035            index documents {
2036                field id: text<simple> [primary, indexed, stored]
2037                field body: text [indexed]
2038            }
2039        "#;
2040
2041        let indexes = parse_sdl(sdl).unwrap();
2042        let id_field = &indexes[0].fields[0];
2043        assert!(id_field.primary);
2044        assert!(id_field.indexed);
2045        assert!(id_field.stored);
2046        assert!(id_field.fast);
2047        assert_eq!(id_field.tokenizer, Some("simple".to_string()));
2048    }
2049
2050    #[test]
2051    fn test_primary_only_one_allowed() {
2052        let sdl = r#"
2053            index documents {
2054                field id: text [primary]
2055                field alt_id: text [primary]
2056            }
2057        "#;
2058
2059        let result = parse_sdl(sdl);
2060        assert!(result.is_err());
2061        let err = result.unwrap_err().to_string();
2062        assert!(
2063            err.contains("primary key"),
2064            "Error should mention primary key: {}",
2065            err
2066        );
2067    }
2068
2069    #[test]
2070    fn test_primary_must_be_text() {
2071        let sdl = r#"
2072            index documents {
2073                field id: u64 [primary]
2074            }
2075        "#;
2076
2077        let result = parse_sdl(sdl);
2078        assert!(result.is_err());
2079        let err = result.unwrap_err().to_string();
2080        assert!(
2081            err.contains("text"),
2082            "Error should mention text type: {}",
2083            err
2084        );
2085    }
2086
2087    #[test]
2088    fn test_primary_cannot_be_multi() {
2089        let sdl = r#"
2090            index documents {
2091                field id: text [primary, stored<multi>]
2092            }
2093        "#;
2094
2095        let result = parse_sdl(sdl);
2096        assert!(result.is_err());
2097        let err = result.unwrap_err().to_string();
2098        assert!(err.contains("multi"), "Error should mention multi: {}", err);
2099    }
2100
2101    #[test]
2102    fn test_no_primary_field() {
2103        // Schema without primary field should work fine
2104        let sdl = r#"
2105            index documents {
2106                field title: text [indexed, stored]
2107            }
2108        "#;
2109
2110        let indexes = parse_sdl(sdl).unwrap();
2111        let schema = indexes[0].to_schema();
2112        assert!(schema.primary_field().is_none());
2113    }
2114
2115    #[test]
2116    fn test_simhash_attribute_sparse_vector() {
2117        let sdl = r#"
2118            index documents {
2119                field embedding: sparse_vector<u32> [indexed<format: bmp, dims: 105879>, simhash]
2120            }
2121        "#;
2122
2123        let indexes = parse_sdl(sdl).unwrap();
2124        let index = &indexes[0];
2125        assert_eq!(index.fields.len(), 1);
2126
2127        let sh_field = &index.fields[0];
2128        assert_eq!(sh_field.name, "embedding");
2129        assert!(sh_field.simhash);
2130        assert!(!sh_field.fast, "simhash no longer implies fast");
2131        assert!(matches!(sh_field.field_type, FieldType::SparseVector));
2132
2133        // Verify schema conversion preserves simhash
2134        let schema = index.to_schema();
2135        let field = schema.get_field("embedding").unwrap();
2136        let entry = schema.get_field_entry(field).unwrap();
2137        assert!(entry.simhash);
2138    }
2139
2140    #[test]
2141    fn test_simhash_must_be_sparse_vector() {
2142        // u64 should fail now
2143        let sdl = r#"
2144            index documents {
2145                field simhash: u64 [simhash]
2146            }
2147        "#;
2148
2149        let result = parse_sdl(sdl);
2150        assert!(result.is_err());
2151        let err = result.unwrap_err().to_string();
2152        assert!(
2153            err.contains("sparse_vector"),
2154            "Error should mention sparse_vector: {}",
2155            err
2156        );
2157
2158        // text should fail too
2159        let sdl2 = r#"
2160            index documents {
2161                field simhash: text [simhash]
2162            }
2163        "#;
2164        let result2 = parse_sdl(sdl2);
2165        assert!(result2.is_err());
2166    }
2167
2168    #[test]
2169    fn test_simhash_multiple_fields() {
2170        let sdl = r#"
2171            index documents {
2172                field embed1: sparse_vector<u32> [indexed<format: bmp>, simhash]
2173                field embed2: sparse_vector<u32> [indexed<format: bmp>, simhash]
2174            }
2175        "#;
2176
2177        let indexes = parse_sdl(sdl).unwrap();
2178        let index = &indexes[0];
2179        assert!(index.fields[0].simhash);
2180        assert!(index.fields[1].simhash);
2181    }
2182
2183    #[test]
2184    fn test_no_simhash_field() {
2185        let sdl = r#"
2186            index documents {
2187                field title: text [indexed, stored]
2188            }
2189        "#;
2190
2191        let indexes = parse_sdl(sdl).unwrap();
2192        let schema = indexes[0].to_schema();
2193        // No field should have simhash set
2194        assert!(schema.fields().all(|(_, entry)| !entry.simhash));
2195    }
2196}