hermes_core/dsl/sdl/
mod.rs

1//! Schema Definition Language (SDL) for Hermes
2//!
3//! A simple, readable format for defining index schemas using pest parser.
4//!
5//! # Example SDL
6//!
7//! ```text
8//! # Article index schema
9//! index articles {
10//!     # Primary text field for full-text search
11//!     field title: text [indexed, stored]
12//!
13//!     # Body content - indexed but not stored (save space)
14//!     field body: text [indexed]
15//!
16//!     # Author name
17//!     field author: text [indexed, stored]
18//!
19//!     # Publication timestamp
20//!     field published_at: i64 [indexed, stored]
21//!
22//!     # View count
23//!     field views: u64 [indexed, stored]
24//!
25//!     # Rating score
26//!     field rating: f64 [indexed, stored]
27//!
28//!     # Raw content hash (not indexed, just stored)
29//!     field content_hash: bytes [stored]
30//!
31//!     # Dense vector with IVF-RaBitQ index
32//!     field embedding: dense_vector<768> [indexed<rabitq, centroids: "centroids.bin", nprobe: 32>]
33//!
34//!     # Dense vector with ScaNN index and MRL dimension
35//!     field embedding2: dense_vector<1536> [indexed<scann, centroids: "c.bin", codebook: "pq.bin", mrl_dim: 256>]
36//! }
37//! ```
38//!
39//! # Dense Vector Index Configuration
40//!
41//! Index-related parameters for dense vectors are specified in `indexed<...>`:
42//! - `rabitq` or `scann` - index type
43//! - `centroids: "path"` - path to pre-trained centroids file
44//! - `codebook: "path"` - path to PQ codebook (ScaNN only)
45//! - `nprobe: N` - number of clusters to probe (default: 32)
46//! - `mrl_dim: N` - Matryoshka dimension for index (uses truncated vectors)
47
48use pest::Parser;
49use pest_derive::Parser;
50
51use super::query_field_router::{QueryRouterRule, RoutingMode};
52use super::schema::{FieldType, Schema, SchemaBuilder};
53use crate::Result;
54use crate::error::Error;
55
/// Parser type for the SDL grammar.
///
/// The implementation is generated by `pest_derive` from the grammar file
/// `dsl/sdl/sdl.pest`, which also produces the `Rule` enum matched on
/// throughout this module.
#[derive(Parser)]
#[grammar = "dsl/sdl/sdl.pest"]
pub struct SdlParser;
59
60use super::schema::DenseVectorConfig;
61use crate::structures::{IndexSize, SparseVectorConfig, WeightQuantization};
62
/// A single `field` declaration parsed from an SDL index definition.
///
/// Produced by the parser in this module and consumed by
/// [`IndexDef::to_schema`], which registers one schema field per `FieldDef`.
#[derive(Debug, Clone)]
pub struct FieldDef {
    /// Field name as written in the SDL.
    pub name: String,
    /// Resolved field type (type aliases such as `string` or `int` are already mapped).
    pub field_type: FieldType,
    /// Whether the field is indexed. `true` when no attribute list is given;
    /// otherwise `true` only if `indexed` or `indexed<...>` is listed.
    pub indexed: bool,
    /// Whether the field is stored. `true` when no attribute list is given;
    /// otherwise `true` only if `stored` is listed.
    pub stored: bool,
    /// Tokenizer name for text fields (e.g., "default", "en_stem", "german").
    /// `None` when the SDL gives no `<tokenizer>`; `to_schema` then uses "default".
    pub tokenizer: Option<String>,
    /// Whether this field can have multiple values (serialized as array in JSON)
    pub multi: bool,
    /// Configuration for sparse vector fields (`None` for other field types,
    /// and for sparse vectors declared without any configuration).
    pub sparse_vector_config: Option<SparseVectorConfig>,
    /// Configuration for dense vector fields. `to_schema` requires this to be
    /// `Some` for every `DenseVector` field.
    pub dense_vector_config: Option<DenseVectorConfig>,
}
79
/// A single `index <name> { ... }` block parsed from SDL.
#[derive(Debug, Clone)]
pub struct IndexDef {
    /// Index name as written after the `index` keyword.
    pub name: String,
    /// Field declarations, in the order they appear in the SDL.
    pub fields: Vec<FieldDef>,
    /// Names listed in the index's default-fields declaration; empty when
    /// there is none. If the declaration appears more than once, the last wins.
    pub default_fields: Vec<String>,
    /// Query router rules for routing queries to specific fields
    pub query_routers: Vec<QueryRouterRule>,
}
89
90impl IndexDef {
91    /// Convert to a Schema
92    pub fn to_schema(&self) -> Schema {
93        let mut builder = SchemaBuilder::default();
94
95        for field in &self.fields {
96            let f = match field.field_type {
97                FieldType::Text => {
98                    let tokenizer = field.tokenizer.as_deref().unwrap_or("default");
99                    builder.add_text_field_with_tokenizer(
100                        &field.name,
101                        field.indexed,
102                        field.stored,
103                        tokenizer,
104                    )
105                }
106                FieldType::U64 => builder.add_u64_field(&field.name, field.indexed, field.stored),
107                FieldType::I64 => builder.add_i64_field(&field.name, field.indexed, field.stored),
108                FieldType::F64 => builder.add_f64_field(&field.name, field.indexed, field.stored),
109                FieldType::Bytes => builder.add_bytes_field(&field.name, field.stored),
110                FieldType::Json => builder.add_json_field(&field.name, field.stored),
111                FieldType::SparseVector => {
112                    if let Some(config) = &field.sparse_vector_config {
113                        builder.add_sparse_vector_field_with_config(
114                            &field.name,
115                            field.indexed,
116                            field.stored,
117                            *config,
118                        )
119                    } else {
120                        builder.add_sparse_vector_field(&field.name, field.indexed, field.stored)
121                    }
122                }
123                FieldType::DenseVector => {
124                    // Dense vector dimension must be specified via config
125                    let config = field
126                        .dense_vector_config
127                        .as_ref()
128                        .expect("DenseVector field requires dimension to be specified");
129                    builder.add_dense_vector_field_with_config(
130                        &field.name,
131                        field.indexed,
132                        field.stored,
133                        config.clone(),
134                    )
135                }
136            };
137            if field.multi {
138                builder.set_multi(f, true);
139            }
140        }
141
142        // Set default fields if specified
143        if !self.default_fields.is_empty() {
144            builder.set_default_fields(self.default_fields.clone());
145        }
146
147        // Set query routers if specified
148        if !self.query_routers.is_empty() {
149            builder.set_query_routers(self.query_routers.clone());
150        }
151
152        builder.build()
153    }
154
155    /// Create a QueryFieldRouter from the query router rules
156    ///
157    /// Returns None if there are no query router rules defined.
158    /// Returns Err if any regex pattern is invalid.
159    pub fn to_query_router(&self) -> Result<Option<super::query_field_router::QueryFieldRouter>> {
160        if self.query_routers.is_empty() {
161            return Ok(None);
162        }
163
164        super::query_field_router::QueryFieldRouter::from_rules(&self.query_routers)
165            .map(Some)
166            .map_err(Error::Schema)
167    }
168}
169
170/// Parse field type from string
171fn parse_field_type(type_str: &str) -> Result<FieldType> {
172    match type_str {
173        "text" | "string" | "str" => Ok(FieldType::Text),
174        "u64" | "uint" | "unsigned" => Ok(FieldType::U64),
175        "i64" | "int" | "integer" => Ok(FieldType::I64),
176        "f64" | "float" | "double" => Ok(FieldType::F64),
177        "bytes" | "binary" | "blob" => Ok(FieldType::Bytes),
178        "json" => Ok(FieldType::Json),
179        "sparse_vector" => Ok(FieldType::SparseVector),
180        "dense_vector" | "vector" => Ok(FieldType::DenseVector),
181        _ => Err(Error::Schema(format!("Unknown field type: {}", type_str))),
182    }
183}
184
/// Index configuration parsed from an `indexed<...>` attribute.
///
/// Every member is optional: a `None` means the parameter did not appear in
/// the attribute. The values are later merged into a `DenseVectorConfig` or a
/// `SparseVectorConfig`, depending on the field they belong to.
#[derive(Debug, Clone, Default)]
struct IndexConfig {
    /// Vector index type selected by a bare `scann` / `rabitq` or by `index: <type>`.
    index_type: Option<super::schema::VectorIndexType>,
    /// Value of `centroids: "path"`; becomes the dense config's coarse centroids path.
    centroids_path: Option<String>,
    /// Value of `codebook: "path"`; only applied when the index type is ScaNN.
    codebook_path: Option<String>,
    /// Value of `nprobe: N`; 32 is used when merging if this is `None`.
    nprobe: Option<usize>,
    /// Value of `mrl_dim: N`; overrides the dense config's `mrl_dim` when present.
    mrl_dim: Option<usize>,
    // Sparse vector index params
    /// Value of `quantization: <kind>` (weight quantization for sparse vectors).
    quantization: Option<WeightQuantization>,
    /// Value of `weight_threshold: X` for sparse vectors.
    weight_threshold: Option<f32>,
}
197
198/// Parse attributes from pest pair
199/// Returns (indexed, stored, multi, index_config)
200fn parse_attributes(pair: pest::iterators::Pair<Rule>) -> (bool, bool, bool, Option<IndexConfig>) {
201    let mut indexed = false;
202    let mut stored = false;
203    let mut multi = false;
204    let mut index_config = None;
205
206    for attr in pair.into_inner() {
207        if attr.as_rule() == Rule::attribute {
208            // attribute = { indexed_with_config | "indexed" | "stored" | "multi" }
209            // Check if it contains indexed_with_config
210            let mut found_indexed_with_config = false;
211            for inner in attr.clone().into_inner() {
212                if inner.as_rule() == Rule::indexed_with_config {
213                    indexed = true;
214                    index_config = Some(parse_index_config(inner));
215                    found_indexed_with_config = true;
216                    break;
217                }
218            }
219            if !found_indexed_with_config {
220                // Simple attribute
221                match attr.as_str() {
222                    "indexed" => indexed = true,
223                    "stored" => stored = true,
224                    "multi" => multi = true,
225                    _ => {}
226                }
227            }
228        }
229    }
230
231    (indexed, stored, multi, index_config)
232}
233
234/// Parse index configuration from indexed<...> attribute
235fn parse_index_config(pair: pest::iterators::Pair<Rule>) -> IndexConfig {
236    let mut config = IndexConfig::default();
237
238    // indexed_with_config = { "indexed" ~ "<" ~ index_config_params ~ ">" }
239    // index_config_params = { index_config_param ~ ("," ~ index_config_param)* }
240    // index_config_param = { index_type_kwarg | centroids_kwarg | codebook_kwarg | nprobe_kwarg | index_type_spec }
241
242    for inner in pair.into_inner() {
243        if inner.as_rule() == Rule::index_config_params {
244            for param in inner.into_inner() {
245                if param.as_rule() == Rule::index_config_param {
246                    for p in param.into_inner() {
247                        parse_single_index_config_param(&mut config, p);
248                    }
249                }
250            }
251        }
252    }
253
254    config
255}
256
257/// Parse a single index config parameter
258fn parse_single_index_config_param(config: &mut IndexConfig, p: pest::iterators::Pair<Rule>) {
259    use super::schema::VectorIndexType;
260
261    match p.as_rule() {
262        Rule::index_type_spec => {
263            config.index_type = Some(match p.as_str() {
264                "scann" => VectorIndexType::ScaNN,
265                "rabitq" => VectorIndexType::IvfRaBitQ,
266                _ => VectorIndexType::IvfRaBitQ,
267            });
268        }
269        Rule::index_type_kwarg => {
270            // index_type_kwarg = { "index" ~ ":" ~ index_type_spec }
271            if let Some(t) = p.into_inner().next() {
272                config.index_type = Some(match t.as_str() {
273                    "scann" => VectorIndexType::ScaNN,
274                    "rabitq" => VectorIndexType::IvfRaBitQ,
275                    _ => VectorIndexType::IvfRaBitQ,
276                });
277            }
278        }
279        Rule::centroids_kwarg => {
280            // centroids_kwarg = { "centroids" ~ ":" ~ centroids_path }
281            // centroids_path = { "\"" ~ path_chars ~ "\"" }
282            if let Some(path) = p.into_inner().next()
283                && let Some(inner_path) = path.into_inner().next()
284            {
285                config.centroids_path = Some(inner_path.as_str().to_string());
286            }
287        }
288        Rule::codebook_kwarg => {
289            // codebook_kwarg = { "codebook" ~ ":" ~ codebook_path }
290            if let Some(path) = p.into_inner().next()
291                && let Some(inner_path) = path.into_inner().next()
292            {
293                config.codebook_path = Some(inner_path.as_str().to_string());
294            }
295        }
296        Rule::nprobe_kwarg => {
297            // nprobe_kwarg = { "nprobe" ~ ":" ~ nprobe_spec }
298            if let Some(n) = p.into_inner().next() {
299                config.nprobe = Some(n.as_str().parse().unwrap_or(32));
300            }
301        }
302        Rule::mrl_dim_kwarg => {
303            // mrl_dim_kwarg = { "mrl_dim" ~ ":" ~ mrl_dim_spec }
304            if let Some(n) = p.into_inner().next() {
305                config.mrl_dim = Some(n.as_str().parse().unwrap_or(0));
306            }
307        }
308        Rule::quantization_kwarg => {
309            // quantization_kwarg = { "quantization" ~ ":" ~ quantization_spec }
310            if let Some(q) = p.into_inner().next() {
311                config.quantization = Some(match q.as_str() {
312                    "float32" | "f32" => WeightQuantization::Float32,
313                    "float16" | "f16" => WeightQuantization::Float16,
314                    "uint8" | "u8" => WeightQuantization::UInt8,
315                    "uint4" | "u4" => WeightQuantization::UInt4,
316                    _ => WeightQuantization::default(),
317                });
318            }
319        }
320        Rule::weight_threshold_kwarg => {
321            // weight_threshold_kwarg = { "weight_threshold" ~ ":" ~ weight_threshold_spec }
322            if let Some(t) = p.into_inner().next() {
323                config.weight_threshold = Some(t.as_str().parse().unwrap_or(0.0));
324            }
325        }
326        _ => {}
327    }
328}
329
330/// Parse a field definition from pest pair
331fn parse_field_def(pair: pest::iterators::Pair<Rule>) -> Result<FieldDef> {
332    let mut inner = pair.into_inner();
333
334    let name = inner
335        .next()
336        .ok_or_else(|| Error::Schema("Missing field name".to_string()))?
337        .as_str()
338        .to_string();
339
340    let field_type_str = inner
341        .next()
342        .ok_or_else(|| Error::Schema("Missing field type".to_string()))?
343        .as_str();
344
345    let field_type = parse_field_type(field_type_str)?;
346
347    // Parse optional tokenizer spec, sparse_vector_config, dense_vector_config, and attributes
348    let mut tokenizer = None;
349    let mut sparse_vector_config = None;
350    let mut dense_vector_config = None;
351    let mut indexed = true;
352    let mut stored = true;
353    let mut multi = false;
354    let mut index_config: Option<IndexConfig> = None;
355
356    for item in inner {
357        match item.as_rule() {
358            Rule::tokenizer_spec => {
359                // Extract tokenizer name from <name>
360                if let Some(tok_name) = item.into_inner().next() {
361                    tokenizer = Some(tok_name.as_str().to_string());
362                }
363            }
364            Rule::sparse_vector_config => {
365                // Parse named parameters: <index_size: u16, quantization: uint8, weight_threshold: 0.1>
366                sparse_vector_config = Some(parse_sparse_vector_config(item));
367            }
368            Rule::dense_vector_config => {
369                // Parse dense_vector_params (keyword or positional) - only dims and mrl_dim
370                dense_vector_config = Some(parse_dense_vector_config(item));
371            }
372            Rule::attributes => {
373                let (idx, sto, mul, idx_cfg) = parse_attributes(item);
374                indexed = idx;
375                stored = sto;
376                multi = mul;
377                index_config = idx_cfg;
378            }
379            _ => {}
380        }
381    }
382
383    // Merge index config into vector configs if both exist
384    if let Some(idx_cfg) = index_config {
385        if let Some(ref mut dv_config) = dense_vector_config {
386            apply_index_config_to_dense_vector(dv_config, idx_cfg);
387        } else if field_type == FieldType::SparseVector {
388            // For sparse vectors, create default config if not present and apply index params
389            let sv_config = sparse_vector_config.get_or_insert(SparseVectorConfig::default());
390            apply_index_config_to_sparse_vector(sv_config, idx_cfg);
391        }
392    }
393
394    Ok(FieldDef {
395        name,
396        field_type,
397        indexed,
398        stored,
399        tokenizer,
400        multi,
401        sparse_vector_config,
402        dense_vector_config,
403    })
404}
405
406/// Apply index configuration from indexed<...> to DenseVectorConfig
407fn apply_index_config_to_dense_vector(config: &mut DenseVectorConfig, idx_cfg: IndexConfig) {
408    use super::schema::VectorIndexType;
409
410    let nprobe = idx_cfg.nprobe.unwrap_or(32);
411
412    match idx_cfg.index_type {
413        Some(VectorIndexType::ScaNN) => {
414            config.index_type = VectorIndexType::ScaNN;
415            config.coarse_centroids_path = idx_cfg.centroids_path;
416            config.pq_codebook_path = idx_cfg.codebook_path;
417            config.nprobe = nprobe;
418        }
419        Some(VectorIndexType::IvfRaBitQ) => {
420            config.index_type = VectorIndexType::IvfRaBitQ;
421            config.coarse_centroids_path = idx_cfg.centroids_path;
422            config.nprobe = nprobe;
423        }
424        Some(VectorIndexType::RaBitQ) | None => {
425            // If centroids provided, use IVF-RaBitQ, otherwise plain RaBitQ
426            if idx_cfg.centroids_path.is_some() {
427                config.index_type = VectorIndexType::IvfRaBitQ;
428                config.coarse_centroids_path = idx_cfg.centroids_path;
429                config.nprobe = nprobe;
430            }
431            // else keep default RaBitQ
432        }
433    }
434
435    // Apply mrl_dim if specified
436    if idx_cfg.mrl_dim.is_some() {
437        config.mrl_dim = idx_cfg.mrl_dim;
438    }
439}
440
441/// Parse sparse_vector_config - only index_size (positional)
442/// Example: <u16> or <u32>
443fn parse_sparse_vector_config(pair: pest::iterators::Pair<Rule>) -> SparseVectorConfig {
444    let mut index_size = IndexSize::default();
445
446    // Parse positional index_size_spec
447    for inner in pair.into_inner() {
448        if inner.as_rule() == Rule::index_size_spec {
449            index_size = match inner.as_str() {
450                "u16" => IndexSize::U16,
451                "u32" => IndexSize::U32,
452                _ => IndexSize::default(),
453            };
454        }
455    }
456
457    SparseVectorConfig {
458        index_size,
459        weight_quantization: WeightQuantization::default(),
460        weight_threshold: 0.0,
461    }
462}
463
464/// Apply index configuration from indexed<...> to SparseVectorConfig
465fn apply_index_config_to_sparse_vector(config: &mut SparseVectorConfig, idx_cfg: IndexConfig) {
466    if let Some(q) = idx_cfg.quantization {
467        config.weight_quantization = q;
468    }
469    if let Some(t) = idx_cfg.weight_threshold {
470        config.weight_threshold = t;
471    }
472}
473
474/// Parse dense_vector_config - only dims
475/// All index-related params (including mrl_dim) are now in indexed<...> attribute
476fn parse_dense_vector_config(pair: pest::iterators::Pair<Rule>) -> DenseVectorConfig {
477    let mut dim: usize = 0;
478
479    // Navigate to dense_vector_params
480    for params in pair.into_inner() {
481        if params.as_rule() == Rule::dense_vector_params {
482            for inner in params.into_inner() {
483                match inner.as_rule() {
484                    Rule::dense_vector_keyword_params => {
485                        // Parse keyword args: dims: N
486                        for kwarg in inner.into_inner() {
487                            if kwarg.as_rule() == Rule::dims_kwarg
488                                && let Some(d) = kwarg.into_inner().next()
489                            {
490                                dim = d.as_str().parse().unwrap_or(0);
491                            }
492                        }
493                    }
494                    Rule::dense_vector_positional_params => {
495                        // Parse positional: just dimension
496                        if let Some(dim_pair) = inner.into_inner().next() {
497                            dim = dim_pair.as_str().parse().unwrap_or(0);
498                        }
499                    }
500                    _ => {}
501                }
502            }
503        }
504    }
505
506    DenseVectorConfig::new(dim)
507}
508
509/// Parse default_fields definition
510fn parse_default_fields_def(pair: pest::iterators::Pair<Rule>) -> Vec<String> {
511    pair.into_inner().map(|p| p.as_str().to_string()).collect()
512}
513
514/// Parse a query router definition
515fn parse_query_router_def(pair: pest::iterators::Pair<Rule>) -> Result<QueryRouterRule> {
516    let mut pattern = String::new();
517    let mut substitution = String::new();
518    let mut target_field = String::new();
519    let mut mode = RoutingMode::Additional;
520
521    for prop in pair.into_inner() {
522        if prop.as_rule() != Rule::query_router_prop {
523            continue;
524        }
525
526        for inner in prop.into_inner() {
527            match inner.as_rule() {
528                Rule::query_router_pattern => {
529                    if let Some(regex_str) = inner.into_inner().next() {
530                        pattern = parse_string_value(regex_str);
531                    }
532                }
533                Rule::query_router_substitution => {
534                    if let Some(quoted) = inner.into_inner().next() {
535                        substitution = parse_string_value(quoted);
536                    }
537                }
538                Rule::query_router_target => {
539                    if let Some(ident) = inner.into_inner().next() {
540                        target_field = ident.as_str().to_string();
541                    }
542                }
543                Rule::query_router_mode => {
544                    if let Some(mode_val) = inner.into_inner().next() {
545                        mode = match mode_val.as_str() {
546                            "exclusive" => RoutingMode::Exclusive,
547                            "additional" => RoutingMode::Additional,
548                            _ => RoutingMode::Additional,
549                        };
550                    }
551                }
552                _ => {}
553            }
554        }
555    }
556
557    if pattern.is_empty() {
558        return Err(Error::Schema("query_router missing 'pattern'".to_string()));
559    }
560    if substitution.is_empty() {
561        return Err(Error::Schema(
562            "query_router missing 'substitution'".to_string(),
563        ));
564    }
565    if target_field.is_empty() {
566        return Err(Error::Schema(
567            "query_router missing 'target_field'".to_string(),
568        ));
569    }
570
571    Ok(QueryRouterRule {
572        pattern,
573        substitution,
574        target_field,
575        mode,
576    })
577}
578
579/// Parse a string value from quoted_string, raw_string, or regex_string
580fn parse_string_value(pair: pest::iterators::Pair<Rule>) -> String {
581    let s = pair.as_str();
582    match pair.as_rule() {
583        Rule::regex_string => {
584            // regex_string contains either raw_string or quoted_string
585            if let Some(inner) = pair.into_inner().next() {
586                parse_string_value(inner)
587            } else {
588                s.to_string()
589            }
590        }
591        Rule::raw_string => {
592            // r"..." - strip r" prefix and " suffix
593            s[2..s.len() - 1].to_string()
594        }
595        Rule::quoted_string => {
596            // "..." - strip quotes and handle escapes
597            let inner = &s[1..s.len() - 1];
598            // Simple escape handling
599            inner
600                .replace("\\n", "\n")
601                .replace("\\t", "\t")
602                .replace("\\\"", "\"")
603                .replace("\\\\", "\\")
604        }
605        _ => s.to_string(),
606    }
607}
608
609/// Parse an index definition from pest pair
610fn parse_index_def(pair: pest::iterators::Pair<Rule>) -> Result<IndexDef> {
611    let mut inner = pair.into_inner();
612
613    let name = inner
614        .next()
615        .ok_or_else(|| Error::Schema("Missing index name".to_string()))?
616        .as_str()
617        .to_string();
618
619    let mut fields = Vec::new();
620    let mut default_fields = Vec::new();
621    let mut query_routers = Vec::new();
622
623    for item in inner {
624        match item.as_rule() {
625            Rule::field_def => {
626                fields.push(parse_field_def(item)?);
627            }
628            Rule::default_fields_def => {
629                default_fields = parse_default_fields_def(item);
630            }
631            Rule::query_router_def => {
632                query_routers.push(parse_query_router_def(item)?);
633            }
634            _ => {}
635        }
636    }
637
638    Ok(IndexDef {
639        name,
640        fields,
641        default_fields,
642        query_routers,
643    })
644}
645
646/// Parse SDL from a string
647pub fn parse_sdl(input: &str) -> Result<Vec<IndexDef>> {
648    let pairs = SdlParser::parse(Rule::file, input)
649        .map_err(|e| Error::Schema(format!("Parse error: {}", e)))?;
650
651    let mut indexes = Vec::new();
652
653    for pair in pairs {
654        if pair.as_rule() == Rule::file {
655            for inner in pair.into_inner() {
656                if inner.as_rule() == Rule::index_def {
657                    indexes.push(parse_index_def(inner)?);
658                }
659            }
660        }
661    }
662
663    Ok(indexes)
664}
665
666/// Parse SDL and return a single index definition
667pub fn parse_single_index(input: &str) -> Result<IndexDef> {
668    let indexes = parse_sdl(input)?;
669
670    if indexes.is_empty() {
671        return Err(Error::Schema("No index definition found".to_string()));
672    }
673
674    if indexes.len() > 1 {
675        return Err(Error::Schema(
676            "Multiple index definitions found, expected one".to_string(),
677        ));
678    }
679
680    Ok(indexes.into_iter().next().unwrap())
681}
682
683#[cfg(test)]
684mod tests {
685    use super::*;
686
687    #[test]
688    fn test_parse_simple_schema() {
689        let sdl = r#"
690            index articles {
691                field title: text [indexed, stored]
692                field body: text [indexed]
693            }
694        "#;
695
696        let indexes = parse_sdl(sdl).unwrap();
697        assert_eq!(indexes.len(), 1);
698
699        let index = &indexes[0];
700        assert_eq!(index.name, "articles");
701        assert_eq!(index.fields.len(), 2);
702
703        assert_eq!(index.fields[0].name, "title");
704        assert!(matches!(index.fields[0].field_type, FieldType::Text));
705        assert!(index.fields[0].indexed);
706        assert!(index.fields[0].stored);
707
708        assert_eq!(index.fields[1].name, "body");
709        assert!(matches!(index.fields[1].field_type, FieldType::Text));
710        assert!(index.fields[1].indexed);
711        assert!(!index.fields[1].stored);
712    }
713
714    #[test]
715    fn test_parse_all_field_types() {
716        let sdl = r#"
717            index test {
718                field text_field: text [indexed, stored]
719                field u64_field: u64 [indexed, stored]
720                field i64_field: i64 [indexed, stored]
721                field f64_field: f64 [indexed, stored]
722                field bytes_field: bytes [stored]
723            }
724        "#;
725
726        let indexes = parse_sdl(sdl).unwrap();
727        let index = &indexes[0];
728
729        assert!(matches!(index.fields[0].field_type, FieldType::Text));
730        assert!(matches!(index.fields[1].field_type, FieldType::U64));
731        assert!(matches!(index.fields[2].field_type, FieldType::I64));
732        assert!(matches!(index.fields[3].field_type, FieldType::F64));
733        assert!(matches!(index.fields[4].field_type, FieldType::Bytes));
734    }
735
736    #[test]
737    fn test_parse_with_comments() {
738        let sdl = r#"
739            # This is a comment
740            index articles {
741                # Title field
742                field title: text [indexed, stored]
743                field body: text [indexed] # inline comment not supported yet
744            }
745        "#;
746
747        let indexes = parse_sdl(sdl).unwrap();
748        assert_eq!(indexes[0].fields.len(), 2);
749    }
750
751    #[test]
752    fn test_parse_type_aliases() {
753        let sdl = r#"
754            index test {
755                field a: string [indexed]
756                field b: int [indexed]
757                field c: uint [indexed]
758                field d: float [indexed]
759                field e: binary [stored]
760            }
761        "#;
762
763        let indexes = parse_sdl(sdl).unwrap();
764        let index = &indexes[0];
765
766        assert!(matches!(index.fields[0].field_type, FieldType::Text));
767        assert!(matches!(index.fields[1].field_type, FieldType::I64));
768        assert!(matches!(index.fields[2].field_type, FieldType::U64));
769        assert!(matches!(index.fields[3].field_type, FieldType::F64));
770        assert!(matches!(index.fields[4].field_type, FieldType::Bytes));
771    }
772
773    #[test]
774    fn test_to_schema() {
775        let sdl = r#"
776            index articles {
777                field title: text [indexed, stored]
778                field views: u64 [indexed, stored]
779            }
780        "#;
781
782        let indexes = parse_sdl(sdl).unwrap();
783        let schema = indexes[0].to_schema();
784
785        assert!(schema.get_field("title").is_some());
786        assert!(schema.get_field("views").is_some());
787        assert!(schema.get_field("nonexistent").is_none());
788    }
789
790    #[test]
791    fn test_default_attributes() {
792        let sdl = r#"
793            index test {
794                field title: text
795            }
796        "#;
797
798        let indexes = parse_sdl(sdl).unwrap();
799        let field = &indexes[0].fields[0];
800
801        // Default should be indexed and stored
802        assert!(field.indexed);
803        assert!(field.stored);
804    }
805
806    #[test]
807    fn test_multiple_indexes() {
808        let sdl = r#"
809            index articles {
810                field title: text [indexed, stored]
811            }
812
813            index users {
814                field name: text [indexed, stored]
815                field email: text [indexed, stored]
816            }
817        "#;
818
819        let indexes = parse_sdl(sdl).unwrap();
820        assert_eq!(indexes.len(), 2);
821        assert_eq!(indexes[0].name, "articles");
822        assert_eq!(indexes[1].name, "users");
823    }
824
825    #[test]
826    fn test_tokenizer_spec() {
827        let sdl = r#"
828            index articles {
829                field title: text<en_stem> [indexed, stored]
830                field body: text<default> [indexed]
831                field author: text [indexed, stored]
832            }
833        "#;
834
835        let indexes = parse_sdl(sdl).unwrap();
836        let index = &indexes[0];
837
838        assert_eq!(index.fields[0].name, "title");
839        assert_eq!(index.fields[0].tokenizer, Some("en_stem".to_string()));
840
841        assert_eq!(index.fields[1].name, "body");
842        assert_eq!(index.fields[1].tokenizer, Some("default".to_string()));
843
844        assert_eq!(index.fields[2].name, "author");
845        assert_eq!(index.fields[2].tokenizer, None); // No tokenizer specified
846    }
847
848    #[test]
849    fn test_tokenizer_in_schema() {
850        let sdl = r#"
851            index articles {
852                field title: text<german> [indexed, stored]
853                field body: text<en_stem> [indexed]
854            }
855        "#;
856
857        let indexes = parse_sdl(sdl).unwrap();
858        let schema = indexes[0].to_schema();
859
860        let title_field = schema.get_field("title").unwrap();
861        let title_entry = schema.get_field_entry(title_field).unwrap();
862        assert_eq!(title_entry.tokenizer, Some("german".to_string()));
863
864        let body_field = schema.get_field("body").unwrap();
865        let body_entry = schema.get_field_entry(body_field).unwrap();
866        assert_eq!(body_entry.tokenizer, Some("en_stem".to_string()));
867    }
868
869    #[test]
870    fn test_query_router_basic() {
871        let sdl = r#"
872            index documents {
873                field title: text [indexed, stored]
874                field uri: text [indexed, stored]
875
876                query_router {
877                    pattern: "10\\.\\d{4,}/[^\\s]+"
878                    substitution: "doi://{0}"
879                    target_field: uris
880                    mode: exclusive
881                }
882            }
883        "#;
884
885        let indexes = parse_sdl(sdl).unwrap();
886        let index = &indexes[0];
887
888        assert_eq!(index.query_routers.len(), 1);
889        let router = &index.query_routers[0];
890        assert_eq!(router.pattern, r"10\.\d{4,}/[^\s]+");
891        assert_eq!(router.substitution, "doi://{0}");
892        assert_eq!(router.target_field, "uris");
893        assert_eq!(router.mode, RoutingMode::Exclusive);
894    }
895
896    #[test]
897    fn test_query_router_raw_string() {
898        let sdl = r#"
899            index documents {
900                field uris: text [indexed, stored]
901
902                query_router {
903                    pattern: r"^pmid:(\d+)$"
904                    substitution: "pubmed://{1}"
905                    target_field: uris
906                    mode: additional
907                }
908            }
909        "#;
910
911        let indexes = parse_sdl(sdl).unwrap();
912        let router = &indexes[0].query_routers[0];
913
914        assert_eq!(router.pattern, r"^pmid:(\d+)$");
915        assert_eq!(router.substitution, "pubmed://{1}");
916        assert_eq!(router.mode, RoutingMode::Additional);
917    }
918
919    #[test]
920    fn test_multiple_query_routers() {
921        let sdl = r#"
922            index documents {
923                field uris: text [indexed, stored]
924
925                query_router {
926                    pattern: r"^doi:(10\.\d{4,}/[^\s]+)$"
927                    substitution: "doi://{1}"
928                    target_field: uris
929                    mode: exclusive
930                }
931
932                query_router {
933                    pattern: r"^pmid:(\d+)$"
934                    substitution: "pubmed://{1}"
935                    target_field: uris
936                    mode: exclusive
937                }
938
939                query_router {
940                    pattern: r"^arxiv:(\d+\.\d+)$"
941                    substitution: "arxiv://{1}"
942                    target_field: uris
943                    mode: additional
944                }
945            }
946        "#;
947
948        let indexes = parse_sdl(sdl).unwrap();
949        assert_eq!(indexes[0].query_routers.len(), 3);
950    }
951
952    #[test]
953    fn test_query_router_default_mode() {
954        let sdl = r#"
955            index documents {
956                field uris: text [indexed, stored]
957
958                query_router {
959                    pattern: r"test"
960                    substitution: "{0}"
961                    target_field: uris
962                }
963            }
964        "#;
965
966        let indexes = parse_sdl(sdl).unwrap();
967        // Default mode should be Additional
968        assert_eq!(indexes[0].query_routers[0].mode, RoutingMode::Additional);
969    }
970
971    #[test]
972    fn test_multi_attribute() {
973        let sdl = r#"
974            index documents {
975                field uris: text [indexed, stored, multi]
976                field title: text [indexed, stored]
977            }
978        "#;
979
980        let indexes = parse_sdl(sdl).unwrap();
981        assert_eq!(indexes.len(), 1);
982
983        let fields = &indexes[0].fields;
984        assert_eq!(fields.len(), 2);
985
986        // uris should have multi=true
987        assert_eq!(fields[0].name, "uris");
988        assert!(fields[0].multi, "uris field should have multi=true");
989
990        // title should have multi=false
991        assert_eq!(fields[1].name, "title");
992        assert!(!fields[1].multi, "title field should have multi=false");
993
994        // Verify schema conversion preserves multi attribute
995        let schema = indexes[0].to_schema();
996        let uris_field = schema.get_field("uris").unwrap();
997        let title_field = schema.get_field("title").unwrap();
998
999        assert!(schema.get_field_entry(uris_field).unwrap().multi);
1000        assert!(!schema.get_field_entry(title_field).unwrap().multi);
1001    }
1002
1003    #[test]
1004    fn test_sparse_vector_field() {
1005        let sdl = r#"
1006            index documents {
1007                field embedding: sparse_vector [indexed, stored]
1008            }
1009        "#;
1010
1011        let indexes = parse_sdl(sdl).unwrap();
1012        assert_eq!(indexes.len(), 1);
1013        assert_eq!(indexes[0].fields.len(), 1);
1014        assert_eq!(indexes[0].fields[0].name, "embedding");
1015        assert_eq!(indexes[0].fields[0].field_type, FieldType::SparseVector);
1016        assert!(indexes[0].fields[0].sparse_vector_config.is_none());
1017    }
1018
1019    #[test]
1020    fn test_sparse_vector_with_config() {
1021        let sdl = r#"
1022            index documents {
1023                field embedding: sparse_vector<u16> [indexed<quantization: uint8>, stored]
1024                field dense: sparse_vector<u32> [indexed<quantization: float32>]
1025            }
1026        "#;
1027
1028        let indexes = parse_sdl(sdl).unwrap();
1029        assert_eq!(indexes[0].fields.len(), 2);
1030
1031        // First field: u16 indices, uint8 quantization
1032        let f1 = &indexes[0].fields[0];
1033        assert_eq!(f1.name, "embedding");
1034        let config1 = f1.sparse_vector_config.as_ref().unwrap();
1035        assert_eq!(config1.index_size, IndexSize::U16);
1036        assert_eq!(config1.weight_quantization, WeightQuantization::UInt8);
1037
1038        // Second field: u32 indices, float32 quantization
1039        let f2 = &indexes[0].fields[1];
1040        assert_eq!(f2.name, "dense");
1041        let config2 = f2.sparse_vector_config.as_ref().unwrap();
1042        assert_eq!(config2.index_size, IndexSize::U32);
1043        assert_eq!(config2.weight_quantization, WeightQuantization::Float32);
1044    }
1045
1046    #[test]
1047    fn test_sparse_vector_with_weight_threshold() {
1048        let sdl = r#"
1049            index documents {
1050                field embedding: sparse_vector<u16> [indexed<quantization: uint8, weight_threshold: 0.1>, stored]
1051                field embedding2: sparse_vector<u32> [indexed<quantization: float16, weight_threshold: 0.05>]
1052            }
1053        "#;
1054
1055        let indexes = parse_sdl(sdl).unwrap();
1056        assert_eq!(indexes[0].fields.len(), 2);
1057
1058        // First field: u16 indices, uint8 quantization, threshold 0.1
1059        let f1 = &indexes[0].fields[0];
1060        assert_eq!(f1.name, "embedding");
1061        let config1 = f1.sparse_vector_config.as_ref().unwrap();
1062        assert_eq!(config1.index_size, IndexSize::U16);
1063        assert_eq!(config1.weight_quantization, WeightQuantization::UInt8);
1064        assert!((config1.weight_threshold - 0.1).abs() < 0.001);
1065
1066        // Second field: u32 indices, float16 quantization, threshold 0.05
1067        let f2 = &indexes[0].fields[1];
1068        assert_eq!(f2.name, "embedding2");
1069        let config2 = f2.sparse_vector_config.as_ref().unwrap();
1070        assert_eq!(config2.index_size, IndexSize::U32);
1071        assert_eq!(config2.weight_quantization, WeightQuantization::Float16);
1072        assert!((config2.weight_threshold - 0.05).abs() < 0.001);
1073    }
1074
1075    #[test]
1076    fn test_dense_vector_field() {
1077        let sdl = r#"
1078            index documents {
1079                field embedding: dense_vector<768> [indexed, stored]
1080            }
1081        "#;
1082
1083        let indexes = parse_sdl(sdl).unwrap();
1084        assert_eq!(indexes.len(), 1);
1085        assert_eq!(indexes[0].fields.len(), 1);
1086
1087        let f = &indexes[0].fields[0];
1088        assert_eq!(f.name, "embedding");
1089        assert_eq!(f.field_type, FieldType::DenseVector);
1090
1091        let config = f.dense_vector_config.as_ref().unwrap();
1092        assert_eq!(config.dim, 768);
1093    }
1094
1095    #[test]
1096    fn test_dense_vector_alias() {
1097        let sdl = r#"
1098            index documents {
1099                field embedding: vector<1536> [indexed]
1100            }
1101        "#;
1102
1103        let indexes = parse_sdl(sdl).unwrap();
1104        assert_eq!(indexes[0].fields[0].field_type, FieldType::DenseVector);
1105        assert_eq!(
1106            indexes[0].fields[0]
1107                .dense_vector_config
1108                .as_ref()
1109                .unwrap()
1110                .dim,
1111            1536
1112        );
1113    }
1114
1115    #[test]
1116    fn test_dense_vector_with_centroids() {
1117        let sdl = r#"
1118            index documents {
1119                field embedding: dense_vector<768> [indexed<centroids: "centroids.bin">, stored]
1120            }
1121        "#;
1122
1123        let indexes = parse_sdl(sdl).unwrap();
1124        assert_eq!(indexes.len(), 1);
1125
1126        let f = &indexes[0].fields[0];
1127        assert_eq!(f.name, "embedding");
1128        assert_eq!(f.field_type, FieldType::DenseVector);
1129
1130        let config = f.dense_vector_config.as_ref().unwrap();
1131        assert_eq!(config.dim, 768);
1132        assert_eq!(
1133            config.coarse_centroids_path.as_deref(),
1134            Some("centroids.bin")
1135        );
1136        assert_eq!(config.nprobe, 32); // default
1137    }
1138
1139    #[test]
1140    fn test_dense_vector_with_centroids_and_nprobe() {
1141        let sdl = r#"
1142            index documents {
1143                field embedding: dense_vector<1536> [indexed<centroids: "/path/to/centroids.bin", nprobe: 64>]
1144            }
1145        "#;
1146
1147        let indexes = parse_sdl(sdl).unwrap();
1148        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1149
1150        assert_eq!(config.dim, 1536);
1151        assert_eq!(
1152            config.coarse_centroids_path.as_deref(),
1153            Some("/path/to/centroids.bin")
1154        );
1155        assert_eq!(config.nprobe, 64);
1156    }
1157
1158    #[test]
1159    fn test_dense_vector_keyword_syntax() {
1160        let sdl = r#"
1161            index documents {
1162                field embedding: dense_vector<dims: 1536> [indexed, stored]
1163            }
1164        "#;
1165
1166        let indexes = parse_sdl(sdl).unwrap();
1167        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1168
1169        assert_eq!(config.dim, 1536);
1170        assert!(config.coarse_centroids_path.is_none());
1171    }
1172
1173    #[test]
1174    fn test_dense_vector_keyword_syntax_full() {
1175        let sdl = r#"
1176            index documents {
1177                field embedding: dense_vector<dims: 1536> [indexed<centroids: "/path/to/centroids.bin", nprobe: 64>]
1178            }
1179        "#;
1180
1181        let indexes = parse_sdl(sdl).unwrap();
1182        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1183
1184        assert_eq!(config.dim, 1536);
1185        assert_eq!(
1186            config.coarse_centroids_path.as_deref(),
1187            Some("/path/to/centroids.bin")
1188        );
1189        assert_eq!(config.nprobe, 64);
1190    }
1191
1192    #[test]
1193    fn test_dense_vector_keyword_syntax_partial() {
1194        let sdl = r#"
1195            index documents {
1196                field embedding: dense_vector<dims: 768> [indexed<centroids: "centroids.bin">]
1197            }
1198        "#;
1199
1200        let indexes = parse_sdl(sdl).unwrap();
1201        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1202
1203        assert_eq!(config.dim, 768);
1204        assert_eq!(
1205            config.coarse_centroids_path.as_deref(),
1206            Some("centroids.bin")
1207        );
1208        assert_eq!(config.nprobe, 32); // default
1209    }
1210
1211    #[test]
1212    fn test_dense_vector_scann_index() {
1213        use crate::dsl::schema::VectorIndexType;
1214
1215        let sdl = r#"
1216            index documents {
1217                field embedding: dense_vector<dims: 768> [indexed<scann, centroids: "centroids.bin", codebook: "pq_codebook.bin", nprobe: 64>]
1218            }
1219        "#;
1220
1221        let indexes = parse_sdl(sdl).unwrap();
1222        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1223
1224        assert_eq!(config.dim, 768);
1225        assert_eq!(config.index_type, VectorIndexType::ScaNN);
1226        assert_eq!(
1227            config.coarse_centroids_path.as_deref(),
1228            Some("centroids.bin")
1229        );
1230        assert_eq!(config.pq_codebook_path.as_deref(), Some("pq_codebook.bin"));
1231        assert_eq!(config.nprobe, 64);
1232    }
1233
1234    #[test]
1235    fn test_dense_vector_rabitq_index() {
1236        use crate::dsl::schema::VectorIndexType;
1237
1238        let sdl = r#"
1239            index documents {
1240                field embedding: dense_vector<dims: 1536> [indexed<rabitq, centroids: "centroids.bin">]
1241            }
1242        "#;
1243
1244        let indexes = parse_sdl(sdl).unwrap();
1245        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1246
1247        assert_eq!(config.dim, 1536);
1248        assert_eq!(config.index_type, VectorIndexType::IvfRaBitQ);
1249        assert_eq!(
1250            config.coarse_centroids_path.as_deref(),
1251            Some("centroids.bin")
1252        );
1253        assert!(config.pq_codebook_path.is_none());
1254    }
1255
1256    #[test]
1257    fn test_dense_vector_rabitq_no_centroids() {
1258        use crate::dsl::schema::VectorIndexType;
1259
1260        let sdl = r#"
1261            index documents {
1262                field embedding: dense_vector<dims: 768> [indexed<rabitq>]
1263            }
1264        "#;
1265
1266        let indexes = parse_sdl(sdl).unwrap();
1267        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1268
1269        assert_eq!(config.dim, 768);
1270        assert_eq!(config.index_type, VectorIndexType::IvfRaBitQ);
1271        assert!(config.coarse_centroids_path.is_none());
1272    }
1273
1274    #[test]
1275    fn test_dense_vector_default_index_type() {
1276        use crate::dsl::schema::VectorIndexType;
1277
1278        // When no index type specified, should default to RaBitQ (basic)
1279        let sdl = r#"
1280            index documents {
1281                field embedding: dense_vector<dims: 768> [indexed]
1282            }
1283        "#;
1284
1285        let indexes = parse_sdl(sdl).unwrap();
1286        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1287
1288        assert_eq!(config.dim, 768);
1289        assert_eq!(config.index_type, VectorIndexType::RaBitQ);
1290    }
1291
1292    #[test]
1293    fn test_dense_vector_mrl_dim() {
1294        // Test matryoshka/MRL dimension trimming (new syntax: mrl_dim in indexed<...>)
1295        let sdl = r#"
1296            index documents {
1297                field embedding: dense_vector<1536> [indexed<mrl_dim: 256>]
1298            }
1299        "#;
1300
1301        let indexes = parse_sdl(sdl).unwrap();
1302        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1303
1304        assert_eq!(config.dim, 1536);
1305        assert_eq!(config.mrl_dim, Some(256));
1306        assert_eq!(config.index_dim(), 256);
1307    }
1308
1309    #[test]
1310    fn test_dense_vector_mrl_dim_with_centroids() {
1311        // Test mrl_dim combined with other index options
1312        let sdl = r#"
1313            index documents {
1314                field embedding: dense_vector<768> [indexed<centroids: "centroids.bin", nprobe: 64, mrl_dim: 128>]
1315            }
1316        "#;
1317
1318        let indexes = parse_sdl(sdl).unwrap();
1319        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1320
1321        assert_eq!(config.dim, 768);
1322        assert_eq!(config.mrl_dim, Some(128));
1323        assert_eq!(config.index_dim(), 128);
1324        assert_eq!(
1325            config.coarse_centroids_path.as_deref(),
1326            Some("centroids.bin")
1327        );
1328        assert_eq!(config.nprobe, 64);
1329    }
1330
1331    #[test]
1332    fn test_dense_vector_no_mrl_dim() {
1333        // Test that index_dim() returns full dim when mrl_dim is not set
1334        let sdl = r#"
1335            index documents {
1336                field embedding: dense_vector<dims: 768> [indexed]
1337            }
1338        "#;
1339
1340        let indexes = parse_sdl(sdl).unwrap();
1341        let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1342
1343        assert_eq!(config.dim, 768);
1344        assert_eq!(config.mrl_dim, None);
1345        assert_eq!(config.index_dim(), 768);
1346    }
1347
1348    #[test]
1349    fn test_json_field_type() {
1350        let sdl = r#"
1351            index documents {
1352                field title: text [indexed, stored]
1353                field metadata: json [stored]
1354                field extra: json
1355            }
1356        "#;
1357
1358        let indexes = parse_sdl(sdl).unwrap();
1359        let index = &indexes[0];
1360
1361        assert_eq!(index.fields.len(), 3);
1362
1363        // Check JSON field
1364        assert_eq!(index.fields[1].name, "metadata");
1365        assert!(matches!(index.fields[1].field_type, FieldType::Json));
1366        assert!(index.fields[1].stored);
1367        // JSON fields should not be indexed (enforced by add_json_field)
1368
1369        // Check default attributes for JSON field
1370        assert_eq!(index.fields[2].name, "extra");
1371        assert!(matches!(index.fields[2].field_type, FieldType::Json));
1372
1373        // Verify schema conversion
1374        let schema = index.to_schema();
1375        let metadata_field = schema.get_field("metadata").unwrap();
1376        let entry = schema.get_field_entry(metadata_field).unwrap();
1377        assert_eq!(entry.field_type, FieldType::Json);
1378        assert!(!entry.indexed); // JSON fields are never indexed
1379        assert!(entry.stored);
1380    }
1381}