// laurus 0.9.0
// Unified search library for lexical, vector, and semantic retrieval.

pub mod analyzer;
pub mod embedder;

use serde::{Deserialize, Serialize};
use std::collections::HashMap;

use self::analyzer::AnalyzerDefinition;
use self::embedder::EmbedderDefinition;

use crate::lexical::core::field::{
    BooleanOption, BytesOption, DateTimeOption, FloatOption, Geo3dOption, GeoOption, IntegerOption,
    TextOption,
};
use crate::vector::core::field::{FlatOption, HnswOption, IvfOption};

/// Policy for fields that are not declared in the schema.
///
/// Applied when a document is ingested with field names that do not appear in
/// [`Schema::fields`]. The default is [`DynamicFieldPolicy::Dynamic`], which
/// mirrors the "schema-less onboarding" design goal: users can start indexing
/// immediately without defining a schema upfront.
///
/// # Variants
///
/// - [`Strict`](Self::Strict): Unknown fields cause the ingest to fail. Use
///   when you want to enforce an exact schema contract.
/// - [`Dynamic`](Self::Dynamic) (default): Unknown fields are accepted; their
///   type is inferred from the value and a new field definition is added to
///   the schema automatically.
/// - [`Ignore`](Self::Ignore): Unknown fields are silently dropped. Use when
///   you want to ingest partially-structured data without rejecting it but
///   also without expanding the schema.
///
/// # Textual forms
///
/// Two spellings exist and they differ in case:
///
/// - The [`FromStr`](std::str::FromStr) implementation accepts `"strict"`,
///   `"dynamic"`, and `"ignore"`, ignoring ASCII case and surrounding
///   whitespace.
/// - The serde derives carry no rename attribute, so self-describing formats
///   such as JSON use the variant identifiers as written (`Strict`,
///   `Dynamic`, `Ignore`).
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
pub enum DynamicFieldPolicy {
    /// Fail the ingest when any field is not declared in the schema.
    Strict,
    /// Infer the type of unknown fields and add them to the schema.
    /// This is the variant returned by `DynamicFieldPolicy::default()`.
    #[default]
    Dynamic,
    /// Silently drop unknown fields.
    Ignore,
}

impl std::str::FromStr for DynamicFieldPolicy {
    type Err = crate::error::LaurusError;

    /// Parse a policy name (case-insensitive).
    ///
    /// Accepted values: `"strict"`, `"dynamic"`, `"ignore"`. This is the
    /// canonical policy parser used by all language bindings so the accepted
    /// spelling is identical across Python, Node.js, WASM, Ruby, and PHP.
    ///
    /// # Errors
    ///
    /// Returns [`crate::error::LaurusError::invalid_argument`] for any
    /// unrecognised value.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s.trim().to_ascii_lowercase().as_str() {
            "strict" => Ok(DynamicFieldPolicy::Strict),
            "dynamic" => Ok(DynamicFieldPolicy::Dynamic),
            "ignore" => Ok(DynamicFieldPolicy::Ignore),
            other => Err(crate::error::LaurusError::invalid_argument(format!(
                "unknown dynamic field policy '{other}' \
                 (expected 'strict', 'dynamic', or 'ignore')"
            ))),
        }
    }
}

/// Field name of the automatically-injected external document ID.
///
/// It is the only `_`-prefixed name the engine accepts from user code; every
/// other name starting with `_` is rejected by [`validate_field_name`].
pub const RESERVED_ID_FIELD: &str = "_id";

/// Tells whether `name` belongs to the allow-list of reserved field names
/// that user code may reference explicitly.
///
/// The allow-list currently holds a single entry, [`RESERVED_ID_FIELD`], and
/// the comparison is exact (case-sensitive, no trimming).
///
/// # Arguments
///
/// * `name` - The field name to check.
pub fn is_allowed_reserved_field(name: &str) -> bool {
    matches!(name, RESERVED_ID_FIELD)
}

/// Validates that a user-supplied field name does not collide with the
/// engine's reserved namespace.
///
/// Field names whose first character is `_` are reserved for the engine
/// (e.g. [`RESERVED_ID_FIELD`]) and cannot be declared by users. The only
/// exception is the allow-listed names returned by
/// [`is_allowed_reserved_field`].
///
/// # Arguments
///
/// * `name` - The field name to validate.
///
/// # Errors
///
/// Returns [`crate::error::LaurusError::invalid_argument`] if the name starts
/// with `_` and is not in the allow-list.
pub fn validate_field_name(name: &str) -> crate::error::Result<()> {
    if name.starts_with('_') && !is_allowed_reserved_field(name) {
        return Err(crate::error::LaurusError::invalid_argument(format!(
            "Field name '{name}' is reserved: names starting with '_' are \
             reserved for system fields (allowed: '{RESERVED_ID_FIELD}')"
        )));
    }
    Ok(())
}

/// Schema for the unified engine.
///
/// Declares what fields exist, their index types (lexical or vector),
/// and optional custom analyzer definitions. Custom analyzers are
/// referenced by name from [`TextOption::analyzer`].
///
/// The schema also carries a [`DynamicFieldPolicy`] that controls how
/// undeclared fields are handled during document ingestion. The default is
/// [`DynamicFieldPolicy::Dynamic`].
///
/// # Serialization
///
/// Every member except [`fields`](Self::fields) is marked `#[serde(default)]`
/// and may therefore be omitted from serialized input; `fields` has no such
/// attribute. Empty `analyzers` and `embedders` maps are skipped when
/// serializing.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Schema {
    /// Custom analyzer definitions, keyed by name.
    /// These can be referenced from text field `analyzer` settings.
    /// Optional on input; omitted from output when empty.
    #[serde(default, skip_serializing_if = "HashMap::is_empty")]
    pub analyzers: HashMap<String, AnalyzerDefinition>,
    /// Embedder definitions, keyed by name.
    /// These can be referenced from vector field `embedder` settings.
    /// Optional on input; omitted from output when empty.
    #[serde(default, skip_serializing_if = "HashMap::is_empty")]
    pub embedders: HashMap<String, EmbedderDefinition>,
    /// Options for each field, keyed by field name.
    pub fields: HashMap<String, FieldOption>,
    /// Default fields for search.
    /// Optional on input (defaults to an empty list).
    #[serde(default)]
    pub default_fields: Vec<String>,
    /// Policy for fields not declared in [`fields`](Self::fields).
    /// Defaults to [`DynamicFieldPolicy::Dynamic`].
    #[serde(default)]
    pub dynamic_field_policy: DynamicFieldPolicy,
}

impl Schema {
    pub fn new() -> Self {
        Self {
            analyzers: HashMap::new(),
            embedders: HashMap::new(),
            fields: HashMap::new(),
            default_fields: Vec::new(),
            dynamic_field_policy: DynamicFieldPolicy::default(),
        }
    }

    pub fn builder() -> SchemaBuilder {
        SchemaBuilder::default()
    }
}

impl Default for Schema {
    fn default() -> Self {
        Self::new()
    }
}

/// Options for a single field in the unified schema.
///
/// Each variant directly represents a concrete field type.
/// For hybrid search, define separate fields for vector and lexical indexing.
///
/// # Lexical and vector variants
///
/// - Lexical (see [`is_lexical`](Self::is_lexical)): `Text`, `Integer`,
///   `Float`, `Boolean`, `DateTime`, `Geo`, `Geo3d`, `Bytes`.
/// - Vector (see [`is_vector`](Self::is_vector)): `Hnsw`, `Flat`, `Ivf`.
///
/// # Serialized form
///
/// Serializes using serde's externally tagged representation. Two
/// independent examples, one per line:
/// ```json
/// { "Text": { "indexed": true, "stored": true, "term_vectors": false } }
/// { "Hnsw": { "dimension": 384, "distance": "Cosine" } }
/// ```
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum FieldOption {
    /// Text field options (lexical search).
    Text(TextOption),
    /// Integer field options.
    Integer(IntegerOption),
    /// Float field options.
    Float(FloatOption),
    /// Boolean field options.
    Boolean(BooleanOption),
    /// DateTime field options.
    DateTime(DateTimeOption),
    /// 2D geo field options.
    Geo(GeoOption),
    /// 3D ECEF geo field options.
    Geo3d(Geo3dOption),
    /// Bytes field options.
    Bytes(BytesOption),
    /// HNSW vector index options.
    Hnsw(HnswOption),
    /// Flat vector index options.
    Flat(FlatOption),
    /// IVF vector index options.
    Ivf(IvfOption),
}

impl FieldOption {
    /// Reports whether this option describes a vector index
    /// (`Hnsw`, `Flat`, or `Ivf`).
    pub fn is_vector(&self) -> bool {
        match self {
            Self::Hnsw(_) | Self::Flat(_) | Self::Ivf(_) => true,
            Self::Text(_)
            | Self::Integer(_)
            | Self::Float(_)
            | Self::Boolean(_)
            | Self::DateTime(_)
            | Self::Geo(_)
            | Self::Geo3d(_)
            | Self::Bytes(_) => false,
        }
    }

    /// Reports whether this option describes a lexical field (`Text`,
    /// `Integer`, `Float`, `Boolean`, `DateTime`, `Geo`, `Geo3d`, or `Bytes`).
    pub fn is_lexical(&self) -> bool {
        match self {
            Self::Text(_)
            | Self::Integer(_)
            | Self::Float(_)
            | Self::Boolean(_)
            | Self::DateTime(_)
            | Self::Geo(_)
            | Self::Geo3d(_)
            | Self::Bytes(_) => true,
            Self::Hnsw(_) | Self::Flat(_) | Self::Ivf(_) => false,
        }
    }

    /// Produces the vector subsystem's `FieldOption` holding a clone of the
    /// inner options, or `None` when this is not a vector field.
    pub fn to_vector(&self) -> Option<crate::vector::core::field::FieldOption> {
        let converted = match self {
            Self::Hnsw(opt) => crate::vector::core::field::FieldOption::Hnsw(opt.clone()),
            Self::Flat(opt) => crate::vector::core::field::FieldOption::Flat(opt.clone()),
            Self::Ivf(opt) => crate::vector::core::field::FieldOption::Ivf(opt.clone()),
            Self::Text(_)
            | Self::Integer(_)
            | Self::Float(_)
            | Self::Boolean(_)
            | Self::DateTime(_)
            | Self::Geo(_)
            | Self::Geo3d(_)
            | Self::Bytes(_) => return None,
        };
        Some(converted)
    }

    /// Returns the name of the embedder configured on a vector field.
    ///
    /// Yields `None` for lexical fields and for vector fields whose
    /// `embedder` option is unset.
    pub fn embedder_name(&self) -> Option<&str> {
        match self {
            Self::Hnsw(opt) => opt.embedder.as_deref(),
            Self::Flat(opt) => opt.embedder.as_deref(),
            Self::Ivf(opt) => opt.embedder.as_deref(),
            Self::Text(_)
            | Self::Integer(_)
            | Self::Float(_)
            | Self::Boolean(_)
            | Self::DateTime(_)
            | Self::Geo(_)
            | Self::Geo3d(_)
            | Self::Bytes(_) => None,
        }
    }

    /// Produces the lexical subsystem's `FieldOption` holding a clone of the
    /// inner options, or `None` when this is not a lexical field.
    pub fn to_lexical(&self) -> Option<crate::lexical::core::field::FieldOption> {
        let converted = match self {
            Self::Text(opt) => crate::lexical::core::field::FieldOption::Text(opt.clone()),
            Self::Integer(opt) => crate::lexical::core::field::FieldOption::Integer(opt.clone()),
            Self::Float(opt) => crate::lexical::core::field::FieldOption::Float(opt.clone()),
            Self::Boolean(opt) => crate::lexical::core::field::FieldOption::Boolean(opt.clone()),
            Self::DateTime(opt) => {
                crate::lexical::core::field::FieldOption::DateTime(opt.clone())
            }
            Self::Geo(opt) => crate::lexical::core::field::FieldOption::Geo(opt.clone()),
            Self::Geo3d(opt) => crate::lexical::core::field::FieldOption::Geo3d(opt.clone()),
            Self::Bytes(opt) => crate::lexical::core::field::FieldOption::Bytes(opt.clone()),
            Self::Hnsw(_) | Self::Flat(_) | Self::Ivf(_) => return None,
        };
        Some(converted)
    }
}

/// Chained builder for [`Schema`].
///
/// Obtained from [`Schema::builder`] or `SchemaBuilder::default()`; both
/// start with empty collections and the default [`DynamicFieldPolicy`].
/// Every configuration method consumes the builder and returns it, so calls
/// can be chained. Finish with [`try_build`](Self::try_build) (fallible) or
/// [`build`](Self::build) (panics on an invalid field name).
///
/// `Debug` and `Clone` are derived so a partially configured builder can be
/// logged or reused as a template; `Schema` derives the same traits over the
/// same field types.
#[derive(Debug, Clone, Default)]
pub struct SchemaBuilder {
    /// Custom analyzer definitions collected so far, keyed by name.
    analyzers: HashMap<String, AnalyzerDefinition>,
    /// Embedder definitions collected so far, keyed by name.
    embedders: HashMap<String, EmbedderDefinition>,
    /// Field options collected so far, keyed by field name.
    fields: HashMap<String, FieldOption>,
    /// Default search fields, in the order they were added.
    default_fields: Vec<String>,
    /// Policy for undeclared fields that the built schema will carry.
    dynamic_field_policy: DynamicFieldPolicy,
}

impl SchemaBuilder {
    /// Registers `option` under `name`.
    ///
    /// A later call with the same name replaces the earlier entry. The name
    /// is not validated here; validation happens in
    /// [`try_build`](Self::try_build).
    pub fn add_field(mut self, name: impl Into<String>, option: FieldOption) -> Self {
        self.fields.insert(name.into(), option);
        self
    }

    /// Registers a text field; shorthand for [`add_field`](Self::add_field)
    /// with [`FieldOption::Text`].
    pub fn add_text_field(self, name: impl Into<String>, option: impl Into<TextOption>) -> Self {
        let option = FieldOption::Text(option.into());
        self.add_field(name, option)
    }

    /// Registers an integer field; shorthand for
    /// [`add_field`](Self::add_field) with [`FieldOption::Integer`].
    pub fn add_integer_field(
        self,
        name: impl Into<String>,
        option: impl Into<IntegerOption>,
    ) -> Self {
        let option = FieldOption::Integer(option.into());
        self.add_field(name, option)
    }

    /// Registers a float field; shorthand for [`add_field`](Self::add_field)
    /// with [`FieldOption::Float`].
    pub fn add_float_field(self, name: impl Into<String>, option: impl Into<FloatOption>) -> Self {
        let option = FieldOption::Float(option.into());
        self.add_field(name, option)
    }

    /// Registers a boolean field; shorthand for
    /// [`add_field`](Self::add_field) with [`FieldOption::Boolean`].
    pub fn add_boolean_field(
        self,
        name: impl Into<String>,
        option: impl Into<BooleanOption>,
    ) -> Self {
        let option = FieldOption::Boolean(option.into());
        self.add_field(name, option)
    }

    /// Registers a date-time field; shorthand for
    /// [`add_field`](Self::add_field) with [`FieldOption::DateTime`].
    pub fn add_datetime_field(
        self,
        name: impl Into<String>,
        option: impl Into<DateTimeOption>,
    ) -> Self {
        let option = FieldOption::DateTime(option.into());
        self.add_field(name, option)
    }

    /// Registers a 2D geo field; shorthand for [`add_field`](Self::add_field)
    /// with [`FieldOption::Geo`].
    pub fn add_geo_field(self, name: impl Into<String>, option: impl Into<GeoOption>) -> Self {
        let option = FieldOption::Geo(option.into());
        self.add_field(name, option)
    }

    /// Registers a 3D ECEF geo field, indexed in a 3D BKD tree for sphere /
    /// k-NN queries (queries themselves arrive with #300–#302).
    pub fn add_geo3d_field(self, name: impl Into<String>, option: impl Into<Geo3dOption>) -> Self {
        let option = FieldOption::Geo3d(option.into());
        self.add_field(name, option)
    }

    /// Registers a bytes field; shorthand for [`add_field`](Self::add_field)
    /// with [`FieldOption::Bytes`].
    pub fn add_bytes_field(self, name: impl Into<String>, option: impl Into<BytesOption>) -> Self {
        let option = FieldOption::Bytes(option.into());
        self.add_field(name, option)
    }

    /// Registers an HNSW vector field; shorthand for
    /// [`add_field`](Self::add_field) with [`FieldOption::Hnsw`].
    pub fn add_hnsw_field(self, name: impl Into<String>, option: impl Into<HnswOption>) -> Self {
        let option = FieldOption::Hnsw(option.into());
        self.add_field(name, option)
    }

    /// Registers a flat vector field; shorthand for
    /// [`add_field`](Self::add_field) with [`FieldOption::Flat`].
    pub fn add_flat_field(self, name: impl Into<String>, option: impl Into<FlatOption>) -> Self {
        let option = FieldOption::Flat(option.into());
        self.add_field(name, option)
    }

    /// Registers an IVF vector field; shorthand for
    /// [`add_field`](Self::add_field) with [`FieldOption::Ivf`].
    pub fn add_ivf_field(self, name: impl Into<String>, option: impl Into<IvfOption>) -> Self {
        let option = FieldOption::Ivf(option.into());
        self.add_field(name, option)
    }

    /// Appends `name` to the list of default search fields.
    ///
    /// Names are kept in insertion order and are not de-duplicated or
    /// checked against the declared fields.
    pub fn add_default_field(mut self, name: impl Into<String>) -> Self {
        self.default_fields.push(name.into());
        self
    }

    /// Registers a custom analyzer definition.
    ///
    /// # Arguments
    ///
    /// * `name` - The analyzer name (referenced from `TextOption::analyzer`).
    /// * `definition` - The analyzer definition.
    pub fn add_analyzer(mut self, name: impl Into<String>, definition: AnalyzerDefinition) -> Self {
        let key: String = name.into();
        self.analyzers.insert(key, definition);
        self
    }

    /// Registers an embedder definition.
    ///
    /// # Arguments
    ///
    /// * `name` - The embedder name (referenced from vector field `embedder`).
    /// * `definition` - The embedder definition.
    pub fn add_embedder(mut self, name: impl Into<String>, definition: EmbedderDefinition) -> Self {
        let key: String = name.into();
        self.embedders.insert(key, definition);
        self
    }

    /// Chooses how fields missing from the schema are treated.
    ///
    /// See [`DynamicFieldPolicy`] for the available options. If this method
    /// is never called, the built schema uses
    /// [`DynamicFieldPolicy::Dynamic`].
    ///
    /// # Arguments
    ///
    /// * `policy` - The dynamic field policy to apply during ingestion.
    pub fn dynamic_field_policy(self, policy: DynamicFieldPolicy) -> Self {
        Self {
            dynamic_field_policy: policy,
            ..self
        }
    }

    /// Validates the declared field names and assembles the [`Schema`].
    ///
    /// # Errors
    ///
    /// Returns an error if any field name starts with `_` and is not in the
    /// reserved allow-list (see [`validate_field_name`]).
    pub fn try_build(self) -> crate::error::Result<Schema> {
        // Stop at the first offending name; nothing is built on failure.
        self.fields
            .keys()
            .try_for_each(|name| validate_field_name(name))?;

        let Self {
            analyzers,
            embedders,
            fields,
            default_fields,
            dynamic_field_policy,
        } = self;

        Ok(Schema {
            analyzers,
            embedders,
            fields,
            default_fields,
            dynamic_field_policy,
        })
    }

    /// Assembles the [`Schema`], panicking on invalid input.
    ///
    /// # Panics
    ///
    /// Panics if any field name collides with a reserved name. Use
    /// [`try_build`](Self::try_build) for a fallible variant.
    pub fn build(self) -> Schema {
        let outcome = self.try_build();
        outcome.expect("SchemaBuilder::build: field name validation failed")
    }
}

#[cfg(test)]
mod tests {
    //! Unit tests for the schema module: policy defaults and parsing,
    //! reserved field-name validation, builder behaviour, and the
    //! engine-to-subsystem `FieldOption` bridges.

    use super::*;
    use crate::lexical::core::field::TextOption;

    #[test]
    fn default_dynamic_field_policy_is_dynamic() {
        assert_eq!(DynamicFieldPolicy::default(), DynamicFieldPolicy::Dynamic);
    }

    #[test]
    fn schema_new_uses_default_policy() {
        let schema = Schema::new();
        assert_eq!(schema.dynamic_field_policy, DynamicFieldPolicy::Dynamic);
    }

    #[test]
    fn schema_builder_sets_policy() {
        let schema = Schema::builder()
            .dynamic_field_policy(DynamicFieldPolicy::Strict)
            .build();
        assert_eq!(schema.dynamic_field_policy, DynamicFieldPolicy::Strict);
    }

    /// The parser trims whitespace and ignores ASCII case.
    #[test]
    fn dynamic_field_policy_from_str_ignores_case_and_whitespace() {
        let cases = [
            ("strict", DynamicFieldPolicy::Strict),
            ("dynamic", DynamicFieldPolicy::Dynamic),
            ("ignore", DynamicFieldPolicy::Ignore),
            ("Dynamic", DynamicFieldPolicy::Dynamic),
            ("IGNORE", DynamicFieldPolicy::Ignore),
            ("  strict\t", DynamicFieldPolicy::Strict),
        ];
        for (input, expected) in cases {
            assert_eq!(
                input.parse::<DynamicFieldPolicy>().ok(),
                Some(expected),
                "input: {input:?}"
            );
        }
    }

    /// Anything other than the three policy names is rejected.
    #[test]
    fn dynamic_field_policy_from_str_rejects_unknown_value() {
        for input in ["", "   ", "lenient", "strict!", "dyn amic"] {
            assert!(
                input.parse::<DynamicFieldPolicy>().is_err(),
                "input: {input:?}"
            );
        }
    }

    #[test]
    fn validate_field_name_accepts_regular_name() {
        assert!(validate_field_name("title").is_ok());
        assert!(validate_field_name("year_2024").is_ok());
        assert!(validate_field_name("a").is_ok());
    }

    #[test]
    fn validate_field_name_accepts_id() {
        assert!(validate_field_name(RESERVED_ID_FIELD).is_ok());
    }

    #[test]
    fn validate_field_name_rejects_underscore_prefix() {
        let err = validate_field_name("_score").unwrap_err();
        assert!(
            err.to_string().contains("reserved"),
            "unexpected error: {err}"
        );
        assert!(validate_field_name("_custom").is_err());
        assert!(validate_field_name("__foo").is_err());
    }

    #[test]
    fn schema_builder_try_build_rejects_reserved_name() {
        let result = Schema::builder()
            .add_field("_bad", FieldOption::Text(TextOption::default()))
            .try_build();
        assert!(result.is_err());
    }

    #[test]
    fn schema_builder_try_build_accepts_regular_names() {
        let result = Schema::builder()
            .add_field("title", FieldOption::Text(TextOption::default()))
            .try_build();
        assert!(result.is_ok());
    }

    /// The allow-listed `_id` name passes builder validation.
    #[test]
    fn schema_builder_try_build_accepts_reserved_id_field() {
        let result = Schema::builder()
            .add_field(RESERVED_ID_FIELD, FieldOption::Text(TextOption::default()))
            .try_build();
        assert!(result.is_ok());
    }

    /// `build` is the panicking counterpart of `try_build`.
    #[test]
    #[should_panic(expected = "field name validation failed")]
    fn schema_builder_build_panics_on_reserved_name() {
        let _ = Schema::builder()
            .add_field("_bad", FieldOption::Text(TextOption::default()))
            .build();
    }

    /// Default fields are recorded in the order they were added.
    #[test]
    fn schema_builder_keeps_default_field_order() {
        let schema = Schema::builder()
            .add_default_field("title")
            .add_default_field("body")
            .build();
        assert_eq!(schema.default_fields, vec!["title", "body"]);
    }

    /// A lexical field exposes no vector conversion and no embedder name.
    #[test]
    fn lexical_field_has_no_vector_view() {
        let opt = FieldOption::Text(TextOption::default());
        assert!(opt.is_lexical());
        assert!(!opt.is_vector());
        assert!(opt.to_lexical().is_some());
        assert!(opt.to_vector().is_none());
        assert!(opt.embedder_name().is_none());
    }

    #[test]
    fn schema_builder_add_geo3d_field_round_trips() {
        let schema = Schema::builder()
            .add_geo3d_field("position", Geo3dOption::default())
            .build();
        let opt = schema.fields.get("position").expect("field declared");
        match opt {
            FieldOption::Geo3d(g3d) => {
                assert!(g3d.indexed);
                assert!(g3d.stored);
            }
            other => panic!("expected FieldOption::Geo3d, got {other:?}"),
        }

        // The engine schema -> lexical schema bridge must preserve Geo3d.
        let lexical = opt.to_lexical().expect("Geo3d is a lexical field");
        assert!(matches!(
            lexical,
            crate::lexical::core::field::FieldOption::Geo3d(_)
        ));
        assert!(opt.is_lexical());
        assert!(!opt.is_vector());
    }

    #[test]
    fn dynamic_field_policy_serde_round_trip() {
        for policy in [
            DynamicFieldPolicy::Strict,
            DynamicFieldPolicy::Dynamic,
            DynamicFieldPolicy::Ignore,
        ] {
            let json = serde_json::to_string(&policy).unwrap();
            let back: DynamicFieldPolicy = serde_json::from_str(&json).unwrap();
            assert_eq!(policy, back);
        }
    }
}