laurus 0.9.0

Unified search library for lexical, vector, and semantic retrieval
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
use std::collections::HashMap;

use chrono::{DateTime, Utc};
use rkyv::{Archive, Deserialize as RkyvDeserialize, Serialize as RkyvSerialize};
use serde::{Deserialize, Serialize};

use crate::error::LaurusError;

/// Helper for archiving `DateTime<Utc>` as a microsecond timestamp (`i64`).
///
/// This is a zero-sized marker type: it carries no data and exists only so
/// the `rkyv` "with" traits can be implemented on it. It is applied through
/// `#[rkyv(with = MicroSeconds)]` on the `DateTime` variant of [`DataValue`].
///
/// On deserialization an out-of-range timestamp does not produce an error;
/// it falls back to the default `DateTime<Utc>` value instead.
pub struct MicroSeconds;

/// Archive side of the [`MicroSeconds`] wrapper: a `DateTime<Utc>` is stored
/// as the archived form of an `i64` microsecond timestamp.
impl rkyv::with::ArchiveWith<DateTime<Utc>> for MicroSeconds {
    type Archived = rkyv::Archived<i64>;
    type Resolver = ();

    /// Writes the field's microsecond timestamp into `out`.
    ///
    /// The resolver is the unit type because the timestamp is resolved with
    /// `()` as well; no extra state is carried over from serialization.
    fn resolve_with(field: &DateTime<Utc>, _resolver: (), out: rkyv::Place<Self::Archived>) {
        let micros: i64 = field.timestamp_micros();
        rkyv::Archive::resolve(&micros, (), out);
    }
}

/// Serialize side of the [`MicroSeconds`] wrapper: delegates to the `i64`
/// serializer with the field's microsecond timestamp.
impl<S> rkyv::with::SerializeWith<DateTime<Utc>, S> for MicroSeconds
where
    S: rkyv::rancor::Fallible + ?Sized,
{
    /// Serializes `field` as its microsecond timestamp, forwarding any error
    /// reported by `serializer`.
    fn serialize_with(
        field: &DateTime<Utc>,
        serializer: &mut S,
    ) -> Result<Self::Resolver, S::Error> {
        let micros: i64 = field.timestamp_micros();
        RkyvSerialize::serialize(&micros, serializer)
    }
}

/// Deserialize side of the [`MicroSeconds`] wrapper: rebuilds a
/// `DateTime<Utc>` from the archived `i64` microsecond timestamp.
impl<D> rkyv::with::DeserializeWith<rkyv::Archived<i64>, DateTime<Utc>, D> for MicroSeconds
where
    D: rkyv::rancor::Fallible + ?Sized,
{
    /// Converts the archived timestamp back into a `DateTime<Utc>`.
    ///
    /// This never returns `Err`: a timestamp that cannot be represented is
    /// replaced by the default `DateTime<Utc>` rather than reported.
    fn deserialize_with(
        archived: &rkyv::Archived<i64>,
        _deserializer: &mut D,
    ) -> Result<DateTime<Utc>, D::Error> {
        let micros: i64 = (*archived).into();
        // `from_timestamp_micros` yields `None` only for out-of-range values.
        // Corrupted timestamps degrade to the default value (UNIX epoch)
        // instead of panicking or failing the whole deserialization.
        let restored = match DateTime::from_timestamp_micros(micros) {
            Some(dt) => dt,
            None => DateTime::<Utc>::default(),
        };
        Ok(restored)
    }
}

/// A geographical point on the Earth's surface, in WGS84 latitude /
/// longitude degrees.
///
/// `lat` is bounded to `[-90, 90]` and `lon` to `[-180, 180]`. Use
/// [`GeoPoint::try_new`] to validate at construction; the infallible
/// [`GeoPoint::new`] only debug-asserts and is intended for callers that
/// have already validated their input (or for hot paths inside the engine
/// where validation has happened upstream).
///
/// Both fields are public, so the documented ranges are a convention that
/// the constructors check, not an invariant enforced by the type itself.
#[derive(
    Debug, Clone, Copy, PartialEq, Serialize, Deserialize, Archive, RkyvSerialize, RkyvDeserialize,
)]
pub struct GeoPoint {
    /// Latitude in degrees, bounded to `[-90, 90]`.
    pub lat: f64,
    /// Longitude in degrees, bounded to `[-180, 180]`.
    pub lon: f64,
}

impl GeoPoint {
    /// Construct a `GeoPoint` without validation.
    ///
    /// Debug builds assert that `lat` and `lon` are inside their valid
    /// ranges; release builds skip the check, on the assumption that the
    /// caller validated the values upstream (or constructed them from a
    /// trusted source like a previous serialized index). Prefer
    /// [`GeoPoint::try_new`] when handling user-supplied input.
    ///
    /// # Panics
    ///
    /// In debug builds only, panics if `lat` is outside `[-90, 90]` or `lon`
    /// is outside `[-180, 180]` (this includes `NaN`).
    #[inline]
    pub fn new(lat: f64, lon: f64) -> Self {
        debug_assert!(
            (-90.0..=90.0).contains(&lat),
            "GeoPoint latitude {lat} out of range [-90, 90]"
        );
        debug_assert!(
            (-180.0..=180.0).contains(&lon),
            "GeoPoint longitude {lon} out of range [-180, 180]"
        );
        GeoPoint { lat, lon }
    }

    /// Construct a `GeoPoint`, validating that `lat` and `lon` lie inside
    /// their canonical WGS84 ranges.
    ///
    /// # Errors
    ///
    /// Returns `LaurusError::other` for out-of-range or `NaN` inputs. `NaN`
    /// is rejected because a range never contains it. Latitude is checked
    /// before longitude, so when both are invalid the error names the
    /// latitude.
    pub fn try_new(lat: f64, lon: f64) -> crate::error::Result<Self> {
        if !(-90.0..=90.0).contains(&lat) {
            return Err(LaurusError::other(format!(
                "Invalid latitude: {lat} (must be between -90 and 90)"
            )));
        }
        if !(-180.0..=180.0).contains(&lon) {
            return Err(LaurusError::other(format!(
                "Invalid longitude: {lon} (must be between -180 and 180)"
            )));
        }
        Ok(GeoPoint { lat, lon })
    }

    /// Great-circle distance to another point in meters, using the
    /// Haversine formula on a sphere of mean Earth radius (6 371 000 m).
    ///
    /// For valid coordinates the result is finite and lies in
    /// `[0, π · 6 371 000]`. `NaN` coordinates yield `NaN`.
    pub fn distance_to(&self, other: &GeoPoint) -> f64 {
        const EARTH_RADIUS_M: f64 = 6_371_000.0;

        let lat1_rad = self.lat.to_radians();
        let lat2_rad = other.lat.to_radians();
        let delta_lat = (other.lat - self.lat).to_radians();
        let delta_lon = (other.lon - self.lon).to_radians();

        let a = (delta_lat / 2.0).sin().powi(2)
            + lat1_rad.cos() * lat2_rad.cos() * (delta_lon / 2.0).sin().powi(2);
        // Mathematically `a <= 1`, but for (near-)antipodal points rounding
        // can leave it a hair above 1, which would make `(1.0 - a).sqrt()`
        // NaN and poison the result. Cap it at 1. A plain comparison is used
        // rather than `f64::min` so that a NaN `a` still propagates as NaN
        // instead of being silently replaced by 1.
        let a = if a > 1.0 { 1.0 } else { a };
        let c = 2.0 * a.sqrt().atan2((1.0 - a).sqrt());

        EARTH_RADIUS_M * c
    }

    /// Initial bearing toward `other`, in degrees in `[0, 360)` clockwise
    /// from true north.
    ///
    /// When both points coincide the bearing is reported as `0`.
    pub fn bearing_to(&self, other: &GeoPoint) -> f64 {
        let lat1_rad = self.lat.to_radians();
        let lat2_rad = other.lat.to_radians();
        let delta_lon = (other.lon - self.lon).to_radians();

        let y = delta_lon.sin() * lat2_rad.cos();
        let x = lat1_rad.cos() * lat2_rad.sin() - lat1_rad.sin() * lat2_rad.cos() * delta_lon.cos();

        // `atan2` returns a value in (-180°, 180°]; shift and wrap it into
        // the compass range.
        let bearing_rad = y.atan2(x);
        (bearing_rad.to_degrees() + 360.0) % 360.0
    }

    /// Whether this point falls inside the closed lat/lon rectangle
    /// `[min_lat, max_lat] × [min_lon, max_lon]`.
    ///
    /// The test is a plain per-axis comparison. A rectangle given with
    /// `min_lon > max_lon` (for example one meant to cross the antimeridian)
    /// therefore contains no points.
    pub fn within_bounds(&self, min_lat: f64, max_lat: f64, min_lon: f64, max_lon: f64) -> bool {
        self.lat >= min_lat && self.lat <= max_lat && self.lon >= min_lon && self.lon <= max_lon
    }
}

/// A 3D point in Earth-Centered Earth-Fixed (ECEF) Cartesian coordinates.
///
/// All three components are in meters, with the origin at the Earth's
/// center of mass. Suitable for true 3D geospatial queries (drone
/// proximity, satellite tracking, indoor 3D positioning) where a 2D
/// `GeoPoint` would lose the altitude dimension or wrap incorrectly near
/// the poles.
///
/// The fields are public and no range is imposed on them; see
/// [`GeoEcefPoint::new`] for the rationale.
#[derive(
    Debug, Clone, Copy, PartialEq, Serialize, Deserialize, Archive, RkyvSerialize, RkyvDeserialize,
)]
pub struct GeoEcefPoint {
    /// X axis: meters along the equatorial plane through 0° longitude.
    pub x: f64,
    /// Y axis: meters along the equatorial plane through 90°E longitude.
    pub y: f64,
    /// Z axis: meters along the Earth's rotation axis toward the North Pole.
    pub z: f64,
}

impl GeoEcefPoint {
    /// Construct a `GeoEcefPoint` from raw Cartesian components.
    ///
    /// No range validation is performed: ECEF coordinates have no fixed
    /// upper bound (a satellite is a valid 3D point well outside the
    /// surface), and the natural sentinels `±INFINITY` are accepted as
    /// "unbounded" markers in queries.
    #[inline]
    pub fn new(x: f64, y: f64, z: f64) -> Self {
        GeoEcefPoint { x, y, z }
    }
}

/// The unified value type for fields in a document.
///
/// This enum merges the concepts of `FieldValue` (from Lexical Index) and
/// `VectorValue` (from Vector Index).
///
/// Values can be built directly from the variants or through the `From`
/// conversions defined in this module, and read back with the `as_*`
/// accessors on this type.
#[derive(
    Debug, Clone, PartialEq, Serialize, Deserialize, Archive, RkyvSerialize, RkyvDeserialize,
)]
pub enum DataValue {
    // --- Primitive Types ---
    /// A value-less marker; carries no payload.
    Null,
    /// A boolean.
    Bool(bool),
    /// A single 64-bit signed integer.
    Int64(i64),
    /// A single 64-bit floating-point number.
    Float64(f64),

    // --- Complex / Searchable Types ---
    /// Text content. Whether this is tokenized or treated as a keyword
    /// is determined by the schema's [`FieldOption`](crate::lexical::core::field::FieldOption)
    /// and the configured [`Analyzer`](crate::analysis::analyzer::analyzer::Analyzer).
    Text(String),

    /// Binary content (image, audio, etc.) to be embedded.
    /// Contains the raw bytes and an optional MIME type.
    Bytes(Vec<u8>, Option<String>),

    /// Pre-computed vector.
    Vector(Vec<f32>),

    /// Date and time in UTC.
    ///
    /// Archived by `rkyv` as a microsecond timestamp via [`MicroSeconds`].
    DateTime(#[rkyv(with = MicroSeconds)] chrono::DateTime<chrono::Utc>),

    /// 2D geographical point (WGS84 latitude / longitude).
    Geo(GeoPoint),

    /// 3D Earth-Centered Earth-Fixed (ECEF) Cartesian point in meters.
    GeoEcef(GeoEcefPoint),

    /// Multi-valued 64-bit signed integers.
    ///
    /// Used by fields declared with
    /// [`IntegerOption::multi_valued`](crate::lexical::core::field::IntegerOption::multi_valued)
    /// set to `true`. Range queries match a document if **any** value in
    /// the array satisfies the predicate (Lucene-style "any match"
    /// semantics with constant scoring).
    Int64Array(Vec<i64>),

    /// Multi-valued 64-bit floating-point numbers.
    ///
    /// Used by fields declared with
    /// [`FloatOption::multi_valued`](crate::lexical::core::field::FloatOption::multi_valued)
    /// set to `true`. Range queries match a document if **any** value in
    /// the array satisfies the predicate (Lucene-style "any match"
    /// semantics with constant scoring).
    Float64Array(Vec<f64>),
}

// Typed accessors. Each one returns `Some` only for its own variant and
// `None` for every other variant; no cross-type coercion is attempted
// (e.g. `as_float` on an `Int64` is `None`).
impl DataValue {
    /// Borrow the string held by a `Text` value.
    pub fn as_text(&self) -> Option<&str> {
        if let Self::Text(text) = self {
            Some(text.as_str())
        } else {
            None
        }
    }

    /// Copy out the integer held by an `Int64` value.
    pub fn as_integer(&self) -> Option<i64> {
        if let Self::Int64(number) = self {
            Some(*number)
        } else {
            None
        }
    }

    /// Copy out the float held by a `Float64` value.
    pub fn as_float(&self) -> Option<f64> {
        if let Self::Float64(number) = self {
            Some(*number)
        } else {
            None
        }
    }

    /// Copy out the flag held by a `Bool` value.
    pub fn as_boolean(&self) -> Option<bool> {
        if let Self::Bool(flag) = self {
            Some(*flag)
        } else {
            None
        }
    }

    /// Copy out the timestamp held by a `DateTime` value.
    pub fn as_datetime(&self) -> Option<chrono::DateTime<chrono::Utc>> {
        if let Self::DateTime(timestamp) = self {
            Some(*timestamp)
        } else {
            None
        }
    }

    /// Borrow the components held by a `Vector` value.
    pub fn as_vector(&self) -> Option<&Vec<f32>> {
        if let Self::Vector(components) = self {
            Some(components)
        } else {
            None
        }
    }

    /// Borrow the raw bytes held by a `Bytes` value. The MIME type stored
    /// alongside them is not exposed by this accessor.
    pub fn as_bytes(&self) -> Option<&[u8]> {
        if let Self::Bytes(raw, _mime) = self {
            Some(raw.as_slice())
        } else {
            None
        }
    }

    /// Copy out the point held by a `Geo` value.
    pub fn as_geo(&self) -> Option<GeoPoint> {
        if let Self::Geo(point) = self {
            Some(*point)
        } else {
            None
        }
    }

    /// Copy out the point held by a `GeoEcef` value.
    pub fn as_geo_ecef(&self) -> Option<GeoEcefPoint> {
        if let Self::GeoEcef(point) = self {
            Some(*point)
        } else {
            None
        }
    }

    /// Borrow the integers held by an `Int64Array` value.
    pub fn as_int64_array(&self) -> Option<&[i64]> {
        if let Self::Int64Array(values) = self {
            Some(values.as_slice())
        } else {
            None
        }
    }

    /// Borrow the floats held by a `Float64Array` value.
    pub fn as_float64_array(&self) -> Option<&[f64]> {
        if let Self::Float64Array(values) = self {
            Some(values.as_slice())
        } else {
            None
        }
    }
}

// --- Conversions ---

impl From<String> for DataValue {
    fn from(v: String) -> Self {
        DataValue::Text(v)
    }
}

impl From<&str> for DataValue {
    fn from(v: &str) -> Self {
        DataValue::Text(v.to_string())
    }
}

impl From<i64> for DataValue {
    fn from(v: i64) -> Self {
        DataValue::Int64(v)
    }
}

impl From<i32> for DataValue {
    fn from(v: i32) -> Self {
        DataValue::Int64(v as i64)
    }
}

impl From<f64> for DataValue {
    fn from(v: f64) -> Self {
        DataValue::Float64(v)
    }
}

impl From<f32> for DataValue {
    fn from(v: f32) -> Self {
        DataValue::Float64(v as f64)
    }
}

impl From<bool> for DataValue {
    fn from(v: bool) -> Self {
        DataValue::Bool(v)
    }
}

/// A UTC timestamp becomes a `DateTime` value.
impl From<chrono::DateTime<chrono::Utc>> for DataValue {
    fn from(timestamp: chrono::DateTime<chrono::Utc>) -> Self {
        Self::DateTime(timestamp)
    }
}

impl From<Vec<f32>> for DataValue {
    fn from(v: Vec<f32>) -> Self {
        DataValue::Vector(v)
    }
}

impl From<Vec<i64>> for DataValue {
    fn from(v: Vec<i64>) -> Self {
        DataValue::Int64Array(v)
    }
}

impl From<Vec<f64>> for DataValue {
    fn from(v: Vec<f64>) -> Self {
        DataValue::Float64Array(v)
    }
}

impl From<GeoPoint> for DataValue {
    fn from(p: GeoPoint) -> Self {
        DataValue::Geo(p)
    }
}

impl From<GeoEcefPoint> for DataValue {
    fn from(p: GeoEcefPoint) -> Self {
        DataValue::GeoEcef(p)
    }
}

/// Unified Document structure.
///
/// A document is a pure data container — a collection of named fields,
/// each containing a [`DataValue`]. Document identity (external ID) is
/// managed by the [`Engine`](crate::Engine), not by the document itself.
///
/// Fields live in a `HashMap`, so each name maps to exactly one value and
/// iteration order over the fields is unspecified. Use
/// [`Document::builder`] for fluent construction.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Document {
    /// Field data, keyed by field name.
    pub fields: HashMap<String, DataValue>,
}

impl Document {
    /// Create a new empty document.
    pub fn new() -> Self {
        Self {
            fields: HashMap::new(),
        }
    }

    /// Get a reference to a field's value.
    pub fn get(&self, name: &str) -> Option<&DataValue> {
        self.fields.get(name)
    }

    /// Alias for [`get`](Self::get), provided for API consistency with field-based
    /// access patterns. Behaves identically to `get`.
    pub fn get_field(&self, name: &str) -> Option<&DataValue> {
        self.get(name)
    }

    /// Check if the document has a field.
    pub fn has_field(&self, name: &str) -> bool {
        self.fields.contains_key(name)
    }

    /// Get all field names.
    pub fn field_names(&self) -> Vec<&str> {
        self.fields.keys().map(|s| s.as_str()).collect()
    }

    /// Get the number of fields.
    pub fn len(&self) -> usize {
        self.fields.len()
    }

    /// Check if the document is empty.
    pub fn is_empty(&self) -> bool {
        self.fields.is_empty()
    }

    pub fn builder() -> DocumentBuilder {
        DocumentBuilder::default()
    }
}

impl Default for Document {
    fn default() -> Self {
        Self::new()
    }
}

/// Fluent builder for [`Document`].
///
/// Obtain one with [`Document::builder`] or `DocumentBuilder::default()`,
/// chain the `add_*` methods, and finish with `build`. The builder starts
/// with no fields.
// `Debug` is derived so the builder can be logged or inspected like the
// other public types in this module.
#[derive(Debug, Default)]
pub struct DocumentBuilder {
    /// Fields accumulated so far, keyed by field name.
    fields: HashMap<String, DataValue>,
}

impl DocumentBuilder {
    /// Store `value` under `name`, converting both into their stored types.
    ///
    /// Adding a field whose name is already present replaces the earlier
    /// value. Returns the builder so calls can be chained.
    pub fn add_field(self, name: impl Into<String>, value: impl Into<DataValue>) -> Self {
        let Self { mut fields } = self;
        fields.insert(name.into(), value.into());
        Self { fields }
    }

    /// Store a `Text` field.
    pub fn add_text(self, name: impl Into<String>, text: impl Into<String>) -> Self {
        // Convert the name first, then the text, before handing both over.
        let key: String = name.into();
        let value = DataValue::Text(text.into());
        self.add_field(key, value)
    }

    /// Store an `Int64` field.
    pub fn add_integer(self, name: impl Into<String>, value: i64) -> Self {
        self.add_field(name, DataValue::Int64(value))
    }

    /// Store a `Float64` field.
    pub fn add_float(self, name: impl Into<String>, value: f64) -> Self {
        self.add_field(name, DataValue::Float64(value))
    }

    /// Store a `Bool` field.
    pub fn add_boolean(self, name: impl Into<String>, value: bool) -> Self {
        self.add_field(name, DataValue::Bool(value))
    }

    /// Store a `DateTime` field (UTC).
    pub fn add_datetime(self, name: impl Into<String>, value: DateTime<Utc>) -> Self {
        self.add_field(name, DataValue::DateTime(value))
    }

    /// Store a pre-computed `Vector` field.
    pub fn add_vector(self, name: impl Into<String>, vector: Vec<f32>) -> Self {
        self.add_field(name, DataValue::Vector(vector))
    }

    /// Store a 2D geographical field given as `(lat, lon)` degrees.
    ///
    /// The point is created with [`GeoPoint::new`], so the WGS84 ranges are
    /// only debug-asserted. For user-supplied coordinates that need real
    /// validation, build the point with `GeoPoint::try_new(...)?` and pass
    /// it to [`add_field`](Self::add_field) instead.
    pub fn add_geo(self, name: impl Into<String>, lat: f64, lon: f64) -> Self {
        let point = GeoPoint::new(lat, lon);
        self.add_field(name, DataValue::Geo(point))
    }

    /// Store a 3D Earth-Centered Earth-Fixed (ECEF) Cartesian field given
    /// as raw `(x, y, z)` meters.
    pub fn add_geo_ecef(self, name: impl Into<String>, x: f64, y: f64, z: f64) -> Self {
        let point = GeoEcefPoint::new(x, y, z);
        self.add_field(name, DataValue::GeoEcef(point))
    }

    /// Store a multi-valued integer field.
    ///
    /// The matching schema field must be declared with
    /// [`IntegerOption::multi_valued`](crate::lexical::core::field::IntegerOption::multi_valued)
    /// set to `true`. Range queries match if any value satisfies the
    /// predicate.
    pub fn add_int64_array(self, name: impl Into<String>, values: Vec<i64>) -> Self {
        self.add_field(name, DataValue::Int64Array(values))
    }

    /// Store a multi-valued float field.
    ///
    /// The matching schema field must be declared with
    /// [`FloatOption::multi_valued`](crate::lexical::core::field::FloatOption::multi_valued)
    /// set to `true`. Range queries match if any value satisfies the
    /// predicate.
    pub fn add_float64_array(self, name: impl Into<String>, values: Vec<f64>) -> Self {
        self.add_field(name, DataValue::Float64Array(values))
    }

    /// Store a binary field whose MIME type is `None`.
    ///
    /// When a MIME type is required (e.g. for multimodal embedding), call
    /// [`add_field`](Self::add_field) with
    /// `DataValue::Bytes(data, Some(mime))` instead.
    pub fn add_bytes(self, name: impl Into<String>, data: Vec<u8>) -> Self {
        self.add_field(name, DataValue::Bytes(data, None))
    }

    /// Finish building and return the [`Document`] holding every field
    /// added so far.
    pub fn build(self) -> Document {
        let Self { fields } = self;
        Document { fields }
    }
}