milli-core 1.15.1

Meilisearch HTTP server
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
#![allow(clippy::type_complexity)]

/// Global allocator backed by mimalloc.
///
/// Because of the two `cfg` attributes below, it is only installed for test builds
/// on non-Windows targets.
#[cfg(not(windows))]
#[cfg(test)]
#[global_allocator]
pub static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;

#[macro_use]
pub mod documents;

mod asc_desc;
mod attribute_patterns;
mod criterion;
pub mod database_stats;
pub mod disabled_typos_terms;
mod error;
mod external_documents_ids;
pub mod facet;
mod fields_ids_map;
mod filter_parser;
mod filterable_attributes_rules;
mod flatten_serde_json;
pub mod heed_codec;
pub mod index;
mod json_depth_checker;
mod localized_attributes_rules;
pub mod order_by_map;
pub mod prompt;
pub mod proximity;
pub mod score_details;
mod search;
mod thread_pool_no_abort;
pub mod update;
pub mod vector;

#[cfg(test)]
#[macro_use]
pub mod snapshot_tests;
pub mod constants;
mod fieldids_weights_map;
pub mod progress;

use std::collections::{BTreeMap, HashMap};
use std::convert::{TryFrom, TryInto};
use std::fmt;
use std::hash::BuildHasherDefault;

use charabia::normalizer::{CharNormalizer, CompatibilityDecompositionNormalizer};
pub use filter_parser::{Condition, FilterCondition, Span, Token};
use fxhash::{FxHasher32, FxHasher64};
pub use grenad::CompressionType;
pub use search::new::{
    execute_search, filtered_universe, DefaultSearchLogger, GeoSortStrategy, SearchContext,
    SearchLogger, VisualSearchLogger,
};
use serde_json::Value;
pub use thread_pool_no_abort::{PanicCatched, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder};
pub use {charabia as tokenizer, heed, rhai};

pub use self::asc_desc::{AscDesc, AscDescError, Member, SortError};
pub use self::attribute_patterns::AttributePatterns;
pub use self::attribute_patterns::PatternMatch;
pub use self::criterion::{default_criteria, Criterion, CriterionError};
pub use self::error::{
    Error, FieldIdMapMissingEntry, InternalError, SerializationError, UserError,
};
pub use self::external_documents_ids::ExternalDocumentsIds;
pub use self::fieldids_weights_map::FieldidsWeightsMap;
pub use self::fields_ids_map::{FieldsIdsMap, GlobalFieldsIdsMap};
pub use self::filterable_attributes_rules::{
    FilterFeatures, FilterableAttributesFeatures, FilterableAttributesPatterns,
    FilterableAttributesRule,
};
pub use self::heed_codec::{
    BEU16StrCodec, BEU32StrCodec, BoRoaringBitmapCodec, BoRoaringBitmapLenCodec,
    CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, FieldIdWordCountCodec, ObkvCodec,
    RoaringBitmapCodec, RoaringBitmapLenCodec, StrBEU32Codec, U8StrStrCodec,
    UncheckedU8StrStrCodec,
};
pub use self::index::Index;
pub use self::localized_attributes_rules::LocalizedAttributesRule;
pub use self::search::facet::{FacetValueHit, SearchForFacetValues};
pub use self::search::similar::Similar;
pub use self::search::{
    FacetDistribution, Filter, FormatOptions, MatchBounds, MatcherBuilder, MatchingWords, OrderBy,
    Search, SearchResult, SemanticSearch, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET,
};
pub use self::update::ChannelCongestion;

pub use arroy;

/// Crate-wide result type: a `std::result::Result` whose error type is [`error::Error`].
pub type Result<T> = std::result::Result<T, error::Error>;

// NOTE(review): presumably the identifier of an attribute; not used in this part of the file — confirm at call sites.
pub type Attribute = u32;
/// A `u16` stored through heed with the big-endian (`BE`) byte order.
pub type BEU16 = heed::types::U16<heed::byteorder::BE>;
/// A `u32` stored through heed with the big-endian (`BE`) byte order.
pub type BEU32 = heed::types::U32<heed::byteorder::BE>;
/// A `u64` stored through heed with the big-endian (`BE`) byte order.
pub type BEU64 = heed::types::U64<heed::byteorder::BE>;
/// The identifier of a document (see how it is attached to a [`GeoPoint`]).
pub type DocumentId = u32;
/// A `HashMap` hashed with `FxHasher32` instead of the default hasher.
pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;
/// A `HashMap` hashed with `FxHasher64` instead of the default hasher.
pub type FastMap8<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher64>>;
// NOTE(review): presumably maps a field name to the number of documents containing it — confirm.
pub type FieldDistribution = BTreeMap<String, u64>;
/// The identifier of a field; it occupies the upper 16 bits of an absolute [`Position`].
pub type FieldId = u16;
// NOTE(review): presumably the weight associated with a field (cf. `FieldidsWeightsMap`) — confirm.
pub type Weight = u16;
/// A JSON object: a map from `String` keys to `serde_json::Value`s.
pub type Object = serde_json::Map<String, serde_json::Value>;
/// An absolute word position: a [`FieldId`] in the upper 16 bits and a
/// [`RelativePosition`] in the lower 16 bits (see `absolute_from_relative_position`).
pub type Position = u32;
/// The position of a word relative to the start of its attribute.
pub type RelativePosition = u16;
/// A `SmallString` backed by a 32-byte array.
pub type SmallString32 = smallstr::SmallString<[u8; 32]>;
/// A `SmallString` backed by a 16-byte array.
pub type Prefix = smallstr::SmallString<[u8; 16]>;
/// A `SmallVec` backed by an array of 16 elements.
pub type SmallVec16<T> = smallvec::SmallVec<[T; 16]>;
/// A `SmallVec` backed by an array of 32 elements.
pub type SmallVec32<T> = smallvec::SmallVec<[T; 32]>;
/// A `SmallVec` backed by an array of 8 elements.
pub type SmallVec8<T> = smallvec::SmallVec<[T; 8]>;

/// A GeoPoint is a point in the three-dimensional Cartesian space, called xyz_point in the code.
/// Its metadata is a tuple composed of 1. the DocumentId of the associated document and
/// 2. the original point expressed in terms of latitude and longitude.
pub type GeoPoint = rstar::primitives::GeomWithData<[f64; 3], (DocumentId, [f64; 2])>;

/// The maximum length a LMDB key can be.
///
/// Note that the actual allowed length is a little bit higher, but
/// we keep a margin of safety.
const MAX_LMDB_KEY_LENGTH: usize = 500;

/// The maximum length a field value can be when inserted in an LMDB key.
///
/// This number is determined by the keys of the different facet databases
/// and adding a margin of safety: it is `MAX_LMDB_KEY_LENGTH - 32`, i.e. 468.
pub const MAX_FACET_VALUE_LENGTH: usize = MAX_LMDB_KEY_LENGTH - 32;

// NOTE(review): presumably halved so that two words can fit in a single key — confirm.
/// The maximum length a word can be: half of `MAX_LMDB_KEY_LENGTH`, i.e. 250.
pub const MAX_WORD_LENGTH: usize = MAX_LMDB_KEY_LENGTH / 2;

/// The number of distinct relative positions inside one attribute: `u16::MAX + 1`, i.e. 65536,
/// which is the number of values a [`RelativePosition`] (`u16`) can take.
pub const MAX_POSITION_PER_ATTRIBUTE: u32 = u16::MAX as u32 + 1;

/// A time budget measured from the moment the value is created.
///
/// [`TimeBudget::exceeded`] reports whether strictly more than `budget` has elapsed
/// since `started_at`.
#[derive(Clone)]
pub struct TimeBudget {
    /// The instant at which the budget was created.
    started_at: std::time::Instant,
    /// The total duration allowed, counted from `started_at`.
    budget: std::time::Duration,

    /// When testing the time budget, ensuring we did more than one iteration of the bucket sort can be useful.
    /// But to avoid being flaky, the only option is to add the ability to stop after a specific number of calls instead of a `Duration`.
    ///
    /// Holds the shared counter of calls to `exceeded` and the number of calls
    /// after which the budget is reported as exceeded.
    #[cfg(test)]
    stop_after: Option<(std::sync::Arc<std::sync::atomic::AtomicUsize>, usize)>,
}

impl fmt::Debug for TimeBudget {
    /// Prints the start instant, the total budget and the time left.
    ///
    /// The time left is clamped to zero once the budget is exceeded.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // `Duration - Duration` panics on underflow, which is exactly what happens
        // once the budget is exceeded: saturate to zero instead of panicking.
        let left = self.budget.saturating_sub(self.started_at.elapsed());

        f.debug_struct("TimeBudget")
            .field("started_at", &self.started_at)
            .field("budget", &self.budget)
            .field("left", &left)
            .finish()
    }
}

impl Default for TimeBudget {
    /// Returns a budget of 1500 milliseconds starting now.
    fn default() -> Self {
        Self::new(std::time::Duration::from_millis(1500))
    }
}

impl TimeBudget {
    /// Creates a budget of the given duration that starts being consumed immediately.
    pub fn new(budget: std::time::Duration) -> Self {
        Self {
            started_at: std::time::Instant::now(),
            budget,

            #[cfg(test)]
            stop_after: None,
        }
    }

    /// Creates a budget of `u64::MAX` seconds, i.e. one that is never exceeded in practice.
    pub fn max() -> Self {
        Self::new(std::time::Duration::from_secs(u64::MAX))
    }

    /// Makes `exceeded` ignore the clock and return `true` once it has already been
    /// called `stop_after` times (test builds only).
    #[cfg(test)]
    pub fn with_stop_after(mut self, stop_after: usize) -> Self {
        use std::sync::atomic::AtomicUsize;
        use std::sync::Arc;

        self.stop_after = Some((Arc::new(AtomicUsize::new(0)), stop_after));
        self
    }

    /// Returns `true` when strictly more than the budget has elapsed since creation.
    ///
    /// In test builds, when a call count was set with `with_stop_after`, the clock is
    /// ignored: every call increments the shared counter and the budget is exceeded
    /// once the number of previous calls reaches the configured count.
    pub fn exceeded(&self) -> bool {
        #[cfg(test)]
        if let Some((calls, stop_after)) = &self.stop_after {
            // if a number has been specified then we ignore entirely the time budget
            let previous_calls = calls.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
            return previous_calls >= *stop_after;
        }

        self.started_at.elapsed() > self.budget
    }
}

/// Splits an absolute word position into its two components.
///
/// The upper 16 bits are returned as the field id of the attribute and the
/// lower 16 bits as the relative position inside that attribute.
pub fn relative_from_absolute_position(absolute: Position) -> (FieldId, RelativePosition) {
    let field_id = (absolute >> 16) as FieldId;
    let relative = (absolute & 0xFFFF) as RelativePosition;

    (field_id, relative)
}

// Compute the absolute word position with the field id of the attribute and relative position in the attribute.
pub fn absolute_from_relative_position(field_id: FieldId, relative: RelativePosition) -> Position {
    ((field_id as u32) << 16) | (relative as u32)
}
// TODO: this is wrong, but will do for now
/// Compute the "bucketed" relative position from the relative position in the field.
///
/// In a bucketed position, the accuracy of the relative position is reduced exponentially as it gets larger:
///
/// - the relative positions below 16 are kept intact,
/// - the relative positions in `16..24` all become equal to 24,
/// - any other relative position is rounded up to the smallest power of 2 that is
///   greater than or equal to it (a power of 2 maps to itself), and saturates to
///   `u16::MAX` when that power of 2 (65536) does not fit in a `u16`.
pub fn bucketed_position(relative: u16) -> u16 {
    match relative {
        // The first few relative positions are kept intact.
        0..=15 => relative,
        // Relative positions between 16 and 24 (excluded) all become equal to 24.
        16..=23 => 24,
        // Integer arithmetic only: the result does not depend on the precision of the
        // floating-point `log2`, and the saturation above 32768 is explicit.
        _ => relative.checked_next_power_of_two().unwrap_or(u16::MAX),
    }
}

/// Transform a raw obkv store into a JSON Object.
pub fn obkv_to_json(
    displayed_fields: &[FieldId],
    fields_ids_map: &FieldsIdsMap,
    obkv: &obkv::KvReaderU16,
) -> Result<Object> {
    displayed_fields
        .iter()
        .copied()
        .flat_map(|id| obkv.get(id).map(|value| (id, value)))
        .map(|(id, value)| {
            let name = fields_ids_map.name(id).ok_or(error::FieldIdMapMissingEntry::FieldId {
                field_id: id,
                process: "obkv_to_json",
            })?;
            let value = serde_json::from_slice(value).map_err(error::InternalError::SerdeJson)?;
            Ok((name.to_owned(), value))
        })
        .collect()
}

/// Transform every field of a raw obkv store into a JSON Object.
///
/// Gathers all the field ids stored in `obkv` and hands them to [`obkv_to_json`].
pub fn all_obkv_to_json(obkv: &obkv::KvReaderU16, fields_ids_map: &FieldsIdsMap) -> Result<Object> {
    let field_ids: Vec<_> = obkv.iter().map(|(field_id, _value)| field_id).collect();

    obkv_to_json(&field_ids, fields_ids_map, obkv)
}

/// Transform a JSON value into a string that can be indexed.
///
/// Scalars are written as-is, array items are each followed by `". "` and object
/// members are written as `"key: value. "`. `null` values are skipped, and `None`
/// is returned when nothing could be written at all.
pub fn json_to_string(value: &Value) -> Option<String> {
    /// Appends the text of `value` to `output` and tells whether something was written for it.
    fn inner(value: &Value, output: &mut String) -> bool {
        use std::fmt::Write;

        match value {
            Value::Null => false,
            Value::Bool(boolean) => {
                output.push_str(if *boolean { "true" } else { "false" });
                true
            }
            Value::Number(number) => write!(output, "{}", number).is_ok(),
            Value::String(string) => {
                output.push_str(string);
                true
            }
            Value::Array(items) => {
                let mut wrote_any = false;
                for item in items {
                    if inner(item, output) {
                        output.push_str(". ");
                        wrote_any = true;
                    }
                }
                // at least one item must have been written
                wrote_any
            }
            Value::Object(members) => {
                let mut wrote_any = false;
                // Scratch space for one "key: value. " pair, reused across members.
                let mut pair = String::new();
                for (key, member) in members {
                    pair.clear();
                    pair.push_str(key);
                    pair.push_str(": ");
                    if inner(member, &mut pair) {
                        pair.push_str(". ");
                        // The pair only reaches the output once we know
                        // that its value could be written.
                        output.push_str(&pair);
                        wrote_any = true;
                    }
                }
                // at least one member must have been written
                wrote_any
            }
        }
    }

    let mut text = String::new();
    if !inner(value, &mut text) {
        return None;
    }
    Some(text)
}

/// Splits `slice` in two at `mid`: the first part holds the indices `[0, mid)`
/// and the second one the rest.
///
/// Returns `None` when `mid` is greater than the length of the slice.
fn try_split_at<T>(slice: &[T], mid: usize) -> Option<(&[T], &[T])> {
    if mid > slice.len() {
        return None;
    }

    Some(slice.split_at(mid))
}

/// Divides one slice into an array and the tail at an index,
/// returns `None` if `N` is out of bounds.
fn try_split_array_at<T, const N: usize>(slice: &[T]) -> Option<([T; N], &[T])>
where
    [T; N]: for<'a> TryFrom<&'a [T]>,
{
    let (head, tail) = try_split_at(slice, N)?;
    let head = head.try_into().ok()?;
    Some((head, tail))
}

/// Return the haversine distance between two points, in meters.
///
/// Each point is made of two `f64`: the latitude first, then the longitude.
pub fn distance_between_two_points(a: &[f64; 2], b: &[f64; 2]) -> f64 {
    let [lat_a, lng_a] = *a;
    let [lat_b, lng_b] = *b;

    let origin = geoutils::Location::new(lat_a, lng_a);
    let destination = geoutils::Location::new(lat_b, lng_b);

    origin.haversine_distance_to(&destination).meters()
}

/// Convert a point expressed in terms of latitude and longitude (in degrees) to a
/// point in the cartesian coordinate system expressed in terms of x, y and z.
///
/// With both angles converted to radians:
/// `x = cos(lat) * cos(lng)`, `y = cos(lat) * sin(lng)` and `z = sin(lat)`.
pub fn lat_lng_to_xyz(coord: &[f64; 2]) -> [f64; 3] {
    let lat = coord[0].to_radians();
    let lng = coord[1].to_radians();
    let cos_lat = lat.cos();

    [cos_lat * lng.cos(), cos_lat * lng.sin(), lat.sin()]
}

/// Returns `true` if the field matches at least one of the faceted fields.
/// See the function [`is_faceted_by`] below to see what “matching” means.
pub fn is_faceted(field: &str, faceted_fields: impl IntoIterator<Item = impl AsRef<str>>) -> bool {
    for facet in faceted_fields {
        if is_faceted_by(field, facet.as_ref()) {
            return true;
        }
    }

    false
}

/// Returns `true` if the field matches the facet, i.e. when the field is the facet
/// itself or when it starts with the facet immediately followed by a `.`.
/// ```
/// use milli_core::is_faceted_by;
/// // -- the valid basics
/// assert!(is_faceted_by("animaux", "animaux"));
/// assert!(is_faceted_by("animaux.chien", "animaux"));
/// assert!(is_faceted_by("animaux.chien.race.bouvier bernois.fourrure.couleur", "animaux"));
/// assert!(is_faceted_by("animaux.chien.race.bouvier bernois.fourrure.couleur", "animaux.chien"));
/// assert!(is_faceted_by("animaux.chien.race.bouvier bernois.fourrure.couleur", "animaux.chien.race.bouvier bernois"));
/// assert!(is_faceted_by("animaux.chien.race.bouvier bernois.fourrure.couleur", "animaux.chien.race.bouvier bernois.fourrure"));
/// assert!(is_faceted_by("animaux.chien.race.bouvier bernois.fourrure.couleur", "animaux.chien.race.bouvier bernois.fourrure.couleur"));
///
/// // -- the wrongs
/// assert!(!is_faceted_by("chien", "chat"));
/// assert!(!is_faceted_by("animaux", "animaux.chien"));
/// assert!(!is_faceted_by("animaux.chien", "animaux.chat"));
///
/// // -- the strange edge cases
/// assert!(!is_faceted_by("animaux.chien", "anima"));
/// assert!(!is_faceted_by("animaux.chien", "animau"));
/// assert!(!is_faceted_by("animaux.chien", "animaux."));
/// assert!(!is_faceted_by("animaux.chien", "animaux.c"));
/// assert!(!is_faceted_by("animaux.chien", "animaux.ch"));
/// assert!(!is_faceted_by("animaux.chien", "animaux.chi"));
/// assert!(!is_faceted_by("animaux.chien", "animaux.chie"));
/// ```
pub fn is_faceted_by(field: &str, facet: &str) -> bool {
    match field.strip_prefix(facet) {
        // What follows the facet must be either nothing or a nested path.
        Some(rest) => rest.is_empty() || rest.starts_with('.'),
        None => false,
    }
}

/// Normalizes a facet value: the surrounding whitespace is trimmed, the result goes
/// through the `CompatibilityDecompositionNormalizer` and is finally lowercased.
pub fn normalize_facet(original: &str) -> String {
    let trimmed = original.trim();
    let normalized = CompatibilityDecompositionNormalizer.normalize_str(trimmed);

    normalized.to_lowercase()
}

#[cfg(test)]
mod tests {
    use serde_json::json;

    use super::*;

    /// Pairs of an absolute position and its (field id, relative position) decomposition,
    /// shared by the two conversion tests.
    const POSITION_CASES: [(Position, (FieldId, RelativePosition)); 7] = [
        (0x00000000, (0x0000, 0x0000)),
        (0x0000FFFF, (0x0000, 0xFFFF)),
        (0xFFFF0000, (0xFFFF, 0x0000)),
        (0xFF00FF00, (0xFF00, 0xFF00)),
        (0xFF0000FF, (0xFF00, 0x00FF)),
        (0x12345678, (0x1234, 0x5678)),
        (0xFFFFFFFF, (0xFFFF, 0xFFFF)),
    ];

    /// `null` members are left out of the generated string.
    #[test]
    fn json_to_string_object() {
        let document = json!({
            "name": "John Doe",
            "age": 43,
            "not_there": null,
        });

        let rendered = json_to_string(&document);
        assert_eq!(rendered.as_deref(), Some("name: John Doe. age: 43. "));
    }

    /// Nested objects and arrays are flattened into the same string.
    #[test]
    fn json_to_string_array() {
        let document = json!([
            { "name": "John Doe" },
            43,
            "hello",
            [ "I", "am", "fine" ],
            null,
        ]);

        let rendered = json_to_string(&document);
        // Two consecutive points (.) are not an issue: the distance
        // of hard separators is clamped to 8 anyway.
        assert_eq!(rendered.as_deref(), Some("name: John Doe. . 43. hello. I. am. fine. . "));
    }

    #[test]
    fn test_relative_position_conversion() {
        for (absolute, expected) in POSITION_CASES {
            assert_eq!(expected, relative_from_absolute_position(absolute));
        }
    }

    #[test]
    fn test_absolute_position_conversion() {
        for (expected, (field_id, relative)) in POSITION_CASES {
            assert_eq!(expected, absolute_from_relative_position(field_id, relative));
        }
    }

    /// Every field stored in the obkv must end up in the JSON object under its name.
    #[test]
    fn test_all_obkv_to_json() {
        let mut fields_ids_map = FieldsIdsMap::new();
        let first_id = fields_ids_map.insert("field1").unwrap();
        let second_id = fields_ids_map.insert("field2").unwrap();

        let mut writer = obkv::KvWriterU16::memory();
        for (id, raw_value) in [(first_id, b"1234"), (second_id, b"4321")] {
            writer.insert(id, raw_value).unwrap();
        }
        let contents = writer.into_inner().unwrap();
        let reader = obkv::KvReaderU16::from_slice(&contents);

        let expected = json!({
            "field1": 1234,
            "field2": 4321,
        });
        let actual = all_obkv_to_json(reader, &fields_ids_map).unwrap();

        assert_eq!(Some(&actual), expected.as_object());
    }
}