milli_core/
lib.rs

1#![allow(clippy::type_complexity)]
2
3#[cfg(not(windows))]
4#[cfg(test)]
5#[global_allocator]
6pub static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;
7
8#[macro_use]
9pub mod documents;
10
11mod asc_desc;
12mod attribute_patterns;
13mod criterion;
14pub mod database_stats;
15pub mod disabled_typos_terms;
16mod error;
17mod external_documents_ids;
18pub mod facet;
19mod fields_ids_map;
20mod filter_parser;
21mod filterable_attributes_rules;
22mod flatten_serde_json;
23pub mod heed_codec;
24pub mod index;
25mod json_depth_checker;
26mod localized_attributes_rules;
27pub mod order_by_map;
28pub mod prompt;
29pub mod proximity;
30pub mod score_details;
31mod search;
32mod thread_pool_no_abort;
33pub mod update;
34pub mod vector;
35
36#[cfg(test)]
37#[macro_use]
38pub mod snapshot_tests;
39pub mod constants;
40mod fieldids_weights_map;
41pub mod progress;
42
43use std::collections::{BTreeMap, HashMap};
44use std::convert::{TryFrom, TryInto};
45use std::fmt;
46use std::hash::BuildHasherDefault;
47
48use charabia::normalizer::{CharNormalizer, CompatibilityDecompositionNormalizer};
49pub use filter_parser::{Condition, FilterCondition, Span, Token};
50use fxhash::{FxHasher32, FxHasher64};
51pub use grenad::CompressionType;
52pub use search::new::{
53    execute_search, filtered_universe, DefaultSearchLogger, GeoSortStrategy, SearchContext,
54    SearchLogger, VisualSearchLogger,
55};
56use serde_json::Value;
57pub use thread_pool_no_abort::{PanicCatched, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder};
58pub use {charabia as tokenizer, heed, rhai};
59
60pub use self::asc_desc::{AscDesc, AscDescError, Member, SortError};
61pub use self::attribute_patterns::AttributePatterns;
62pub use self::attribute_patterns::PatternMatch;
63pub use self::criterion::{default_criteria, Criterion, CriterionError};
64pub use self::error::{
65    Error, FieldIdMapMissingEntry, InternalError, SerializationError, UserError,
66};
67pub use self::external_documents_ids::ExternalDocumentsIds;
68pub use self::fieldids_weights_map::FieldidsWeightsMap;
69pub use self::fields_ids_map::{FieldsIdsMap, GlobalFieldsIdsMap};
70pub use self::filterable_attributes_rules::{
71    FilterFeatures, FilterableAttributesFeatures, FilterableAttributesPatterns,
72    FilterableAttributesRule,
73};
74pub use self::heed_codec::{
75    BEU16StrCodec, BEU32StrCodec, BoRoaringBitmapCodec, BoRoaringBitmapLenCodec,
76    CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, FieldIdWordCountCodec, ObkvCodec,
77    RoaringBitmapCodec, RoaringBitmapLenCodec, StrBEU32Codec, U8StrStrCodec,
78    UncheckedU8StrStrCodec,
79};
80pub use self::index::Index;
81pub use self::localized_attributes_rules::LocalizedAttributesRule;
82pub use self::search::facet::{FacetValueHit, SearchForFacetValues};
83pub use self::search::similar::Similar;
84pub use self::search::{
85    FacetDistribution, Filter, FormatOptions, MatchBounds, MatcherBuilder, MatchingWords, OrderBy,
86    Search, SearchResult, SemanticSearch, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET,
87};
88pub use self::update::ChannelCongestion;
89
90pub use arroy;
91
92pub type Result<T> = std::result::Result<T, error::Error>;
93
94pub type Attribute = u32;
95pub type BEU16 = heed::types::U16<heed::byteorder::BE>;
96pub type BEU32 = heed::types::U32<heed::byteorder::BE>;
97pub type BEU64 = heed::types::U64<heed::byteorder::BE>;
98pub type DocumentId = u32;
99pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;
100pub type FastMap8<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher64>>;
101pub type FieldDistribution = BTreeMap<String, u64>;
102pub type FieldId = u16;
103pub type Weight = u16;
104pub type Object = serde_json::Map<String, serde_json::Value>;
105pub type Position = u32;
106pub type RelativePosition = u16;
107pub type SmallString32 = smallstr::SmallString<[u8; 32]>;
108pub type Prefix = smallstr::SmallString<[u8; 16]>;
109pub type SmallVec16<T> = smallvec::SmallVec<[T; 16]>;
110pub type SmallVec32<T> = smallvec::SmallVec<[T; 32]>;
111pub type SmallVec8<T> = smallvec::SmallVec<[T; 8]>;
112
113/// A GeoPoint is a point in cartesian plan, called xyz_point in the code. Its metadata
114/// is a tuple composed of 1. the DocumentId of the associated document and 2. the original point
115/// expressed in term of latitude and longitude.
116pub type GeoPoint = rstar::primitives::GeomWithData<[f64; 3], (DocumentId, [f64; 2])>;
117
118/// The maximum length a LMDB key can be.
119///
120/// Note that the actual allowed length is a little bit higher, but
121/// we keep a margin of safety.
122const MAX_LMDB_KEY_LENGTH: usize = 500;
123
124/// The maximum length a field value can be when inserted in an LMDB key.
125///
126/// This number is determined by the keys of the different facet databases
127/// and adding a margin of safety.
128pub const MAX_FACET_VALUE_LENGTH: usize = MAX_LMDB_KEY_LENGTH - 32;
129
130/// The maximum length a word can be
131pub const MAX_WORD_LENGTH: usize = MAX_LMDB_KEY_LENGTH / 2;
132
133pub const MAX_POSITION_PER_ATTRIBUTE: u32 = u16::MAX as u32 + 1;
134
135#[derive(Clone)]
136pub struct TimeBudget {
137    started_at: std::time::Instant,
138    budget: std::time::Duration,
139
140    /// When testing the time budget, ensuring we did more than iteration of the bucket sort can be useful.
141    /// But to avoid being flaky, the only option is to add the ability to stop after a specific number of calls instead of a `Duration`.
142    #[cfg(test)]
143    stop_after: Option<(std::sync::Arc<std::sync::atomic::AtomicUsize>, usize)>,
144}
145
146impl fmt::Debug for TimeBudget {
147    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
148        f.debug_struct("TimeBudget")
149            .field("started_at", &self.started_at)
150            .field("budget", &self.budget)
151            .field("left", &(self.budget - self.started_at.elapsed()))
152            .finish()
153    }
154}
155
156impl Default for TimeBudget {
157    fn default() -> Self {
158        Self::new(std::time::Duration::from_millis(1500))
159    }
160}
161
162impl TimeBudget {
163    pub fn new(budget: std::time::Duration) -> Self {
164        Self {
165            started_at: std::time::Instant::now(),
166            budget,
167
168            #[cfg(test)]
169            stop_after: None,
170        }
171    }
172
173    pub fn max() -> Self {
174        Self::new(std::time::Duration::from_secs(u64::MAX))
175    }
176
177    #[cfg(test)]
178    pub fn with_stop_after(mut self, stop_after: usize) -> Self {
179        use std::sync::atomic::AtomicUsize;
180        use std::sync::Arc;
181
182        self.stop_after = Some((Arc::new(AtomicUsize::new(0)), stop_after));
183        self
184    }
185
186    pub fn exceeded(&self) -> bool {
187        #[cfg(test)]
188        if let Some((current, stop_after)) = &self.stop_after {
189            let current = current.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
190            if current >= *stop_after {
191                return true;
192            } else {
193                // if a number has been specified then we ignore entirely the time budget
194                return false;
195            }
196        }
197
198        self.started_at.elapsed() > self.budget
199    }
200}
201
202// Convert an absolute word position into a relative position.
203// Return the field id of the attribute related to the absolute position
204// and the relative position in the attribute.
205pub fn relative_from_absolute_position(absolute: Position) -> (FieldId, RelativePosition) {
206    ((absolute >> 16) as u16, (absolute & 0xFFFF) as u16)
207}
208
209// Compute the absolute word position with the field id of the attribute and relative position in the attribute.
210pub fn absolute_from_relative_position(field_id: FieldId, relative: RelativePosition) -> Position {
211    ((field_id as u32) << 16) | (relative as u32)
212}
213// TODO: this is wrong, but will do for now
214/// Compute the "bucketed" absolute position from the field id and relative position in the field.
215///
216/// In a bucketed position, the accuracy of the relative position is reduced exponentially as it gets larger.
217pub fn bucketed_position(relative: u16) -> u16 {
218    // The first few relative positions are kept intact.
219    if relative < 16 {
220        relative
221    } else if relative < 24 {
222        // Relative positions between 16 and 24 all become equal to 24
223        24
224    } else {
225        // Then, groups of positions that have the same base-2 logarithm are reduced to
226        // the same relative position: the smallest power of 2 that is greater than them
227        (relative as f64).log2().ceil().exp2() as u16
228    }
229}
230
231/// Transform a raw obkv store into a JSON Object.
232pub fn obkv_to_json(
233    displayed_fields: &[FieldId],
234    fields_ids_map: &FieldsIdsMap,
235    obkv: &obkv::KvReaderU16,
236) -> Result<Object> {
237    displayed_fields
238        .iter()
239        .copied()
240        .flat_map(|id| obkv.get(id).map(|value| (id, value)))
241        .map(|(id, value)| {
242            let name = fields_ids_map.name(id).ok_or(error::FieldIdMapMissingEntry::FieldId {
243                field_id: id,
244                process: "obkv_to_json",
245            })?;
246            let value = serde_json::from_slice(value).map_err(error::InternalError::SerdeJson)?;
247            Ok((name.to_owned(), value))
248        })
249        .collect()
250}
251
252/// Transform every field of a raw obkv store into a JSON Object.
253pub fn all_obkv_to_json(obkv: &obkv::KvReaderU16, fields_ids_map: &FieldsIdsMap) -> Result<Object> {
254    let all_keys = obkv.iter().map(|(k, _v)| k).collect::<Vec<_>>();
255    obkv_to_json(all_keys.as_slice(), fields_ids_map, obkv)
256}
257
258/// Transform a JSON value into a string that can be indexed.
259pub fn json_to_string(value: &Value) -> Option<String> {
260    fn inner(value: &Value, output: &mut String) -> bool {
261        use std::fmt::Write;
262        match value {
263            Value::Null => false,
264            Value::Bool(boolean) => write!(output, "{}", boolean).is_ok(),
265            Value::Number(number) => write!(output, "{}", number).is_ok(),
266            Value::String(string) => write!(output, "{}", string).is_ok(),
267            Value::Array(array) => {
268                let mut count = 0;
269                for value in array {
270                    if inner(value, output) {
271                        output.push_str(". ");
272                        count += 1;
273                    }
274                }
275                // check that at least one value was written
276                count != 0
277            }
278            Value::Object(object) => {
279                let mut buffer = String::new();
280                let mut count = 0;
281                for (key, value) in object {
282                    buffer.clear();
283                    let _ = write!(&mut buffer, "{}: ", key);
284                    if inner(value, &mut buffer) {
285                        buffer.push_str(". ");
286                        // We write the "key: value. " pair only when
287                        // we are sure that the value can be written.
288                        output.push_str(&buffer);
289                        count += 1;
290                    }
291                }
292                // check that at least one value was written
293                count != 0
294            }
295        }
296    }
297
298    let mut string = String::new();
299    if inner(value, &mut string) {
300        Some(string)
301    } else {
302        None
303    }
304}
305
306/// Divides one slice into two at an index, returns `None` if mid is out of bounds.
307fn try_split_at<T>(slice: &[T], mid: usize) -> Option<(&[T], &[T])> {
308    if mid <= slice.len() {
309        Some(slice.split_at(mid))
310    } else {
311        None
312    }
313}
314
315/// Divides one slice into an array and the tail at an index,
316/// returns `None` if `N` is out of bounds.
317fn try_split_array_at<T, const N: usize>(slice: &[T]) -> Option<([T; N], &[T])>
318where
319    [T; N]: for<'a> TryFrom<&'a [T]>,
320{
321    let (head, tail) = try_split_at(slice, N)?;
322    let head = head.try_into().ok()?;
323    Some((head, tail))
324}
325
326/// Return the distance between two points in meters. Each points are composed of two f64,
327/// one latitude and one longitude.
328pub fn distance_between_two_points(a: &[f64; 2], b: &[f64; 2]) -> f64 {
329    let a = geoutils::Location::new(a[0], a[1]);
330    let b = geoutils::Location::new(b[0], b[1]);
331
332    a.haversine_distance_to(&b).meters()
333}
334
335/// Convert a point expressed in terms of latitude and longitude to a point in the
336/// cartesian coordinate expressed in terms of x, y and z.
337pub fn lat_lng_to_xyz(coord: &[f64; 2]) -> [f64; 3] {
338    let [lat, lng] = coord.map(|f| f.to_radians());
339    let x = lat.cos() * lng.cos();
340    let y = lat.cos() * lng.sin();
341    let z = lat.sin();
342
343    [x, y, z]
344}
345
346/// Returns `true` if the field match one of the faceted fields.
347/// See the function [`is_faceted_by`] below to see what “matching” means.
348pub fn is_faceted(field: &str, faceted_fields: impl IntoIterator<Item = impl AsRef<str>>) -> bool {
349    faceted_fields.into_iter().any(|facet| is_faceted_by(field, facet.as_ref()))
350}
351
352/// Returns `true` if the field match the facet.
353/// ```
354/// use milli_core::is_faceted_by;
355/// // -- the valid basics
356/// assert!(is_faceted_by("animaux", "animaux"));
357/// assert!(is_faceted_by("animaux.chien", "animaux"));
358/// assert!(is_faceted_by("animaux.chien.race.bouvier bernois.fourrure.couleur", "animaux"));
359/// assert!(is_faceted_by("animaux.chien.race.bouvier bernois.fourrure.couleur", "animaux.chien"));
360/// assert!(is_faceted_by("animaux.chien.race.bouvier bernois.fourrure.couleur", "animaux.chien.race.bouvier bernois"));
361/// assert!(is_faceted_by("animaux.chien.race.bouvier bernois.fourrure.couleur", "animaux.chien.race.bouvier bernois.fourrure"));
362/// assert!(is_faceted_by("animaux.chien.race.bouvier bernois.fourrure.couleur", "animaux.chien.race.bouvier bernois.fourrure.couleur"));
363///
364/// // -- the wrongs
365/// assert!(!is_faceted_by("chien", "chat"));
366/// assert!(!is_faceted_by("animaux", "animaux.chien"));
367/// assert!(!is_faceted_by("animaux.chien", "animaux.chat"));
368///
369/// // -- the strange edge cases
370/// assert!(!is_faceted_by("animaux.chien", "anima"));
371/// assert!(!is_faceted_by("animaux.chien", "animau"));
372/// assert!(!is_faceted_by("animaux.chien", "animaux."));
373/// assert!(!is_faceted_by("animaux.chien", "animaux.c"));
374/// assert!(!is_faceted_by("animaux.chien", "animaux.ch"));
375/// assert!(!is_faceted_by("animaux.chien", "animaux.chi"));
376/// assert!(!is_faceted_by("animaux.chien", "animaux.chie"));
377/// ```
378pub fn is_faceted_by(field: &str, facet: &str) -> bool {
379    field.starts_with(facet) && field[facet.len()..].chars().next().is_none_or(|c| c == '.')
380}
381
382pub fn normalize_facet(original: &str) -> String {
383    CompatibilityDecompositionNormalizer.normalize_str(original.trim()).to_lowercase()
384}
385
386#[cfg(test)]
387mod tests {
388    use serde_json::json;
389
390    use super::*;
391
392    #[test]
393    fn json_to_string_object() {
394        let value = json!({
395            "name": "John Doe",
396            "age": 43,
397            "not_there": null,
398        });
399
400        let string = json_to_string(&value).unwrap();
401        assert_eq!(string, "name: John Doe. age: 43. ");
402    }
403
404    #[test]
405    fn json_to_string_array() {
406        let value = json!([
407            { "name": "John Doe" },
408            43,
409            "hello",
410            [ "I", "am", "fine" ],
411            null,
412        ]);
413
414        let string = json_to_string(&value).unwrap();
415        // We don't care about having two point (.) after the other as
416        // the distance of hard separators is clamped to 8 anyway.
417        assert_eq!(string, "name: John Doe. . 43. hello. I. am. fine. . ");
418    }
419
420    #[test]
421    fn test_relative_position_conversion() {
422        assert_eq!((0x0000, 0x0000), relative_from_absolute_position(0x00000000));
423        assert_eq!((0x0000, 0xFFFF), relative_from_absolute_position(0x0000FFFF));
424        assert_eq!((0xFFFF, 0x0000), relative_from_absolute_position(0xFFFF0000));
425        assert_eq!((0xFF00, 0xFF00), relative_from_absolute_position(0xFF00FF00));
426        assert_eq!((0xFF00, 0x00FF), relative_from_absolute_position(0xFF0000FF));
427        assert_eq!((0x1234, 0x5678), relative_from_absolute_position(0x12345678));
428        assert_eq!((0xFFFF, 0xFFFF), relative_from_absolute_position(0xFFFFFFFF));
429    }
430
431    #[test]
432    fn test_absolute_position_conversion() {
433        assert_eq!(0x00000000, absolute_from_relative_position(0x0000, 0x0000));
434        assert_eq!(0x0000FFFF, absolute_from_relative_position(0x0000, 0xFFFF));
435        assert_eq!(0xFFFF0000, absolute_from_relative_position(0xFFFF, 0x0000));
436        assert_eq!(0xFF00FF00, absolute_from_relative_position(0xFF00, 0xFF00));
437        assert_eq!(0xFF0000FF, absolute_from_relative_position(0xFF00, 0x00FF));
438        assert_eq!(0x12345678, absolute_from_relative_position(0x1234, 0x5678));
439        assert_eq!(0xFFFFFFFF, absolute_from_relative_position(0xFFFF, 0xFFFF));
440    }
441
442    #[test]
443    fn test_all_obkv_to_json() {
444        let mut fields_ids_map = FieldsIdsMap::new();
445        let id1 = fields_ids_map.insert("field1").unwrap();
446        let id2 = fields_ids_map.insert("field2").unwrap();
447
448        let mut writer = obkv::KvWriterU16::memory();
449        writer.insert(id1, b"1234").unwrap();
450        writer.insert(id2, b"4321").unwrap();
451        let contents = writer.into_inner().unwrap();
452        let obkv = obkv::KvReaderU16::from_slice(&contents);
453
454        let expected = json!({
455            "field1": 1234,
456            "field2": 4321,
457        });
458        let expected = expected.as_object().unwrap();
459        let actual = all_obkv_to_json(obkv, &fields_ids_map).unwrap();
460
461        assert_eq!(&actual, expected);
462    }
463}