milli-core 1.15.1

Meilisearch HTTP server
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
use std::cmp::{Ordering, Reverse};
use std::collections::BTreeMap;
use std::hash::{Hash, Hasher};

use fxhash::{FxHashMap, FxHasher};
use roaring::RoaringBitmap;

use super::interner::{FixedSizeInterner, Interned};
use super::query_term::{
    self, number_of_typos_allowed, LocatedQueryTerm, LocatedQueryTermSubset, QueryTermSubset,
};
use super::small_bitmap::SmallBitmap;
use super::SearchContext;
use crate::search::new::interner::Interner;
use crate::search::new::resolve_query_graph::compute_query_term_subset_docids;
use crate::Result;

/// A node of the [`QueryGraph`].
///
/// There are four types of nodes:
/// 1. `Start` : unique, represents the start of the query
/// 2. `End` : unique, represents the end of a query
/// 3. `Deleted` : represents a node that was deleted.
///    All deleted nodes are unreachable from the start node.
/// 4. `Term` is a regular node representing a word or combination of words
///    from the user query.
#[derive(Clone)]
pub struct QueryNode {
    pub data: QueryNodeData,
    pub predecessors: SmallBitmap<QueryNode>,
    pub successors: SmallBitmap<QueryNode>,
}
#[derive(Clone, PartialEq, Eq, Hash)]
pub enum QueryNodeData {
    Term(LocatedQueryTermSubset),
    Deleted,
    Start,
    End,
}

/**
A graph representing all the ways to interpret the user's search query.

## Example 1
For the search query `sunflower`, we need to register the following things:
- we need to look for the exact word `sunflower`
- but also any word which is 1 or 2 typos apart from `sunflower`
- and every word that contains the prefix `sunflower`
- and also the couple of adjacent words `sun flower`
- as well as all the user-defined synonyms of `sunflower`

All these derivations of a word will be stored in [`QueryTerm`].

## Example 2:
For the search query `summer house by`.

We also look for all word derivations of each term. And we also need to consider
the potential n-grams `summerhouse`, `summerhouseby`, and `houseby`.
Furthermore, we need to know which words these ngrams replace. This is done by creating the
following graph, where each node also contains a list of derivations:
```txt
                        ┌───────┐
                      ┌─│houseby│─────────┐
                      │ └───────┘         │
┌───────┐   ┌───────┐ │ ┌───────┐  ┌────┐ │ ┌───────┐
│ START │─┬─│summer │─┴─│ house │┌─│ by │─┼─│  END  │
└───────┘ │ └───────┘   └───────┘│ └────┘ │ └───────┘
          │ ┌────────────┐       │        │
          ├─│summerhouse │───────┘        │
          │ └────────────┘                │
          │         ┌─────────────┐       │
          └─────────│summerhouseby│───────┘
                    └─────────────┘
```
Note also that each node has a range of positions associated with it,
such that `summer` is known to be a word at the positions `0..=0` and `houseby`
is registered with the positions `1..=2`. When two nodes are connected by an edge,
it means that they are potentially next to each other in the user's search query
(depending on the [`TermsMatchingStrategy`](crate::search::TermsMatchingStrategy)
and the transformations that were done on the query graph).
*/
#[derive(Clone)]
pub struct QueryGraph {
    /// The index of the start node within `self.nodes`
    pub root_node: Interned<QueryNode>,
    /// The index of the end node within `self.nodes`
    pub end_node: Interned<QueryNode>,
    /// The list of all query nodes
    pub nodes: FixedSizeInterner<QueryNode>,
}

impl QueryGraph {
    /// Build the query graph from the parsed user search query, return an updated list of the located query terms
    /// which contains ngrams.
    pub fn from_query(
        ctx: &mut SearchContext<'_>,
        // The terms here must be consecutive
        terms: &[LocatedQueryTerm],
    ) -> Result<(QueryGraph, Vec<LocatedQueryTerm>)> {
        let mut new_located_query_terms = terms.to_vec();

        let nbr_typos = number_of_typos_allowed(ctx)?;

        let mut nodes_data: Vec<QueryNodeData> = vec![QueryNodeData::Start, QueryNodeData::End];
        let root_node = 0;
        let end_node = 1;

        // Ee could consider generalizing to 4,5,6,7,etc. ngrams
        let (mut prev2, mut prev1, mut prev0): (Vec<u16>, Vec<u16>, Vec<u16>) =
            (vec![], vec![], vec![root_node]);

        let original_terms_len = terms.len();
        for term_idx in 0..original_terms_len {
            let mut new_nodes = vec![];

            let new_node_idx = add_node(
                &mut nodes_data,
                QueryNodeData::Term(LocatedQueryTermSubset {
                    term_subset: QueryTermSubset::full(terms[term_idx].value),
                    positions: terms[term_idx].positions.clone(),
                    term_ids: term_idx as u8..=term_idx as u8,
                }),
            );
            new_nodes.push(new_node_idx);

            if !prev1.is_empty() {
                if let Some(ngram) =
                    query_term::make_ngram(ctx, &terms[term_idx - 1..=term_idx], &nbr_typos)?
                {
                    new_located_query_terms.push(ngram.clone());
                    let ngram_idx = add_node(
                        &mut nodes_data,
                        QueryNodeData::Term(LocatedQueryTermSubset {
                            term_subset: QueryTermSubset::full(ngram.value),
                            positions: ngram.positions,
                            term_ids: term_idx as u8 - 1..=term_idx as u8,
                        }),
                    );
                    new_nodes.push(ngram_idx);
                }
            }
            if !prev2.is_empty() {
                if let Some(ngram) =
                    query_term::make_ngram(ctx, &terms[term_idx - 2..=term_idx], &nbr_typos)?
                {
                    new_located_query_terms.push(ngram.clone());
                    let ngram_idx = add_node(
                        &mut nodes_data,
                        QueryNodeData::Term(LocatedQueryTermSubset {
                            term_subset: QueryTermSubset::full(ngram.value),
                            positions: ngram.positions,
                            term_ids: term_idx as u8 - 2..=term_idx as u8,
                        }),
                    );
                    new_nodes.push(ngram_idx);
                }
            }
            (prev0, prev1, prev2) = (new_nodes, prev0, prev1);
        }

        let root_node = Interned::from_raw(root_node);
        let end_node = Interned::from_raw(end_node);
        let mut nodes = FixedSizeInterner::new(
            nodes_data.len() as u16,
            QueryNode {
                data: QueryNodeData::Deleted,
                predecessors: SmallBitmap::new(nodes_data.len() as u16),
                successors: SmallBitmap::new(nodes_data.len() as u16),
            },
        );
        for (node_idx, node_data) in nodes_data.into_iter().enumerate() {
            let node = nodes.get_mut(Interned::from_raw(node_idx as u16));
            node.data = node_data;
        }
        let mut graph = QueryGraph { root_node, end_node, nodes };
        graph.build_initial_edges();

        Ok((graph, new_located_query_terms))
    }

    /// Remove the given nodes, connecting all their predecessors to all their successors.
    pub fn remove_nodes_keep_edges(&mut self, nodes: &[Interned<QueryNode>]) {
        for &node_id in nodes {
            let node = self.nodes.get(node_id);
            let old_node_pred = node.predecessors.clone();
            let old_node_succ = node.successors.clone();
            for pred in old_node_pred.iter() {
                let pred_successors = &mut self.nodes.get_mut(pred).successors;
                pred_successors.remove(node_id);
                pred_successors.union(&old_node_succ);
            }
            for succ in old_node_succ.iter() {
                let succ_predecessors = &mut self.nodes.get_mut(succ).predecessors;
                succ_predecessors.remove(node_id);
                succ_predecessors.union(&old_node_pred);
            }
            let node = self.nodes.get_mut(node_id);
            node.data = QueryNodeData::Deleted;
            node.predecessors.clear();
            node.successors.clear();
        }
    }

    /// Remove the given nodes and all their edges from the query graph.
    pub fn remove_nodes(&mut self, nodes: &[Interned<QueryNode>]) {
        for &node_id in nodes {
            let node = &self.nodes.get(node_id);
            let old_node_pred = node.predecessors.clone();
            let old_node_succ = node.successors.clone();

            for pred in old_node_pred.iter() {
                self.nodes.get_mut(pred).successors.remove(node_id);
            }
            for succ in old_node_succ.iter() {
                self.nodes.get_mut(succ).predecessors.remove(node_id);
            }

            let node = self.nodes.get_mut(node_id);
            node.data = QueryNodeData::Deleted;
            node.predecessors.clear();
            node.successors.clear();
        }
    }
    /// Simplify the query graph by removing all nodes that are disconnected from
    /// the start or end nodes.
    pub fn simplify(&mut self) {
        loop {
            let mut nodes_to_remove = vec![];
            for (node_idx, node) in self.nodes.iter() {
                if (!matches!(node.data, QueryNodeData::End | QueryNodeData::Deleted)
                    && node.successors.is_empty())
                    || (!matches!(node.data, QueryNodeData::Start | QueryNodeData::Deleted)
                        && node.predecessors.is_empty())
                {
                    nodes_to_remove.push(node_idx);
                }
            }
            if nodes_to_remove.is_empty() {
                break;
            } else {
                self.remove_nodes(&nodes_to_remove);
            }
        }
    }

    fn build_initial_edges(&mut self) {
        for (_, node) in self.nodes.iter_mut() {
            node.successors.clear();
            node.predecessors.clear();
        }
        for node_id in self.nodes.indexes() {
            let node = self.nodes.get(node_id);
            let end_prev_term_id = match &node.data {
                QueryNodeData::Term(term) => *term.term_ids.end() as i16,
                QueryNodeData::Start => -1,
                QueryNodeData::Deleted => continue,
                QueryNodeData::End => continue,
            };
            let successors = {
                let mut successors = SmallBitmap::for_interned_values_in(&self.nodes);
                let mut min = i16::MAX;
                for (node_id, node) in self.nodes.iter() {
                    let start_next_term_id = match &node.data {
                        QueryNodeData::Term(term) => *term.term_ids.start() as i16,
                        QueryNodeData::End => i16::MAX,
                        QueryNodeData::Start => continue,
                        QueryNodeData::Deleted => continue,
                    };
                    if start_next_term_id <= end_prev_term_id {
                        continue;
                    }
                    match start_next_term_id.cmp(&min) {
                        Ordering::Less => {
                            min = start_next_term_id;
                            successors.clear();
                            successors.insert(node_id);
                        }
                        Ordering::Equal => {
                            successors.insert(node_id);
                        }
                        Ordering::Greater => continue,
                    }
                }
                successors
            };
            let node = self.nodes.get_mut(node_id);
            node.successors = successors.clone();
            for successor in successors.iter() {
                let successor = self.nodes.get_mut(successor);
                successor.predecessors.insert(node_id);
            }
        }
    }

    pub fn removal_order_for_terms_matching_strategy_frequency(
        &self,
        ctx: &mut SearchContext<'_>,
    ) -> Result<Vec<SmallBitmap<QueryNode>>> {
        // lookup frequency for each term
        let mut term_with_frequency: Vec<(u8, u64)> = {
            let mut term_docids: BTreeMap<u8, RoaringBitmap> = Default::default();
            for (_, node) in self.nodes.iter() {
                match &node.data {
                    QueryNodeData::Term(t) => {
                        let docids = compute_query_term_subset_docids(ctx, None, &t.term_subset)?;
                        for id in t.term_ids.clone() {
                            term_docids
                                .entry(id)
                                .and_modify(|curr| *curr |= &docids)
                                .or_insert_with(|| docids.clone());
                        }
                    }
                    QueryNodeData::Deleted | QueryNodeData::Start | QueryNodeData::End => continue,
                }
            }
            term_docids
                .into_iter()
                .map(|(idx, docids)| match docids.len() {
                    0 => (idx, u64::MAX),
                    frequency => (idx, frequency),
                })
                .collect()
        };
        term_with_frequency.sort_by_key(|(_, frequency)| Reverse(*frequency));
        let mut term_weight = BTreeMap::new();
        let mut weight: u16 = 1;
        let mut peekable = term_with_frequency.into_iter().peekable();
        while let Some((idx, frequency)) = peekable.next() {
            term_weight.insert(idx, weight);
            if peekable.peek().is_some_and(|(_, f)| frequency != *f) {
                weight += 1;
            }
        }
        let cost_of_term_idx = move |term_idx: u8| *term_weight.get(&term_idx).unwrap();
        Ok(self.removal_order_for_terms_matching_strategy(ctx, cost_of_term_idx))
    }

    pub fn removal_order_for_terms_matching_strategy_last(
        &self,
        ctx: &SearchContext<'_>,
    ) -> Vec<SmallBitmap<QueryNode>> {
        let (first_term_idx, last_term_idx) = {
            let mut first_term_idx = u8::MAX;
            let mut last_term_idx = 0u8;
            for (_, node) in self.nodes.iter() {
                match &node.data {
                    QueryNodeData::Term(t) => {
                        if *t.term_ids.end() > last_term_idx {
                            last_term_idx = *t.term_ids.end();
                        }
                        if *t.term_ids.start() < first_term_idx {
                            first_term_idx = *t.term_ids.start();
                        }
                    }
                    QueryNodeData::Deleted | QueryNodeData::Start | QueryNodeData::End => continue,
                }
            }
            (first_term_idx, last_term_idx)
        };
        if first_term_idx >= last_term_idx {
            return vec![];
        }

        let cost_of_term_idx = |term_idx: u8| {
            let rank = 1 + last_term_idx - term_idx;
            rank as u16
        };
        self.removal_order_for_terms_matching_strategy(ctx, cost_of_term_idx)
    }

    pub fn removal_order_for_terms_matching_strategy(
        &self,
        ctx: &SearchContext<'_>,
        order: impl Fn(u8) -> u16,
    ) -> Vec<SmallBitmap<QueryNode>> {
        let mut nodes_to_remove = BTreeMap::<u16, SmallBitmap<QueryNode>>::new();
        let mut at_least_one_mandatory_term = false;
        for (node_id, node) in self.nodes.iter() {
            let QueryNodeData::Term(t) = &node.data else { continue };
            if t.term_subset.original_phrase(ctx).is_some() || t.term_subset.is_mandatory() {
                at_least_one_mandatory_term = true;
                continue;
            }
            let mut cost = 0;
            for id in t.term_ids.clone() {
                cost = std::cmp::max(cost, order(id));
            }
            nodes_to_remove
                .entry(cost)
                .or_insert_with(|| SmallBitmap::for_interned_values_in(&self.nodes))
                .insert(node_id);
        }
        let mut res: Vec<_> = nodes_to_remove.into_values().collect();
        if !at_least_one_mandatory_term {
            res.pop();
        }
        res
    }

    /// Number of words in the phrases in this query graph
    pub(crate) fn words_in_phrases_count(&self, ctx: &SearchContext<'_>) -> usize {
        let mut word_count = 0;
        for (_, node) in self.nodes.iter() {
            match &node.data {
                QueryNodeData::Term(term) => {
                    let Some(phrase) = term.term_subset.original_phrase(ctx) else { continue };
                    let phrase = ctx.phrase_interner.get(phrase);
                    word_count += phrase.words.iter().copied().filter(|a| a.is_some()).count()
                }
                _ => continue,
            }
        }
        word_count
    }
}

fn add_node(nodes_data: &mut Vec<QueryNodeData>, node_data: QueryNodeData) -> u16 {
    let new_node_idx = nodes_data.len() as u16;
    nodes_data.push(node_data);
    new_node_idx
}

impl QueryGraph {
    /*
    Build a query graph from a list of paths

    The paths are composed of source and dest terms.

    For example, consider the following paths:
    ```txt
    PATH 1 :  a -> b1 -> c1 -> d -> e1
    PATH 2 :  a -> b2 -> c2 -> d -> e2
    ```
    Then the resulting graph will be:
    ```txt
              ┌────┐  ┌────┐   ┌────┐   ┌────┐
           ┌──│ b1 │──│ c1 │───│ d  │───│ e1 │
    ┌────┐ │  └────┘  └────┘   └────┘   └────┘
    │ a  │─┤
    └────┘ │  ┌────┐  ┌────┐   ┌────┐   ┌────┐
           └──│ b2 │──│ c2 │───│ d  │───│ e2 │
              └────┘  └────┘   └────┘   └────┘
    ```
    */
    pub fn build_from_paths(
        paths: Vec<Vec<(Option<LocatedQueryTermSubset>, LocatedQueryTermSubset)>>,
    ) -> Self {
        let mut node_data = Interner::default();
        let root_node = node_data.push(QueryNodeData::Start);
        let end_node = node_data.push(QueryNodeData::End);

        let mut paths_with_single_terms = vec![];

        for path in paths {
            let mut processed_path = vec![];
            let mut prev_dest_term: Option<LocatedQueryTermSubset> = None;
            for (start_term, dest_term) in path {
                if let Some(prev_dest_term) = prev_dest_term.take() {
                    if let Some(mut start_term) = start_term {
                        if start_term.term_ids == prev_dest_term.term_ids {
                            start_term.term_subset.intersect(&prev_dest_term.term_subset);
                            processed_path.push(start_term);
                        } else {
                            processed_path.push(prev_dest_term);
                            processed_path.push(start_term);
                        }
                    } else {
                        processed_path.push(prev_dest_term);
                    }
                } else if let Some(start_term) = start_term {
                    processed_path.push(start_term);
                }
                prev_dest_term = Some(dest_term);
            }
            if let Some(prev_dest_term) = prev_dest_term {
                processed_path.push(prev_dest_term);
            }
            paths_with_single_terms.push(processed_path);
        }

        let mut paths_with_single_terms_and_suffix_hash = vec![];
        for path in paths_with_single_terms {
            let mut hasher = FxHasher::default();
            let mut path_with_hash = vec![];
            for term in path.into_iter().rev() {
                term.hash(&mut hasher);
                path_with_hash.push((term, hasher.finish()));
            }
            path_with_hash.reverse();
            paths_with_single_terms_and_suffix_hash.push(path_with_hash);
        }

        let mut node_data_id_for_term_and_suffix_hash =
            FxHashMap::<(LocatedQueryTermSubset, u64), Interned<QueryNodeData>>::default();

        let mut paths_with_ids = vec![];
        for path in paths_with_single_terms_and_suffix_hash {
            let mut path_with_ids = vec![];
            for (term, suffix_hash) in path {
                let node_data_id = node_data_id_for_term_and_suffix_hash
                    .entry((term.clone(), suffix_hash))
                    .or_insert_with(|| node_data.push(QueryNodeData::Term(term)));
                path_with_ids.push(Interned::from_raw(node_data_id.into_raw()));
            }
            paths_with_ids.push(path_with_ids);
        }

        let nodes_data = node_data.freeze();
        let nodes_data_len = nodes_data.len();
        let mut nodes = nodes_data.map_move(|n| QueryNode {
            data: n,
            predecessors: SmallBitmap::new(nodes_data_len),
            successors: SmallBitmap::new(nodes_data_len),
        });

        let root_node = Interned::<QueryNode>::from_raw(root_node.into_raw());
        let end_node = Interned::<QueryNode>::from_raw(end_node.into_raw());

        for path in paths_with_ids {
            let mut prev_node_id = root_node;
            for node_id in path {
                let prev_node = nodes.get_mut(prev_node_id);
                prev_node.successors.insert(node_id);
                let node = nodes.get_mut(node_id);
                node.predecessors.insert(prev_node_id);
                prev_node_id = node_id;
            }
            let prev_node = nodes.get_mut(prev_node_id);
            prev_node.successors.insert(end_node);
            let node = nodes.get_mut(end_node);
            node.predecessors.insert(prev_node_id);
        }

        QueryGraph { root_node, end_node, nodes }
    }
}