milli_core/update/index_documents/
mod.rs

1mod enrich;
2mod extract;
3mod helpers;
4mod transform;
5mod typed_chunk;
6
7use std::collections::HashSet;
8use std::io::{Read, Seek};
9use std::iter;
10use std::num::NonZeroU32;
11use std::sync::Arc;
12
13use crossbeam_channel::{Receiver, Sender};
14use enrich::enrich_documents_batch;
15use grenad::{Merger, MergerBuilder};
16use hashbrown::HashMap;
17use heed::types::Str;
18use heed::Database;
19use rand::SeedableRng as _;
20use roaring::RoaringBitmap;
21use serde::{Deserialize, Serialize};
22use slice_group_by::GroupBy;
23use tracing::debug;
24use typed_chunk::{write_typed_chunk_into_index, ChunkAccumulator, TypedChunk};
25
26pub use self::enrich::{extract_finite_float_from_value, DocumentId};
27pub use self::helpers::*;
28pub use self::transform::{Transform, TransformOutput};
29use super::facet::clear_facet_levels_based_on_settings_diff;
30use super::new::StdResult;
31use crate::database_stats::DatabaseStats;
32use crate::documents::{obkv_to_object, DocumentsBatchReader};
33use crate::error::{Error, InternalError};
34use crate::index::{PrefixSearch, PrefixSettings};
35use crate::progress::Progress;
36use crate::thread_pool_no_abort::ThreadPoolNoAbortBuilder;
37pub use crate::update::index_documents::helpers::CursorClonableMmap;
38use crate::update::{
39    IndexerConfig, UpdateIndexingStep, WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst,
40};
41use crate::vector::{ArroyWrapper, EmbeddingConfigs};
42use crate::{CboRoaringBitmapCodec, Index, Result, UserError};
43
/// Count of merged databases; `execute_prefix_databases` starts its progress
/// counter from this value.
static MERGED_DATABASE_COUNT: usize = 7;
/// Count of progress steps reported while the prefix databases are updated.
static PREFIX_DATABASE_COUNT: usize = 4;
/// Value reported as `total_databases` in the `MergeDataIntoFinalDatabase` progress steps.
static TOTAL_POSTING_DATABASE_COUNT: usize = PREFIX_DATABASE_COUNT + MERGED_DATABASE_COUNT;
47
48#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
49pub struct DocumentAdditionResult {
50    /// The number of documents that were indexed during the update
51    pub indexed_documents: u64,
52    /// The total number of documents in the index after the update
53    pub number_of_documents: u64,
54}
55
56#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
57#[non_exhaustive]
58pub enum IndexDocumentsMethod {
59    /// Replace the previous document with the new one,
60    /// removing all the already known attributes.
61    ReplaceDocuments,
62
63    /// Merge the previous version of the document with the new version,
64    /// replacing old attributes values with the new ones and add the new attributes.
65    UpdateDocuments,
66}
67
68impl Default for IndexDocumentsMethod {
69    fn default() -> Self {
70        Self::ReplaceDocuments
71    }
72}
73
74pub struct IndexDocuments<'t, 'i, 'a, FP, FA> {
75    wtxn: &'t mut heed::RwTxn<'i>,
76    index: &'i Index,
77    config: IndexDocumentsConfig,
78    indexer_config: &'a IndexerConfig,
79    transform: Option<Transform<'a, 'i>>,
80    progress: FP,
81    should_abort: FA,
82    added_documents: u64,
83    deleted_documents: u64,
84    embedders: EmbeddingConfigs,
85}
86
87#[derive(Default, Debug, Clone)]
88pub struct IndexDocumentsConfig {
89    pub words_positions_level_group_size: Option<NonZeroU32>,
90    pub words_positions_min_level_size: Option<NonZeroU32>,
91    pub update_method: IndexDocumentsMethod,
92    pub autogenerate_docids: bool,
93}
94
95impl<'t, 'i, 'a, FP, FA> IndexDocuments<'t, 'i, 'a, FP, FA>
96where
97    FP: Fn(UpdateIndexingStep) + Sync + Send,
98    FA: Fn() -> bool + Sync + Send,
99{
100    pub fn new(
101        wtxn: &'t mut heed::RwTxn<'i>,
102        index: &'i Index,
103        indexer_config: &'a IndexerConfig,
104        config: IndexDocumentsConfig,
105        progress: FP,
106        should_abort: FA,
107    ) -> Result<IndexDocuments<'t, 'i, 'a, FP, FA>> {
108        let transform = Some(Transform::new(
109            wtxn,
110            index,
111            indexer_config,
112            config.update_method,
113            config.autogenerate_docids,
114        )?);
115
116        Ok(IndexDocuments {
117            transform,
118            config,
119            indexer_config,
120            progress,
121            should_abort,
122            wtxn,
123            index,
124            added_documents: 0,
125            deleted_documents: 0,
126            embedders: Default::default(),
127        })
128    }
129
130    /// Adds a batch of documents to the current builder.
131    ///
132    /// Since the documents are progressively added to the writer, a failure will cause only
133    /// return an error and not the `IndexDocuments` struct as it is invalid to use it afterward.
134    ///
135    /// Returns the number of documents added to the builder.
136    #[tracing::instrument(level = "trace", skip_all, target = "indexing::documents")]
137    pub fn add_documents<R: Read + Seek>(
138        mut self,
139        reader: DocumentsBatchReader<R>,
140    ) -> Result<(Self, StdResult<u64, UserError>)> {
141        // Early return when there is no document to add
142        if reader.is_empty() {
143            return Ok((self, Ok(0)));
144        }
145
146        // We check for user errors in this validator and if there is one, we can return
147        // the `IndexDocument` struct as it is valid to send more documents into it.
148        // However, if there is an internal error we throw it away!
149        let enriched_documents_reader = match enrich_documents_batch(
150            self.wtxn,
151            self.index,
152            self.config.autogenerate_docids,
153            reader,
154        )? {
155            Ok(reader) => reader,
156            Err(user_error) => return Ok((self, Err(user_error))),
157        };
158
159        let indexed_documents =
160            self.transform.as_mut().expect("Invalid document addition state").read_documents(
161                enriched_documents_reader,
162                self.wtxn,
163                &self.progress,
164                &self.should_abort,
165            )? as u64;
166
167        self.added_documents += indexed_documents;
168
169        Ok((self, Ok(indexed_documents)))
170    }
171
172    pub fn with_embedders(mut self, embedders: EmbeddingConfigs) -> Self {
173        self.embedders = embedders;
174        self
175    }
176
177    #[tracing::instrument(
178        level = "trace"
179        skip_all,
180        target = "indexing::documents",
181        name = "index_documents"
182    )]
183    pub fn execute(mut self) -> Result<DocumentAdditionResult> {
184        if self.added_documents == 0 && self.deleted_documents == 0 {
185            let number_of_documents = self.index.number_of_documents(self.wtxn)?;
186            return Ok(DocumentAdditionResult { indexed_documents: 0, number_of_documents });
187        }
188        let output = self
189            .transform
190            .take()
191            .expect("Invalid document addition state")
192            .output_from_sorter(self.wtxn, &self.progress)?;
193
194        let indexed_documents = output.documents_count as u64;
195        let number_of_documents = self.execute_raw(output)?;
196
197        Ok(DocumentAdditionResult { indexed_documents, number_of_documents })
198    }
199
200    /// Returns the total number of documents in the index after the update.
    ///
    /// Writes an already transformed batch (`output`) into the index: the extraction
    /// pipeline is spawned on the thread pool while this thread drains the typed chunks
    /// it produces into LMDB. Afterwards the documents stats (unless only the settings
    /// changed), the field distribution and the primary key are stored,
    /// `build_and_quantize` is run for every embedder recorded in `dimension`, and the
    /// prefix databases are updated through `execute_prefix_databases`.
    ///
    /// # Errors
    ///
    /// Returns `InternalError::AbortedIndexation` when `should_abort` returns `true`
    /// while the chunks are being received, and forwards any error sent by the
    /// extraction pipeline or raised by the index operations.
    ///
    /// # Panics
    ///
    /// Panics if the primary key has no id in the new fields ids map.
201    #[tracing::instrument(
202        level = "trace",
203        skip_all,
204        target = "indexing::details",
205        name = "index_documents_raw"
206    )]
207    pub fn execute_raw(self, output: TransformOutput) -> Result<u64>
208    where
209        FP: Fn(UpdateIndexingStep) + Sync,
210        FA: Fn() -> bool + Sync,
211    {
212        let TransformOutput {
213            primary_key,
214            mut settings_diff,
215            field_distribution,
216            documents_count,
217            original_documents,
218            flattened_documents,
219        } = output;
220
221        // update the searchable list,
222        // because they might have changed due to the nested documents flattening.
223        settings_diff.new.recompute_searchables(self.wtxn, self.index)?;
224
225        let settings_diff = Arc::new(settings_diff);
226        let embedders_configs = Arc::new(self.index.embedding_configs(self.wtxn)?);
227
228        let possible_embedding_mistakes =
229            crate::vector::error::PossibleEmbeddingMistakes::new(&field_distribution);
230
        // `backup_pool` is declared before the `match` so that a pool built in the
        // `None` arm outlives the `pool` reference.
231        let backup_pool;
232        let pool = match self.indexer_config.thread_pool {
233            Some(ref pool) => pool,
234            None => {
235                // We initialize a backup pool with the default
236                // settings if none have already been set.
237                #[allow(unused_mut)]
238                let mut pool_builder = ThreadPoolNoAbortBuilder::new();
239
240                #[cfg(test)]
241                {
242                    pool_builder = pool_builder.num_threads(1);
243                }
244
245                backup_pool = pool_builder.build()?;
246                &backup_pool
247            }
248        };
249
250        // create LMDB writer channel
251        let (lmdb_writer_sx, lmdb_writer_rx): (
252            Sender<Result<TypedChunk>>,
253            Receiver<Result<TypedChunk>>,
254        ) = crossbeam_channel::unbounded();
255
256        // get the primary key field id
257        let primary_key_id = settings_diff.new.fields_ids_map.id(&primary_key).unwrap();
258
259        let pool_params = GrenadParameters {
260            chunk_compression_type: self.indexer_config.chunk_compression_type,
261            chunk_compression_level: self.indexer_config.chunk_compression_level,
262            max_memory: self.indexer_config.max_memory,
263            max_nb_chunks: self.indexer_config.max_nb_chunks, // default value, may be chosen.
264        };
265        let documents_chunk_size = match self.indexer_config.documents_chunk_size {
266            Some(chunk_size) => chunk_size,
267            None => {
268                let default_chunk_size = 1024 * 1024 * 4; // 4MiB
269                let min_chunk_size = 1024 * 512; // 512KiB
270
271                // compute the chunk size from the number of available threads and the input data size.
272                let total_size = match flattened_documents.as_ref() {
273                    Some(flattened_documents) => flattened_documents.metadata().map(|m| m.len()),
274                    None => Ok(default_chunk_size as u64),
275                };
276                let current_num_threads = pool.current_num_threads();
277                // if we have more than 2 threads, create a number of chunks equal to 3/4 of the thread count
278                let chunk_count = if current_num_threads > 2 {
279                    (current_num_threads * 3 / 4).max(2)
280                } else {
281                    current_num_threads
282                };
                // Fall back to the default size when the metadata cannot be read,
                // and never go below the minimum chunk size.
283                total_size
284                    .map_or(default_chunk_size, |size| (size as usize) / chunk_count)
285                    .max(min_chunk_size)
286            }
287        };
288
        // Wrap the optional documents files into grenad readers.
289        let original_documents = match original_documents {
290            Some(original_documents) => Some(grenad::Reader::new(original_documents)?),
291            None => None,
292        };
293        let flattened_documents = match flattened_documents {
294            Some(flattened_documents) => Some(grenad::Reader::new(flattened_documents)?),
295            None => None,
296        };
297
298        let max_positions_per_attributes = self.indexer_config.max_positions_per_attributes;
299
        // State accumulated while the typed chunks are received.
300        let mut final_documents_ids = RoaringBitmap::new();
301        let mut databases_seen = 0;
302        let mut word_position_docids = None;
303        let mut word_fid_docids = None;
304        let mut word_docids = None;
305        let mut exact_word_docids = None;
306        let mut chunk_accumulator = ChunkAccumulator::default();
307        let mut dimension = HashMap::new();
308
309        let current_span = tracing::Span::current();
310
311        // Run extraction pipeline in parallel.
312        let mut modified_docids = RoaringBitmap::new();
313        pool.install(|| {
314                let settings_diff_cloned = settings_diff.clone();
                // The extraction runs in a spawned task so that this thread can
                // consume the channel in the loop below.
315                rayon::spawn(move || {
316                    let child_span = tracing::trace_span!(target: "indexing::details", parent: &current_span, "extract_and_send_grenad_chunks");
317                    let _enter = child_span.enter();
318
319                    // split obkv file into several chunks
320                    let original_chunk_iter = match original_documents {
321                        Some(original_documents) => {
322                            grenad_obkv_into_chunks(original_documents,pool_params,documents_chunk_size).map(either::Left)
323                        },
324                        None => Ok(either::Right(iter::empty())),
325                    };
326
327                    // split obkv file into several chunks
328                    let flattened_chunk_iter = match flattened_documents {
329                        Some(flattened_documents) => {
330                            grenad_obkv_into_chunks(flattened_documents, pool_params, documents_chunk_size).map(either::Left)
331                        },
332                        None => Ok(either::Right(iter::empty())),
333                    };
334
335                    let result = original_chunk_iter.and_then(|original_chunk| {
336                        let flattened_chunk = flattened_chunk_iter?;
337                        // extract all databases from the chunked obkv documents
338                        extract::data_from_obkv_documents(
339                            original_chunk,
340                            flattened_chunk,
341                            pool_params,
342                            lmdb_writer_sx.clone(),
343                            primary_key_id,
344                            embedders_configs.clone(),
345                            settings_diff_cloned,
346                            max_positions_per_attributes,
347                            Arc::new(possible_embedding_mistakes)
348                        )
349                    });
350
                    // Forward the failure to the receiving loop; a send error is ignored.
351                    if let Err(e) = result {
352                        let _ = lmdb_writer_sx.send(Err(e));
353                    }
354
355                    // needs to be dropped to avoid channel waiting lock.
356                    drop(lmdb_writer_sx);
357                });
358
359                (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
360                    databases_seen,
361                    total_databases: TOTAL_POSTING_DATABASE_COUNT,
362                });
363
364                loop {
365                    if (self.should_abort)() {
366                        return Err(Error::InternalError(InternalError::AbortedIndexation));
367                    }
368
                    // The timeout bounds the wait so that `should_abort` keeps being polled.
369                    match lmdb_writer_rx.clone().recv_timeout(std::time::Duration::from_millis(500)) {
                        // Nothing was received: write the longest accumulated list of chunks, if any.
370                        Err(status) => {
371                            if let Some(typed_chunks) = chunk_accumulator.pop_longest() {
372                                let (docids, is_merged_database) =
373                                    write_typed_chunk_into_index(self.wtxn, self.index, &settings_diff, typed_chunks, &mut modified_docids)?;
374                                if !docids.is_empty() {
375                                    final_documents_ids |= docids;
376                                    let documents_seen_count = final_documents_ids.len();
377                                    (self.progress)(UpdateIndexingStep::IndexDocuments {
378                                        documents_seen: documents_seen_count as usize,
379                                        total_documents: documents_count,
380                                    });
381                                    debug!(documents = documents_seen_count, total = documents_count, "Seen");
382                                }
383                                if is_merged_database {
384                                    databases_seen += 1;
385                                    (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
386                                        databases_seen,
387                                        total_databases: TOTAL_POSTING_DATABASE_COUNT,
388                                    });
389                                }
390                            // If no more chunk remains in the chunk accumulator and the channel is disconnected, break.
391                            } else if status == crossbeam_channel::RecvTimeoutError::Disconnected {
392                                break;
393                            } else {
394                                rayon::yield_now();
395                            }
396                        }
                        // A chunk was received: keep a cursor on the word-related ones,
                        // they feed the prefix databases later, then queue the chunk.
397                        Ok(result) => {
398                            let typed_chunk = match result? {
399                                TypedChunk::WordDocids {
400                                    word_docids_reader,
401                                    exact_word_docids_reader,
402                                    word_fid_docids_reader,
403                                } => {
404                                    let cloneable_chunk =
405                                        unsafe { as_cloneable_grenad(&word_docids_reader)? };
406                                    let word_docids = word_docids.get_or_insert_with(|| {
407                                        MergerBuilder::new(MergeDeladdCboRoaringBitmaps)
408                                    });
409                                    word_docids.push(cloneable_chunk.into_cursor()?);
410                                    let cloneable_chunk =
411                                        unsafe { as_cloneable_grenad(&exact_word_docids_reader)? };
412                                    let exact_word_docids =
413                                        exact_word_docids.get_or_insert_with(|| {
414                                            MergerBuilder::new(
415                                                MergeDeladdCboRoaringBitmaps,
416                                            )
417                                        });
418                                    exact_word_docids.push(cloneable_chunk.into_cursor()?);
419                                    let cloneable_chunk =
420                                        unsafe { as_cloneable_grenad(&word_fid_docids_reader)? };
421                                    let word_fid_docids = word_fid_docids.get_or_insert_with(|| {
422                                        MergerBuilder::new(MergeDeladdCboRoaringBitmaps)
423                                    });
424                                    word_fid_docids.push(cloneable_chunk.into_cursor()?);
425                                    TypedChunk::WordDocids {
426                                        word_docids_reader,
427                                        exact_word_docids_reader,
428                                        word_fid_docids_reader,
429                                    }
430                                }
431                                TypedChunk::WordPositionDocids(chunk) => {
432                                    let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
433                                    let word_position_docids =
434                                        word_position_docids.get_or_insert_with(|| {
435                                            MergerBuilder::new(
436                                                MergeDeladdCboRoaringBitmaps,
437                                            )
438                                        });
439                                    word_position_docids.push(cloneable_chunk.into_cursor()?);
440                                    TypedChunk::WordPositionDocids(chunk)
441                                }
442                                TypedChunk::VectorPoints {
443                                    expected_dimension,
444                                    remove_vectors,
445                                    embeddings,
446                                    manual_vectors,
447                                    embedder_name,
448                                    add_to_user_provided,
449                                    remove_from_user_provided,
450                                } => {
                                    // Remember the dimension of every embedder seen; the
                                    // build loop after the pipeline iterates over this map.
451                                    dimension.insert(embedder_name.clone(), expected_dimension);
452                                    TypedChunk::VectorPoints {
453                                        remove_vectors,
454                                        embeddings,
455                                        expected_dimension,
456                                        manual_vectors,
457                                        embedder_name,
458                                        add_to_user_provided,
459                                        remove_from_user_provided,
460                                    }
461                                }
462                                otherwise => otherwise,
463                            };
464
465                            chunk_accumulator.insert(typed_chunk);
466                        }
467                    }
468                }
469
470                // If the settings are only being updated, we may have to clear some of the facet levels.
471                if settings_diff.settings_update_only() {
472                    clear_facet_levels_based_on_settings_diff(self.wtxn, self.index, &settings_diff)?;
473                }
474
475                Ok(())
476            }).map_err(InternalError::from)??;
477
478        if !settings_diff.settings_update_only {
479            // Update the stats of the documents database when there is a document update.
480            let stats = DatabaseStats::new(self.index.documents.remap_data_type(), self.wtxn)?;
481            self.index.put_documents_stats(self.wtxn, stats)?;
482        }
483        // We write the field distribution into the main database
484        self.index.put_field_distribution(self.wtxn, &field_distribution)?;
485
486        // We write the primary key field id into the main database
487        self.index.put_primary_key(self.wtxn, &primary_key)?;
488        let number_of_documents = self.index.number_of_documents(self.wtxn)?;
        // The generator is seeded with a constant, so every run uses the same seed.
489        let mut rng = rand::rngs::StdRng::seed_from_u64(42);
490
491        // If an embedder wasn't used in the typedchunk but must be binary quantized
492        // we should insert it in `dimension`
493        for (name, action) in settings_diff.embedding_config_updates.iter() {
494            if action.is_being_quantized && !dimension.contains_key(name.as_str()) {
495                let index = self.index.embedder_category_id.get(self.wtxn, name)?.ok_or(
496                    InternalError::DatabaseMissingEntry {
497                        db_name: "embedder_category_id",
498                        key: None,
499                    },
500                )?;
501                let reader =
502                    ArroyWrapper::new(self.index.vector_arroy, index, action.was_quantized);
503                let dim = reader.dimensions(self.wtxn)?;
504                dimension.insert(name.to_string(), dim);
505            }
506        }
507
        // Build, and quantize when requested, the vector store of every recorded embedder.
508        for (embedder_name, dimension) in dimension {
509            let wtxn = &mut *self.wtxn;
510            let vector_arroy = self.index.vector_arroy;
511            let cancel = &self.should_abort;
512
513            let embedder_index = self.index.embedder_category_id.get(wtxn, &embedder_name)?.ok_or(
514                InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None },
515            )?;
516            let embedder_config = settings_diff.embedding_config_updates.get(&embedder_name);
517            let was_quantized =
518                settings_diff.old.embedding_configs.get(&embedder_name).is_some_and(|conf| conf.2);
519            let is_quantizing = embedder_config.is_some_and(|action| action.is_being_quantized);
520
521            pool.install(|| {
522                let mut writer = ArroyWrapper::new(vector_arroy, embedder_index, was_quantized);
523                writer.build_and_quantize(
524                    wtxn,
525                    // In the settings we don't have any progress to share
526                    &Progress::default(),
527                    &mut rng,
528                    dimension,
529                    is_quantizing,
530                    self.indexer_config.max_memory,
531                    cancel,
532                )?;
533                Result::Ok(())
534            })
535            .map_err(InternalError::from)??;
536        }
537
        // Databases for which no chunk was received stay `None`.
538        self.execute_prefix_databases(
539            word_docids.map(MergerBuilder::build),
540            exact_word_docids.map(MergerBuilder::build),
541            word_position_docids.map(MergerBuilder::build),
542            word_fid_docids.map(MergerBuilder::build),
543        )?;
544
545        Ok(number_of_documents)
546    }
547
548    #[tracing::instrument(
549        level = "trace",
550        skip_all,
551        target = "indexing::prefix",
552        name = "index_documents_prefix_databases"
553    )]
    /// Updates the words prefixes FST and the prefix docids databases.
    ///
    /// Each argument is the merger built for the matching database; when it is `None`
    /// the prefix update of that database is skipped.
    ///
    /// When the prefix settings do not ask for prefixes at indexing time, the words
    /// prefixes FST and the four prefix docids databases are cleared and the function
    /// returns early.
    ///
    /// # Errors
    ///
    /// Returns `InternalError::AbortedIndexation` when `should_abort` returns `true` at
    /// one of the checkpoints, and forwards any error raised by the index operations.
    ///
    /// # Panics
    ///
    /// Panics if a prefix common to the previous and the new FST is empty, since its
    /// first character is unwrapped.
554    pub fn execute_prefix_databases(
555        self,
556        word_docids: Option<Merger<CursorClonableMmap, MergeDeladdCboRoaringBitmaps>>,
557        exact_word_docids: Option<Merger<CursorClonableMmap, MergeDeladdCboRoaringBitmaps>>,
558        word_position_docids: Option<Merger<CursorClonableMmap, MergeDeladdCboRoaringBitmaps>>,
559        word_fid_docids: Option<Merger<CursorClonableMmap, MergeDeladdCboRoaringBitmaps>>,
560    ) -> Result<()>
561    where
562        FP: Fn(UpdateIndexingStep) + Sync,
563        FA: Fn() -> bool + Sync,
564    {
565        // Merged databases have already been indexed, we start from this count.
566        let mut databases_seen = MERGED_DATABASE_COUNT;
567
568        if (self.should_abort)() {
569            return Err(Error::InternalError(InternalError::AbortedIndexation));
570        }
571
572        databases_seen += 1;
573        (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
574            databases_seen,
575            total_databases: TOTAL_POSTING_DATABASE_COUNT,
576        });
577
578        if (self.should_abort)() {
579            return Err(Error::InternalError(InternalError::AbortedIndexation));
580        }
581
        // Keep an owned copy of the prefixes FST as it is before the prefixes are
        // recomputed; it is diffed against the FST read back afterwards.
582        let previous_words_prefixes_fst =
583            self.index.words_prefixes_fst(self.wtxn)?.map_data(|cow| cow.into_owned())?;
584
585        // Run the words prefixes update operation.
586        let PrefixSettings { prefix_count_threshold, max_prefix_length, compute_prefixes } =
587            self.index.prefix_settings(self.wtxn)?;
588
589        // If the prefix search is enabled at indexing time, we compute the prefixes.
590        if compute_prefixes == PrefixSearch::IndexingTime {
591            let mut builder = WordsPrefixesFst::new(self.wtxn, self.index);
592            builder.threshold(prefix_count_threshold);
593            builder.max_prefix_length(max_prefix_length);
594            builder.execute()?;
595        } else {
596            // If the prefix search is disabled at indexing time, we delete the previous words prefixes fst.
597            // And all the associated docids databases.
598            self.index.delete_words_prefixes_fst(self.wtxn)?;
599            self.index.word_prefix_docids.clear(self.wtxn)?;
600            self.index.exact_word_prefix_docids.clear(self.wtxn)?;
601            self.index.word_prefix_position_docids.clear(self.wtxn)?;
602            self.index.word_prefix_fid_docids.clear(self.wtxn)?;
603
            // The three remaining progress steps are skipped: account for them at once.
604            databases_seen += 3;
605            (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
606                databases_seen,
607                total_databases: TOTAL_POSTING_DATABASE_COUNT,
608            });
609
610            return Ok(());
611        }
612
613        if (self.should_abort)() {
614            return Err(Error::InternalError(InternalError::AbortedIndexation));
615        }
616
        // Declared before the block so that the values outlive the tracing span entered in it.
617        let current_prefix_fst;
618        let common_prefix_fst_words_tmp;
619        let common_prefix_fst_words: Vec<_>;
620        let new_prefix_fst_words;
621        let del_prefix_fst_words;
622
623        {
624            let span = tracing::trace_span!(target: "indexing::details", "compute_prefix_diffs");
625            let _entered = span.enter();
626
627            current_prefix_fst = self.index.words_prefixes_fst(self.wtxn)?;
628
629            // We retrieve the common words between the previous and new prefix word fst.
630            common_prefix_fst_words_tmp = fst_stream_into_vec(
631                previous_words_prefixes_fst.op().add(&current_prefix_fst).intersection(),
632            );
            // Group the common prefixes by their first character.
633            common_prefix_fst_words = common_prefix_fst_words_tmp
634                .as_slice()
635                .linear_group_by_key(|x| x.chars().next().unwrap())
636                .collect();
637
638            // We retrieve the newly added words between the previous and new prefix word fst.
639            new_prefix_fst_words = fst_stream_into_vec(
640                current_prefix_fst.op().add(&previous_words_prefixes_fst).difference(),
641            );
642
643            // We compute the set of prefixes that are no longer part of the prefix fst.
644            del_prefix_fst_words = fst_stream_into_hashset(
645                previous_words_prefixes_fst.op().add(&current_prefix_fst).difference(),
646            );
647        }
648
649        databases_seen += 1;
650        (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
651            databases_seen,
652            total_databases: TOTAL_POSTING_DATABASE_COUNT,
653        });
654
655        if (self.should_abort)() {
656            return Err(Error::InternalError(InternalError::AbortedIndexation));
657        }
658
        // Update the prefix docids of the words, then of the exact words.
659        if let Some(word_docids) = word_docids {
660            execute_word_prefix_docids(
661                self.wtxn,
662                word_docids,
663                self.index.word_docids,
664                self.index.word_prefix_docids,
665                self.indexer_config,
666                &new_prefix_fst_words,
667                &common_prefix_fst_words,
668                &del_prefix_fst_words,
669            )?;
670        }
671
672        if let Some(exact_word_docids) = exact_word_docids {
673            execute_word_prefix_docids(
674                self.wtxn,
675                exact_word_docids,
676                self.index.exact_word_docids,
677                self.index.exact_word_prefix_docids,
678                self.indexer_config,
679                &new_prefix_fst_words,
680                &common_prefix_fst_words,
681                &del_prefix_fst_words,
682            )?;
683        }
684
685        if (self.should_abort)() {
686            return Err(Error::InternalError(InternalError::AbortedIndexation));
687        }
688
689        databases_seen += 1;
690        (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
691            databases_seen,
692            total_databases: TOTAL_POSTING_DATABASE_COUNT,
693        });
694
695        if let Some(word_position_docids) = word_position_docids {
696            // Run the words prefix position docids update operation.
697            let mut builder = WordPrefixIntegerDocids::new(
698                self.wtxn,
699                self.index.word_prefix_position_docids,
700                self.index.word_position_docids,
701            );
702            builder.chunk_compression_type = self.indexer_config.chunk_compression_type;
703            builder.chunk_compression_level = self.indexer_config.chunk_compression_level;
704            builder.max_nb_chunks = self.indexer_config.max_nb_chunks;
705            builder.max_memory = self.indexer_config.max_memory;
706
707            builder.execute(
708                word_position_docids,
709                &new_prefix_fst_words,
710                &common_prefix_fst_words,
711                &del_prefix_fst_words,
712            )?;
713        }
714        if let Some(word_fid_docids) = word_fid_docids {
715            // Run the words prefix fid docids update operation.
716            let mut builder = WordPrefixIntegerDocids::new(
717                self.wtxn,
718                self.index.word_prefix_fid_docids,
719                self.index.word_fid_docids,
720            );
721            builder.chunk_compression_type = self.indexer_config.chunk_compression_type;
722            builder.chunk_compression_level = self.indexer_config.chunk_compression_level;
723            builder.max_nb_chunks = self.indexer_config.max_nb_chunks;
724            builder.max_memory = self.indexer_config.max_memory;
725            builder.execute(
726                word_fid_docids,
727                &new_prefix_fst_words,
728                &common_prefix_fst_words,
729                &del_prefix_fst_words,
730            )?;
731        }
732
733        if (self.should_abort)() {
734            return Err(Error::InternalError(InternalError::AbortedIndexation));
735        }
736
737        databases_seen += 1;
738        (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
739            databases_seen,
740            total_databases: TOTAL_POSTING_DATABASE_COUNT,
741        });
742
743        Ok(())
744    }
745}
746
747/// Run the word prefix docids update operation.
748#[allow(clippy::too_many_arguments)]
749#[tracing::instrument(
750    level = "trace",
751    skip_all,
752    target = "indexing::prefix",
753    name = "index_documents_word_prefix_docids"
754)]
755fn execute_word_prefix_docids(
756    txn: &mut heed::RwTxn<'_>,
757    merger: Merger<CursorClonableMmap, MergeDeladdCboRoaringBitmaps>,
758    word_docids_db: Database<Str, CboRoaringBitmapCodec>,
759    word_prefix_docids_db: Database<Str, CboRoaringBitmapCodec>,
760    indexer_config: &IndexerConfig,
761    new_prefix_fst_words: &[String],
762    common_prefix_fst_words: &[&[String]],
763    del_prefix_fst_words: &HashSet<Vec<u8>>,
764) -> Result<()> {
765    let mut builder = WordPrefixDocids::new(txn, word_docids_db, word_prefix_docids_db);
766    builder.chunk_compression_type = indexer_config.chunk_compression_type;
767    builder.chunk_compression_level = indexer_config.chunk_compression_level;
768    builder.max_nb_chunks = indexer_config.max_nb_chunks;
769    builder.max_memory = indexer_config.max_memory;
770    builder.execute(merger, new_prefix_fst_words, common_prefix_fst_words, del_prefix_fst_words)?;
771    Ok(())
772}
773
774#[cfg(test)]
775mod tests {
776    use std::collections::BTreeMap;
777
778    use big_s::S;
779    use bumpalo::Bump;
780    use fst::IntoStreamer;
781    use heed::RwTxn;
782    use maplit::hashset;
783
784    use super::*;
785    use crate::constants::RESERVED_GEO_FIELD_NAME;
786    use crate::documents::mmap_from_objects;
787    use crate::index::tests::TempIndex;
788    use crate::index::IndexEmbeddingConfig;
789    use crate::progress::Progress;
790    use crate::search::TermsMatchingStrategy;
791    use crate::update::new::indexer;
792    use crate::update::Setting;
793    use crate::{all_obkv_to_json, db_snap, Filter, FilterableAttributesRule, Search, UserError};
794
795    #[test]
796    fn simple_document_replacement() {
797        let index = TempIndex::new();
798
799        // First we send 3 documents with ids from 1 to 3.
800        index
801            .add_documents(documents!([
802                { "id": 1, "name": "kevin" },
803                { "id": 2, "name": "kevina" },
804                { "id": 3, "name": "benoit" }
805            ]))
806            .unwrap();
807
808        // Check that there is 3 documents now.
809        let rtxn = index.read_txn().unwrap();
810        let count = index.number_of_documents(&rtxn).unwrap();
811        assert_eq!(count, 3);
812        drop(rtxn);
813
814        // Second we send 1 document with id 1, to erase the previous ones.
815        index.add_documents(documents!([ { "id": 1, "name": "updated kevin" } ])).unwrap();
816
817        // Check that there is **always** 3 documents.
818        let rtxn = index.read_txn().unwrap();
819        let count = index.number_of_documents(&rtxn).unwrap();
820        assert_eq!(count, 3);
821        drop(rtxn);
822
823        // Third we send 3 documents again to replace the existing ones.
824        index
825            .add_documents(documents!([
826                { "id": 1, "name": "updated second kevin" },
827                { "id": 2, "name": "updated kevina" },
828                { "id": 3, "name": "updated benoit" }
829            ]))
830            .unwrap();
831
832        // Check that there is **always** 3 documents.
833        let rtxn = index.read_txn().unwrap();
834        let count = index.number_of_documents(&rtxn).unwrap();
835        assert_eq!(count, 3);
836        let count = index.all_documents(&rtxn).unwrap().count();
837        assert_eq!(count, 3);
838
839        drop(rtxn);
840    }
841
842    #[test]
843    fn simple_document_merge() {
844        let mut index = TempIndex::new();
845        index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments;
846
847        // First we send 3 documents with duplicate ids and
848        // change the index method to merge documents.
849        index
850            .add_documents(documents!([
851                { "id": 1, "name": "kevin" },
852                { "id": 1, "name": "kevina" },
853                { "id": 1, "name": "benoit" }
854            ]))
855            .unwrap();
856
857        // Check that there is only 1 document now.
858        let rtxn = index.read_txn().unwrap();
859        let count = index.number_of_documents(&rtxn).unwrap();
860        assert_eq!(count, 1);
861
862        // Check that we get only one document from the database.
863        let docs = index.documents(&rtxn, Some(0)).unwrap();
864        assert_eq!(docs.len(), 1);
865        let (id, doc) = docs[0];
866        assert_eq!(id, 0);
867
868        // Check that this document is equal to the last one sent.
869        let mut doc_iter = doc.iter();
870        assert_eq!(doc_iter.next(), Some((0, &b"1"[..])));
871        assert_eq!(doc_iter.next(), Some((1, &br#""benoit""#[..])));
872        assert_eq!(doc_iter.next(), None);
873        drop(rtxn);
874
875        // Second we send 1 document with id 1, to force it to be merged with the previous one.
876        index.add_documents(documents!([ { "id": 1, "age": 25 } ])).unwrap();
877
878        // Check that there is **always** 1 document.
879        let rtxn = index.read_txn().unwrap();
880        let count = index.number_of_documents(&rtxn).unwrap();
881        assert_eq!(count, 1);
882
883        // Check that we get only one document from the database.
884        let docs = index.documents(&rtxn, Some(0)).unwrap();
885        assert_eq!(docs.len(), 1);
886        let (id, doc) = docs[0];
887        assert_eq!(id, 0);
888
889        // Check that this document is equal to the last one sent.
890        let mut doc_iter = doc.iter();
891        assert_eq!(doc_iter.next(), Some((0, &b"1"[..])));
892        assert_eq!(doc_iter.next(), Some((1, &br#""benoit""#[..])));
893        assert_eq!(doc_iter.next(), Some((2, &b"25"[..])));
894        assert_eq!(doc_iter.next(), None);
895        drop(rtxn);
896    }
897
898    #[test]
899    fn empty_update() {
900        let index = TempIndex::new();
901
902        // First we send 0 documents and only headers.
903        index.add_documents(documents!([])).unwrap();
904
905        // Check that there is no documents.
906        let rtxn = index.read_txn().unwrap();
907        let count = index.number_of_documents(&rtxn).unwrap();
908        assert_eq!(count, 0);
909        drop(rtxn);
910    }
911
912    #[test]
913    fn invalid_documents_ids() {
914        let index = TempIndex::new();
915
916        // First we send 1 document with an invalid id.
917        // There is a space in the document id.
918        index.add_documents(documents!([ { "id": "brume bleue", "name": "kevin" } ])).unwrap_err();
919
920        // Then we send 1 document with a valid id.
921        index.add_documents(documents!([ { "id": 32, "name": "kevin" } ])).unwrap();
922
923        // Check that there is 1 document now.
924        let rtxn = index.read_txn().unwrap();
925        let count = index.number_of_documents(&rtxn).unwrap();
926        assert_eq!(count, 1);
927        drop(rtxn);
928    }
929
930    #[test]
931    fn complex_documents() {
932        let index = TempIndex::new();
933
934        // First we send 3 documents with an id for only one of them.
935        index
936            .add_documents(documents!([
937                { "id": 0, "name": "kevin", "object": { "key1": "value1", "key2": "value2" } },
938                { "id": 1, "name": "kevina", "array": ["I", "am", "fine"] },
939                { "id": 2, "name": "benoit", "array_of_object": [{ "wow": "amazing" }] }
940            ]))
941            .unwrap();
942
943        // Check that there is 1 documents now.
944        let rtxn = index.read_txn().unwrap();
945
946        // Search for a sub object value
947        let result = index.search(&rtxn).query(r#""value2""#).execute().unwrap();
948        assert_eq!(result.documents_ids, vec![0]);
949
950        // Search for a sub array value
951        let result = index.search(&rtxn).query(r#""fine""#).execute().unwrap();
952        assert_eq!(result.documents_ids, vec![1]);
953
954        // Search for a sub array sub object key
955        let result = index.search(&rtxn).query(r#""amazing""#).execute().unwrap();
956        assert_eq!(result.documents_ids, vec![2]);
957
958        drop(rtxn);
959    }
960
961    #[test]
962    fn simple_documents_replace() {
963        let mut index = TempIndex::new();
964        index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments;
965
966        index.add_documents(documents!([
967          { "id": 2,    "title": "Pride and Prejudice",                    "author": "Jane Austin",              "genre": "romance",    "price": 3.5, RESERVED_GEO_FIELD_NAME: { "lat": 12, "lng": 42 } },
968          { "id": 456,  "title": "Le Petit Prince",                        "author": "Antoine de Saint-Exupéry", "genre": "adventure" , "price": 10.0 },
969          { "id": 1,    "title": "Alice In Wonderland",                    "author": "Lewis Carroll",            "genre": "fantasy",    "price": 25.99 },
970          { "id": 1344, "title": "The Hobbit",                             "author": "J. R. R. Tolkien",         "genre": "fantasy" },
971          { "id": 4,    "title": "Harry Potter and the Half-Blood Prince", "author": "J. K. Rowling",            "genre": "fantasy" },
972          { "id": 42,   "title": "The Hitchhiker's Guide to the Galaxy",   "author": "Douglas Adams", RESERVED_GEO_FIELD_NAME: { "lat": 35, "lng": 23 } }
973        ])).unwrap();
974
975        db_snap!(index, word_docids, "initial");
976
977        index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments;
978
979        index
980            .add_documents(documents!([
981                {"id":4,"title":"Harry Potter and the Half-Blood Princess"},
982                {"id":456,"title":"The Little Prince"}
983            ]))
984            .unwrap();
985
986        index
987            .add_documents(documents!([
988                { "id": 2, "author": "J. Austen", "date": "1813" }
989            ]))
990            .unwrap();
991
992        // Check that there is **always** 6 documents.
993        let rtxn = index.read_txn().unwrap();
994        let count = index.number_of_documents(&rtxn).unwrap();
995        assert_eq!(count, 6);
996        let count = index.all_documents(&rtxn).unwrap().count();
997        assert_eq!(count, 6);
998
999        db_snap!(index, word_docids, "updated");
1000
1001        drop(rtxn);
1002    }
1003
1004    #[test]
1005    fn mixed_geo_documents() {
1006        let mut index = TempIndex::new();
1007        index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments;
1008
1009        // We send 6 documents and mix the ones that have _geo and those that don't have it.
1010        index
1011            .add_documents(documents!([
1012              { "id": 2, "price": 3.5, RESERVED_GEO_FIELD_NAME: { "lat": 12, "lng": 42 } },
1013              { "id": 456 },
1014              { "id": 1 },
1015              { "id": 1344 },
1016              { "id": 4 },
1017              { "id": 42, RESERVED_GEO_FIELD_NAME: { "lat": 35, "lng": 23 } }
1018            ]))
1019            .unwrap();
1020
1021        index
1022            .update_settings(|settings| {
1023                settings.set_filterable_fields(vec![FilterableAttributesRule::Field(
1024                    RESERVED_GEO_FIELD_NAME.to_string(),
1025                )]);
1026            })
1027            .unwrap();
1028    }
1029
1030    #[test]
1031    fn geo_error() {
1032        let mut index = TempIndex::new();
1033        index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments;
1034
1035        index
1036            .update_settings(|settings| {
1037                settings.set_filterable_fields(vec![FilterableAttributesRule::Field(
1038                    RESERVED_GEO_FIELD_NAME.to_string(),
1039                )]);
1040            })
1041            .unwrap();
1042
1043        let error = index
1044            .add_documents(documents!([
1045              { "id": 0, RESERVED_GEO_FIELD_NAME: { "lng": 42 } }
1046            ]))
1047            .unwrap_err();
1048        assert_eq!(
1049            &error.to_string(),
1050            r#"Could not find latitude in the document with the id: `"0"`. Was expecting a `_geo.lat` field."#
1051        );
1052
1053        let error = index
1054            .add_documents(documents!([
1055              { "id": 0, RESERVED_GEO_FIELD_NAME: { "lat": 42 } }
1056            ]))
1057            .unwrap_err();
1058        assert_eq!(
1059            &error.to_string(),
1060            r#"Could not find longitude in the document with the id: `"0"`. Was expecting a `_geo.lng` field."#
1061        );
1062
1063        let error = index
1064            .add_documents(documents!([
1065              { "id": 0, RESERVED_GEO_FIELD_NAME: { "lat": "lol", "lng": 42 } }
1066            ]))
1067            .unwrap_err();
1068        assert_eq!(
1069            &error.to_string(),
1070            r#"Could not parse latitude in the document with the id: `"0"`. Was expecting a finite number but instead got `"lol"`."#
1071        );
1072
1073        let error = index
1074            .add_documents(documents!([
1075              { "id": 0, RESERVED_GEO_FIELD_NAME: { "lat": [12, 13], "lng": 42 } }
1076            ]))
1077            .unwrap_err();
1078        assert_eq!(
1079            &error.to_string(),
1080            r#"Could not parse latitude in the document with the id: `"0"`. Was expecting a finite number but instead got `[12,13]`."#
1081        );
1082
1083        let error = index
1084            .add_documents(documents!([
1085              { "id": 0, RESERVED_GEO_FIELD_NAME: { "lat": 12, "lng": "hello" } }
1086            ]))
1087            .unwrap_err();
1088        assert_eq!(
1089            &error.to_string(),
1090            r#"Could not parse longitude in the document with the id: `"0"`. Was expecting a finite number but instead got `"hello"`."#
1091        );
1092    }
1093
1094    #[test]
1095    fn delete_documents_then_insert() {
1096        let index = TempIndex::new();
1097
1098        index
1099            .add_documents(documents!([
1100                { "objectId": 123, "title": "Pride and Prejudice", "comment": "A great book" },
1101                { "objectId": 456, "title": "Le Petit Prince",     "comment": "A french book" },
1102                { "objectId": 1,   "title": "Alice In Wonderland", "comment": "A weird book" },
1103                { "objectId": 30,  "title": "Hamlet", RESERVED_GEO_FIELD_NAME: { "lat": 12, "lng": 89 } }
1104            ]))
1105            .unwrap();
1106
1107        // Delete not all of the documents but some of them.
1108        index.delete_document("30");
1109
1110        let txn = index.read_txn().unwrap();
1111        assert_eq!(index.primary_key(&txn).unwrap(), Some("objectId"));
1112
1113        let external_documents_ids = index.external_documents_ids();
1114        assert!(external_documents_ids.get(&txn, "30").unwrap().is_none());
1115
1116        index
1117            .add_documents(documents!([
1118                { "objectId": 30,  "title": "Hamlet", RESERVED_GEO_FIELD_NAME: { "lat": 12, "lng": 89 } }
1119            ]))
1120            .unwrap();
1121
1122        let wtxn = index.write_txn().unwrap();
1123        let external_documents_ids = index.external_documents_ids();
1124        assert!(external_documents_ids.get(&wtxn, "30").unwrap().is_some());
1125        wtxn.commit().unwrap();
1126
1127        index
1128            .add_documents(documents!([
1129                { "objectId": 30,  "title": "Hamlet", RESERVED_GEO_FIELD_NAME: { "lat": 12, "lng": 89 } }
1130            ]))
1131            .unwrap();
1132    }
1133
1134    #[test]
1135    fn index_more_than_256_fields() {
1136        let index = TempIndex::new();
1137
1138        let mut big_object = serde_json::Map::new();
1139        big_object.insert(S("id"), serde_json::Value::from("wow"));
1140        for i in 0..1000 {
1141            let key = i.to_string();
1142            big_object.insert(key, serde_json::Value::from("I am a text!"));
1143        }
1144
1145        let documents = mmap_from_objects([big_object]);
1146        index.add_documents(documents).unwrap();
1147    }
1148
1149    #[test]
1150    fn index_more_than_1000_positions_in_a_field() {
1151        let index = TempIndex::new_with_map_size(4096 * 100_000); // 400 MB
1152        let mut content = String::with_capacity(382101);
1153        for i in 0..=u16::MAX {
1154            content.push_str(&format!("{i} "));
1155        }
1156        index
1157            .add_documents(documents!({
1158                "id": "wow",
1159                "content": content
1160            }))
1161            .unwrap();
1162
1163        let rtxn = index.read_txn().unwrap();
1164
1165        assert!(index.word_docids.get(&rtxn, "0").unwrap().is_some());
1166        assert!(index.word_docids.get(&rtxn, "64").unwrap().is_some());
1167        assert!(index.word_docids.get(&rtxn, "256").unwrap().is_some());
1168        assert!(index.word_docids.get(&rtxn, "1024").unwrap().is_some());
1169        assert!(index.word_docids.get(&rtxn, "32768").unwrap().is_some());
1170        assert!(index.word_docids.get(&rtxn, "65535").unwrap().is_some());
1171    }
1172
1173    #[test]
1174    fn index_documents_with_zeroes() {
1175        let index = TempIndex::new();
1176
1177        index
1178            .add_documents(documents!([
1179                {
1180                    "id": 2,
1181                    "title": "Prideand Prejudice",
1182                    "au{hor": "Jane Austin",
1183                    "genre": "romance",
1184                    "price$": "3.5$",
1185                },
1186                {
1187                    "id": 456,
1188                    "title": "Le Petit Prince",
1189                    "au{hor": "Antoine de Saint-Exupéry",
1190                    "genre": "adventure",
1191                    "price$": "10.0$",
1192                },
1193                {
1194                    "id": 1,
1195                    "title": "Wonderland",
1196                    "au{hor": "Lewis Carroll",
1197                    "genre": "fantasy",
1198                    "price$": "25.99$",
1199                },
1200                {
1201                    "id": 4,
1202                    "title": "Harry Potter ing fantasy\0lood Prince",
1203                    "au{hor": "J. K. Rowling",
1204                    "genre": "fantasy\0",
1205                },
1206            ]))
1207            .unwrap();
1208    }
1209
1210    #[test]
1211    fn index_documents_with_nested_fields() {
1212        let index = TempIndex::new();
1213
1214        index
1215            .add_documents(documents!([
1216                {
1217                    "id": 0,
1218                    "title": "The zeroth document",
1219                },
1220                {
1221                    "id": 1,
1222                    "title": "The first document",
1223                    "nested": {
1224                        "object": "field",
1225                        "machin": "bidule",
1226                    },
1227                },
1228                {
1229                    "id": 2,
1230                    "title": "The second document",
1231                    "nested": [
1232                        "array",
1233                        {
1234                            "object": "field",
1235                        },
1236                        {
1237                            "prout": "truc",
1238                            "machin": "lol",
1239                        },
1240                    ],
1241                },
1242                {
1243                    "id": 3,
1244                    "title": "The third document",
1245                    "nested": "I lied",
1246                },
1247            ]))
1248            .unwrap();
1249
1250        index
1251            .update_settings(|settings| {
1252                let searchable_fields = vec![S("title"), S("nested.object"), S("nested.machin")];
1253                settings.set_searchable_fields(searchable_fields);
1254
1255                let faceted_fields = vec![
1256                    FilterableAttributesRule::Field("title".to_string()),
1257                    FilterableAttributesRule::Field("nested.object".to_string()),
1258                    FilterableAttributesRule::Field("nested.machin".to_string()),
1259                ];
1260                settings.set_filterable_fields(faceted_fields);
1261            })
1262            .unwrap();
1263
1264        let rtxn = index.read_txn().unwrap();
1265
1266        // testing the simple query search
1267        let mut search = crate::Search::new(&rtxn, &index);
1268        search.query("document");
1269        search.terms_matching_strategy(TermsMatchingStrategy::default());
1270        // all documents should be returned
1271        let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
1272        assert_eq!(documents_ids.len(), 4);
1273
1274        search.query("zeroth");
1275        let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
1276        assert_eq!(documents_ids, vec![0]);
1277        search.query("first");
1278        let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
1279        assert_eq!(documents_ids, vec![1]);
1280        search.query("second");
1281        let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
1282        assert_eq!(documents_ids, vec![2]);
1283        search.query("third");
1284        let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
1285        assert_eq!(documents_ids, vec![3]);
1286
1287        search.query("field");
1288        let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
1289        assert_eq!(documents_ids, vec![1, 2]);
1290
1291        search.query("lol");
1292        let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
1293        assert_eq!(documents_ids, vec![2]);
1294
1295        search.query("object");
1296        let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
1297        assert!(documents_ids.is_empty());
1298
1299        search.query("array");
1300        let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
1301        assert!(documents_ids.is_empty()); // nested is not searchable
1302
1303        search.query("lied");
1304        let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
1305        assert!(documents_ids.is_empty()); // nested is not searchable
1306
1307        // testing the filters
1308        let mut search = crate::Search::new(&rtxn, &index);
1309        search.filter(crate::Filter::from_str(r#"title = "The first document""#).unwrap().unwrap());
1310        let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
1311        assert_eq!(documents_ids, vec![1]);
1312
1313        search.filter(crate::Filter::from_str(r#"nested.object = field"#).unwrap().unwrap());
1314        let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
1315        assert_eq!(documents_ids, vec![1, 2]);
1316
1317        search.filter(crate::Filter::from_str(r#"nested.machin = bidule"#).unwrap().unwrap());
1318        let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
1319        assert_eq!(documents_ids, vec![1]);
1320
1321        search.filter(crate::Filter::from_str(r#"nested = array"#).unwrap().unwrap());
1322        let error = search.execute().map(|_| unreachable!()).unwrap_err(); // nested is not filterable
1323        assert!(matches!(error, crate::Error::UserError(crate::UserError::InvalidFilter(_))));
1324
1325        search.filter(crate::Filter::from_str(r#"nested = "I lied""#).unwrap().unwrap());
1326        let error = search.execute().map(|_| unreachable!()).unwrap_err(); // nested is not filterable
1327        assert!(matches!(error, crate::Error::UserError(crate::UserError::InvalidFilter(_))));
1328    }
1329
1330    #[test]
1331    fn index_documents_with_nested_primary_key() {
1332        let index = TempIndex::new();
1333
1334        index
1335            .update_settings(|settings| {
1336                settings.set_primary_key("complex.nested.id".to_owned());
1337            })
1338            .unwrap();
1339
1340        index
1341            .add_documents(documents!([
1342                {
1343                    "complex": {
1344                        "nested": {
1345                            "id": 0,
1346                        },
1347                    },
1348                    "title": "The zeroth document",
1349                },
1350                {
1351                    "complex.nested": {
1352                        "id": 1,
1353                    },
1354                    "title": "The first document",
1355                },
1356                {
1357                    "complex": {
1358                        "nested.id": 2,
1359                    },
1360                    "title": "The second document",
1361                },
1362                {
1363                    "complex.nested.id": 3,
1364                    "title": "The third document",
1365                },
1366            ]))
1367            .unwrap();
1368
1369        let rtxn = index.read_txn().unwrap();
1370
1371        // testing the simple query search
1372        let mut search = crate::Search::new(&rtxn, &index);
1373        search.query("document");
1374        search.terms_matching_strategy(TermsMatchingStrategy::default());
1375        // all documents should be returned
1376        let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
1377        assert_eq!(documents_ids.len(), 4);
1378
1379        search.query("zeroth");
1380        let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
1381        assert_eq!(documents_ids, vec![0]);
1382        search.query("first");
1383        let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
1384        assert_eq!(documents_ids, vec![1]);
1385        search.query("second");
1386        let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
1387        assert_eq!(documents_ids, vec![2]);
1388        search.query("third");
1389        let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
1390        assert_eq!(documents_ids, vec![3]);
1391    }
1392
1393    #[test]
1394    fn retrieve_a_b_nested_document_id() {
1395        let index = TempIndex::new();
1396
1397        index
1398            .update_settings(|settings| {
1399                settings.set_primary_key("a.b".to_owned());
1400            })
1401            .unwrap();
1402
1403        // There must be an issue with the primary key no present in the given document
1404        index.add_documents(documents!({ "a" : { "b" : { "c" :  1 }}})).unwrap_err();
1405    }
1406
1407    #[test]
1408    fn retrieve_a_b_c_nested_document_id() {
1409        let index = TempIndex::new();
1410
1411        index
1412            .update_settings(|settings| {
1413                settings.set_primary_key("a.b.c".to_owned());
1414            })
1415            .unwrap();
1416        index.add_documents(documents!({ "a" : { "b" : { "c" :  1 }}})).unwrap();
1417
1418        let rtxn = index.read_txn().unwrap();
1419        let all_documents_count = index.all_documents(&rtxn).unwrap().count();
1420        assert_eq!(all_documents_count, 1);
1421        let external_documents_ids = index.external_documents_ids();
1422        assert!(external_documents_ids.get(&rtxn, "1").unwrap().is_some());
1423    }
1424
1425    #[test]
1426    fn test_facets_generation() {
1427        let index = TempIndex::new();
1428
1429        index
1430            .add_documents(documents!([
1431                {
1432                    "id": 0,
1433                    "dog": {
1434                        "race": {
1435                            "bernese mountain": "zeroth",
1436                        },
1437                    },
1438                },
1439                {
1440                    "id": 1,
1441                    "dog.race": {
1442                        "bernese mountain": "first",
1443                    },
1444                },
1445                {
1446                    "id": 2,
1447                    "dog.race.bernese mountain": "second",
1448                },
1449                {
1450                    "id": 3,
1451                    "dog": {
1452                        "race.bernese mountain": "third"
1453                    },
1454                },
1455            ]))
1456            .unwrap();
1457
1458        index
1459            .update_settings(|settings| {
1460                settings.set_filterable_fields(vec![FilterableAttributesRule::Field(
1461                    "dog".to_string(),
1462                )]);
1463            })
1464            .unwrap();
1465
1466        db_snap!(index, facet_id_string_docids, @r###"
1467        3   0  first        1  [1, ]
1468        3   0  second       1  [2, ]
1469        3   0  third        1  [3, ]
1470        3   0  zeroth       1  [0, ]
1471        "###);
1472        db_snap!(index, field_id_docid_facet_strings, @r###"
1473        3   0    zeroth       zeroth
1474        3   1    first        first
1475        3   2    second       second
1476        3   3    third        third
1477        "###);
1478
1479        let rtxn = index.read_txn().unwrap();
1480
1481        for (s, i) in [("zeroth", 0), ("first", 1), ("second", 2), ("third", 3)] {
1482            let mut search = crate::Search::new(&rtxn, &index);
1483            let filter = format!(r#""dog.race.bernese mountain" = {s}"#);
1484            search.filter(crate::Filter::from_str(&filter).unwrap().unwrap());
1485            let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
1486            assert_eq!(documents_ids, vec![i]);
1487        }
1488        // Reset the settings
1489        index
1490            .update_settings(|settings| {
1491                settings.reset_filterable_fields();
1492            })
1493            .unwrap();
1494
1495        db_snap!(index, facet_id_string_docids, @"");
1496        db_snap!(index, field_id_docid_facet_strings, @"");
1497
1498        // update the settings to test the sortable
1499        index
1500            .update_settings(|settings| {
1501                settings.set_sortable_fields(hashset!(S("dog.race")));
1502            })
1503            .unwrap();
1504
1505        db_snap!(index, facet_id_string_docids, @r###"
1506        3   0  first        1  [1, ]
1507        3   0  second       1  [2, ]
1508        3   0  third        1  [3, ]
1509        3   0  zeroth       1  [0, ]
1510        "###);
1511        db_snap!(index, field_id_docid_facet_strings, @r###"
1512        3   0    zeroth       zeroth
1513        3   1    first        first
1514        3   2    second       second
1515        3   3    third        third
1516        "###);
1517
1518        let rtxn = index.read_txn().unwrap();
1519
1520        let mut search = crate::Search::new(&rtxn, &index);
1521        search.sort_criteria(vec![crate::AscDesc::Asc(crate::Member::Field(S(
1522            "dog.race.bernese mountain",
1523        )))]);
1524        let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
1525        assert_eq!(documents_ids, vec![1, 2, 3, 0]);
1526    }
1527
/// Sends the same four documents twice, with an empty batch in between,
/// and checks that the index reports four documents after every step.
#[test]
fn index_2_times_documents_split_by_zero_document_indexation() {
    let index = TempIndex::new();

    // Builds a fresh copy of the four-document batch on every call.
    let batch = || {
        documents!([
            {"id": 0, "name": "Kerollmops", "score": 78},
            {"id": 1, "name": "ManyTheFish", "score": 75},
            {"id": 2, "name": "Ferdi", "score": 39},
            {"id": 3, "name": "Tommy", "score": 33}
        ])
    };

    // Reads the number of documents through a fresh read transaction.
    let document_count = || {
        let rtxn = index.read_txn().unwrap();
        index.number_of_documents(&rtxn).unwrap()
    };

    // First indexation: there are 4 documents now.
    index.add_documents(batch()).unwrap();
    assert_eq!(document_count(), 4);

    // An empty batch must leave the 4 documents in place.
    index.add_documents(documents!([])).unwrap();
    assert_eq!(document_count(), 4);

    // Sending the same documents again must still give 4 documents.
    index.add_documents(batch()).unwrap();
    assert_eq!(document_count(), 4);
}
1567
/// Indexes two documents with Chinese titles, then checks the word database
/// entries and the number of hits returned for the query `化妆包`.
#[cfg(feature = "chinese")]
#[test]
fn test_meilisearch_1714() {
    let index = TempIndex::new();

    let batch = documents!([
      {"id": "123", "title": "小化妆包" },
      {"id": "456", "title": "Ipad 包" }
    ]);
    index.add_documents(batch).unwrap();

    let rtxn = index.read_txn().unwrap();

    // Each of these two words must be attached to exactly one document.
    for word in ["huàzhuāngbāo", "bāo"] {
        let docids = index.word_docids.get(&rtxn, word).unwrap().unwrap();
        assert_eq!(docids.len(), 1);
    }

    let mut search = crate::Search::new(&rtxn, &index);
    search.query("化妆包");
    search.terms_matching_strategy(TermsMatchingStrategy::default());

    // The search must return a single document.
    let result = search.execute().unwrap();
    assert_eq!(result.documents_ids.len(), 1);
}
1598
/// Indexing documents whose titles are made of very long words
/// must succeed without returning any error.
#[test]
fn text_with_too_long_words() {
    let index = TempIndex::new();

    // Two 250-character words separated by a single space.
    let two_long_words = format!("{} {}", "c".repeat(250), "d".repeat(250));

    let batch = documents!([
      {"id": 1, "title": "a".repeat(256) },
      {"id": 2, "title": "b".repeat(512) },
      {"id": 3, "title": two_long_words },
    ]);

    index.add_documents(batch).unwrap();
}
1613
/// Indexes 200 documents sharing one very long `script` value, then a single
/// extra document with the same value; both indexations must succeed.
#[test]
fn text_with_too_long_keys() {
    let index = TempIndex::new();
    let script = "https://bug.example.com/meilisearch/milli.saml2?ROLE=Programmer-1337&SAMLRequest=Cy1ytcZT1Po%2L2IY2y9Unru8rgnW4qWfPiI0EpT7P8xjJV8PeQikRL%2E8D9A4pj9tmbymbQCQwGmGjPMK7qwXFPX4DH52JO2b7n6TXjuR7zkIFuYdzdY2rwRNBPgCL7ihclEm9zyIjKZQ%2JTqiwfXxWjnI0KEYQYHdwd6Q%2Fx%28BDLNsvmL54CCY2F4RWeRs4eqWfn%2EHqxlhreFzax4AiQ2tgOtV5thOaaWqrhZD%2Py70nuyZWNTKwciGI43AoHg6PThANsQ5rAY5amzN%2ufbs1swETUXlLZuOut5YGpYPZfY6STJWNp4QYSUOUXBZpdElYsH7UHZ7VhJycgyt%28aTK0GW6GbKne2tJM0hgSczOqndg6RFa9WsnSBi4zMcaEfYur4WlSsHDYInF9ROousKqVMZ6H8%2gbUissaLh1eXRGo8KEJbyEHbhVVKGD%28kx4cfKjx9fT3pkeDTdvDrVn25jIzi9wHyt9l1lWc8ICnCvXCVUPP%2BjBG4wILR29gMV9Ux2QOieQm2%2Fycybhr8sBGCl30mHC7blvWt%2T3mrCHQoS3VK49PZNPqBZO9C7vOjOWoszNkJx4QckWV%2FZFvbpzUUkiBiehr9F%2FvQSxz9lzv68GwbTu9fr638p%2FQM%3D&RelayState=https%3A%2F%example.bug.com%2Fde&SigAlg=http%3A%2F%2Fwww.w3.org%2F2000%2F09%2Fxmldsig%23rsa-sha1&Signature=AZFpkhFFII7PodiewTovaGnLQKUVZp0qOCCcBIUkJ6P5by3lE3Lldj9pKaFu4wz4j%2B015HEhDvF0LlAmwwES85vdGh%2FpD%2cIQPRUEjdCbQkQDd3dy1mMXbpXxSe4QYcv9Ni7tqNTQxekpO1gE7rtg6zC66EU55uM9aj9abGQ034Vly%2F6IJ08bvAq%2B%2FB9KruLstuiNWnlXTfNGsOxGLK7%2BXr94LTkat8m%2FMan6Qr95%2KeR5TmmqaQIE4N9H6o4TopT7mXr5CF2Z3";

    // Turns an id into a JSON object carrying the long text.
    let to_object = |id: i32| match serde_json::json!({ "id": id, "script": script }) {
        serde_json::Value::Object(object) => Some(object),
        _ => None,
    };

    // Index 200 documents carrying the long text.
    let bulk = mmap_from_objects((0..200i32).filter_map(to_object));
    index.add_documents(bulk).unwrap();

    // Index one more document carrying the same long text.
    let single = documents!([
      {"id": 400, "script": script },
    ]);
    index.add_documents(single).unwrap();
}
1639
/// Adds two documents in two separate indexations and checks that every
/// entry of the external documents ids map holds a distinct value.
#[test]
fn index_documents_in_multiple_transforms() {
    let index = TempIndex::new();

    // Declare `id` as the primary key in its own committed transaction.
    {
        let mut wtxn = index.write_txn().unwrap();
        index.put_primary_key(&mut wtxn, "id").unwrap();
        wtxn.commit().unwrap();
    }

    let first = documents!([{
        "id": 228142,
        "title": "asdsad",
        "state": "automated",
        "priority": "normal",
        "public_uid": "37ccf021",
        "project_id": 78207,
        "branch_id_number": 0
    }]);
    let second = documents!([{
        "id": 228143,
        "title": "something",
        "state": "automated",
        "priority": "normal",
        "public_uid": "39c6499b",
        "project_id": 78207,
        "branch_id_number": 0
    }]);

    index.add_documents(first).unwrap();
    index.add_documents(second).unwrap();

    let rtxn = index.read_txn().unwrap();
    let id_map = index.external_documents_ids().to_hash_map(&rtxn).unwrap();

    // As many distinct values as entries means no value is shared.
    let distinct_values: HashSet<_> = id_map.values().collect();
    assert_eq!(distinct_values.len(), id_map.len());
}
1680
/// Checks the content of `facet_id_exists_docids` for `colour` and its nested
/// fields, whether the filterable setting is applied after or before the
/// documents are added.
#[test]
fn index_documents_check_exists_database() {
    let content = || {
        documents!([
            { "id": 0, "colour": 0, },
            { "id": 1, "colour": [] },
            { "id": 2, "colour": {} },
            { "id": 3, "colour": null },
            { "id": 4, "colour": [1] },
            { "id": 5 },
            { "id": 6, "colour": { "green": 1 } },
            { "id": 7, "colour": { "green": { "blue": [] } } }
        ])
    };

    // Compares the stored bitmap of each field against the expected docids.
    let check_ok = |index: &Index| {
        let rtxn = index.read_txn().unwrap();
        let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();

        // Resolve the three field ids before reading any bitmap.
        let [colour_id, colour_green_id, colour_green_blue_id] =
            ["colour", "colour.green", "colour.green.blue"]
                .map(|name| fields_ids_map.id(name).unwrap());

        let expectations = [
            (colour_id, vec![0, 1, 2, 3, 4, 6, 7]),
            (colour_green_id, vec![6, 7]),
            (colour_green_blue_id, vec![7]),
        ];
        for (field_id, expected) in expectations {
            let bitmap = index.facet_id_exists_docids.get(&rtxn, &field_id).unwrap().unwrap();
            assert_eq!(bitmap.into_iter().collect::<Vec<_>>(), expected);
        }
    };

    let faceted_fields = vec![FilterableAttributesRule::Field("colour".to_string())];

    // First pass: documents, then settings. Second pass: settings, then documents.
    for settings_first in [false, true] {
        let index = TempIndex::new();
        let make_filterable = || {
            index
                .update_settings(|settings| {
                    settings.set_filterable_fields(faceted_fields.clone());
                })
                .unwrap();
        };

        if settings_first {
            make_filterable();
            index.add_documents(content()).unwrap();
        } else {
            index.add_documents(content()).unwrap();
            make_filterable();
        }
        check_ok(&index);
    }
}
1766
/// Checks the content of `facet_id_is_null_docids` for `colour` and its nested
/// fields, whether the filterable setting is applied after or before the
/// documents are added.
#[test]
fn index_documents_check_is_null_database() {
    let content = || {
        documents!([
            { "id": 0, "colour": null, },
            { "id": 1, "colour": [null], }, // must not be returned
            { "id": 6, "colour": { "green": null } },
            { "id": 7, "colour": { "green": { "blue": null } } },
            { "id": 8, "colour": 0, },
            { "id": 9, "colour": [] },
            { "id": 10, "colour": {} },
            { "id": 12, "colour": [1] },
            { "id": 13 },
            { "id": 14, "colour": { "green": 1 } },
            { "id": 15, "colour": { "green": { "blue": [] } } }
        ])
    };

    // Compares the stored bitmap of each field against the expected docids.
    let check_ok = |index: &Index| {
        let rtxn = index.read_txn().unwrap();
        let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();

        // Resolve the three field ids before reading any bitmap.
        let [colour_id, colour_green_id, colour_blue_id] =
            ["colour", "colour.green", "colour.green.blue"]
                .map(|name| fields_ids_map.id(name).unwrap());

        // NOTE(review): the expected values look like internal docids assigned in
        // insertion order (documents "6" and "7" come third and fourth) — confirm.
        let expectations = [
            (colour_id, vec![0]),
            (colour_green_id, vec![2]),
            (colour_blue_id, vec![3]),
        ];
        for (field_id, expected) in expectations {
            let bitmap = index.facet_id_is_null_docids.get(&rtxn, &field_id).unwrap().unwrap();
            assert_eq!(bitmap.into_iter().collect::<Vec<_>>(), expected);
        }
    };

    let faceted_fields = vec![FilterableAttributesRule::Field("colour".to_string())];

    // First pass: documents, then settings. Second pass: settings, then documents.
    for settings_first in [false, true] {
        let index = TempIndex::new();
        let make_filterable = || {
            index
                .update_settings(|settings| {
                    settings.set_filterable_fields(faceted_fields.clone());
                })
                .unwrap();
        };

        if settings_first {
            make_filterable();
            index.add_documents(content()).unwrap();
        } else {
            index.add_documents(content()).unwrap();
            make_filterable();
        }
        check_ok(&index);
    }
}
1870
/// Checks the content of `facet_id_is_empty_docids` for `tags` and its nested
/// fields, whether the filterable setting is applied after or before the
/// documents are added.
#[test]
fn index_documents_check_is_empty_database() {
    let content = || {
        documents!([
            {"id": 0, "tags": null },
            {"id": 1, "tags": [null] },
            {"id": 2, "tags": [] },
            {"id": 3, "tags": ["hello","world"] },
            {"id": 4, "tags": [""] },
            {"id": 5 },
            {"id": 6, "tags": {} },
            {"id": 7, "tags": {"green": "cool"} },
            {"id": 8, "tags": {"green": ""} },
            {"id": 9, "tags": "" },
            {"id": 10, "tags": { "green": null } },
            {"id": 11, "tags": { "green": { "blue": null } } },
            {"id": 12, "tags": { "green": { "blue": [] } } }
        ])
    };

    // Compares the stored bitmap of each field against the expected docids.
    let check_ok = |index: &Index| {
        let rtxn = index.read_txn().unwrap();
        let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();

        // Resolve the three field ids before reading any bitmap.
        let [tags_id, tags_green_id, tags_blue_id] =
            ["tags", "tags.green", "tags.green.blue"].map(|name| fields_ids_map.id(name).unwrap());

        let expectations = [
            (tags_id, vec![2, 6, 9]),
            (tags_green_id, vec![8]),
            (tags_blue_id, vec![12]),
        ];
        for (field_id, expected) in expectations {
            let bitmap = index.facet_id_is_empty_docids.get(&rtxn, &field_id).unwrap().unwrap();
            assert_eq!(bitmap.into_iter().collect::<Vec<_>>(), expected);
        }
    };

    let faceted_fields = vec![FilterableAttributesRule::Field("tags".to_string())];

    // First pass: documents, then settings. Second pass: settings, then documents.
    for settings_first in [false, true] {
        let index = TempIndex::new();
        let make_filterable = || {
            index
                .update_settings(|settings| {
                    settings.set_filterable_fields(faceted_fields.clone());
                })
                .unwrap();
        };

        if settings_first {
            make_filterable();
            index.add_documents(content()).unwrap();
        } else {
            index.add_documents(content()).unwrap();
            make_filterable();
        }
        check_ok(&index);
    }
}
1931
/// Registers four single-document payloads — one with an integer id and three
/// whose id is written as a float — and expects exactly one payload without
/// error and three with an error.
#[test]
fn primary_key_must_not_contain_floats() {
    let index = TempIndex::new_with_map_size(4096 * 100);

    let payloads = [
        documents!([{
            "id": -228142,
            "title": "asdsad",
        }]),
        documents!([{
            "id": 228143.56,
            "title": "something",
        }]),
        documents!([{
            "id": -228143.56,
            "title": "something",
        }]),
        documents!([{
            "id": 2.0,
            "title": "something",
        }]),
    ];

    let rtxn = index.inner.read_txn().unwrap();
    let mut new_fields_ids_map = index.inner.fields_ids_map(&rtxn).unwrap();

    let mut operation = indexer::DocumentOperation::new();
    for payload in &payloads {
        operation.replace_documents(payload).unwrap();
    }

    let indexer_alloc = Bump::new();
    let (_document_changes, operation_stats, _primary_key) = operation
        .into_changes(
            &indexer_alloc,
            &index.inner,
            &rtxn,
            None,
            &mut new_fields_ids_map,
            &|| false,
            Progress::default(),
        )
        .unwrap();

    let accepted = operation_stats.iter().filter(|stats| stats.error.is_none()).count();
    let rejected = operation_stats.iter().filter(|stats| stats.error.is_some()).count();
    assert_eq!(accepted, 1);
    assert_eq!(rejected, 3);
}
1982
/// Replaces document `1` once, then applies three partial updates to it in the
/// same operation, and checks the resulting stored document.
#[test]
fn mixing_documents_replace_with_updates() {
    let index = TempIndex::new_with_map_size(4096 * 100);

    let replacement = documents!([{
        "id": 1,
        "title": "asdsad",
        "description": "Wat wat wat, wat"
    }]);
    let partial_updates = [
        documents!([{
            "id": 1,
            "title": "something",
        }]),
        documents!([{
            "id": 1,
            "title": "another something",
        }]),
        documents!([{
            "id": 1,
            "description": "This is it!",
        }]),
    ];

    let rtxn = index.inner.read_txn().unwrap();
    let db_fields_ids_map = index.inner.fields_ids_map(&rtxn).unwrap();
    let mut new_fields_ids_map = db_fields_ids_map.clone();

    let mut operation = indexer::DocumentOperation::new();
    operation.replace_documents(&replacement).unwrap();
    for payload in &partial_updates {
        operation.update_documents(payload).unwrap();
    }

    let indexer_alloc = Bump::new();
    let (document_changes, operation_stats, primary_key) = operation
        .into_changes(
            &indexer_alloc,
            &index.inner,
            &rtxn,
            None,
            &mut new_fields_ids_map,
            &|| false,
            Progress::default(),
        )
        .unwrap();

    // None of the four payloads may carry an error.
    let accepted = operation_stats.iter().filter(|stats| stats.error.is_none()).count();
    assert_eq!(accepted, 4);

    let pool = crate::ThreadPoolNoAbortBuilder::new().build().unwrap();
    let mut wtxn = index.write_txn().unwrap();
    indexer::index(
        &mut wtxn,
        &index.inner,
        &pool,
        index.indexer_config.grenad_parameters(),
        &db_fields_ids_map,
        new_fields_ids_map,
        primary_key,
        &document_changes,
        EmbeddingConfigs::default(),
        &|| false,
        &Progress::default(),
    )
    .unwrap();
    wtxn.commit().unwrap();

    // Read back document 0 and compare it with the expected merged content.
    let rtxn = index.read_txn().unwrap();
    let obkv = index.document(&rtxn, 0).unwrap();
    let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
    let json_document = all_obkv_to_json(obkv, &fields_ids_map).unwrap();

    let expected = serde_json::json!({
        "id": 1,
        "title": "another something",
        "description": "This is it!",
    });
    assert_eq!(&json_document, expected.as_object().unwrap());
}
2063
/// Interleaves two replacements and three partial updates of document `1` in a
/// single operation, and checks the resulting stored document.
#[test]
fn mixing_documents_replace_with_updates_even_more() {
    let index = TempIndex::new_with_map_size(4096 * 100);

    let first_replacement = documents!([{
        "id": 1,
        "title": "asdsad",
        "description": "Wat wat wat, wat"
    }]);
    let first_update = documents!([{
        "id": 1,
        "title": "something",
    }]);
    let second_update = documents!([{
        "id": 1,
        "title": "another something",
    }]);
    let second_replacement = documents!([{
        "id": 1,
        "title": "Woooof",
    }]);
    let last_update = documents!([{
        "id": 1,
        "description": "This is it!",
    }]);

    let rtxn = index.inner.read_txn().unwrap();
    let db_fields_ids_map = index.inner.fields_ids_map(&rtxn).unwrap();
    let mut new_fields_ids_map = db_fields_ids_map.clone();

    // Register the payloads in order: replace, update, update, replace, update.
    let mut operation = indexer::DocumentOperation::new();
    operation.replace_documents(&first_replacement).unwrap();
    operation.update_documents(&first_update).unwrap();
    operation.update_documents(&second_update).unwrap();
    operation.replace_documents(&second_replacement).unwrap();
    operation.update_documents(&last_update).unwrap();

    let indexer_alloc = Bump::new();
    let (document_changes, operation_stats, primary_key) = operation
        .into_changes(
            &indexer_alloc,
            &index.inner,
            &rtxn,
            None,
            &mut new_fields_ids_map,
            &|| false,
            Progress::default(),
        )
        .unwrap();

    // None of the five payloads may carry an error.
    let accepted = operation_stats.iter().filter(|stats| stats.error.is_none()).count();
    assert_eq!(accepted, 5);

    let pool = crate::ThreadPoolNoAbortBuilder::new().build().unwrap();
    let mut wtxn = index.write_txn().unwrap();
    indexer::index(
        &mut wtxn,
        &index.inner,
        &pool,
        index.indexer_config.grenad_parameters(),
        &db_fields_ids_map,
        new_fields_ids_map,
        primary_key,
        &document_changes,
        EmbeddingConfigs::default(),
        &|| false,
        &Progress::default(),
    )
    .unwrap();
    wtxn.commit().unwrap();

    // Read back document 0 and compare it with the expected merged content.
    let rtxn = index.read_txn().unwrap();
    let obkv = index.document(&rtxn, 0).unwrap();
    let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
    let json_document = all_obkv_to_json(obkv, &fields_ids_map).unwrap();

    let expected = serde_json::json!({
        "id": 1,
        "title": "Woooof",
        "description": "This is it!",
    });
    assert_eq!(&json_document, expected.as_object().unwrap());
}
2150
/// Adding a document whose `id` starts with a space, tab, carriage return or
/// line feed must fail.
#[test]
fn primary_key_must_not_contain_whitespace() {
    let index = TempIndex::new();

    let cases = [(" 1", "asdsad"), ("\t2", "something"), ("\r3", "something"), ("\n4", "something")];

    for (id, title) in cases {
        let batch = documents!([{
            "id": id,
            "title": title,
        }]);
        index.add_documents(batch).unwrap_err();
    }
}
2180
/// Covers three cases of primary key inference: no candidate field, several
/// candidate fields, and a single candidate (`id`).
#[test]
fn primary_key_inference() {
    let index = TempIndex::new();

    // No candidate: the addition must fail with `NoPrimaryKeyCandidateFound`.
    let doc_no_id = documents!([{
        "title": "asdsad",
        "state": "automated",
        "priority": "normal",
        "branch_id_number": 0
    }]);
    let outcome = index.add_documents(doc_no_id);
    assert!(matches!(outcome, Err(Error::UserError(UserError::NoPrimaryKeyCandidateFound))));

    // Several candidates: the error must list all of them.
    let doc_multiple_ids = documents!([{
        "id": 228143,
        "title": "something",
        "state": "automated",
        "priority": "normal",
        "public_uid": "39c6499b",
        "project_id": 78207,
        "branch_id_number": 0
    }]);
    let candidates = match index.add_documents(doc_multiple_ids) {
        Err(Error::UserError(UserError::MultiplePrimaryKeyCandidatesFound { candidates })) => {
            candidates
        }
        _ => panic!("Expected Error::UserError(MultiplePrimaryKeyCandidatesFound)"),
    };
    assert_eq!(candidates, vec![S("id"), S("project_id"), S("public_uid"),]);

    // Single candidate: the addition succeeds and `id` becomes the primary key.
    let doc_inferable = documents!([{
        "video": "test.mp4",
        "id": 228143,
        "title": "something",
        "state": "automated",
        "priority": "normal",
        "public_uid_": "39c6499b",
        "project_id_": 78207,
        "branch_id_number": 0
    }]);
    index.add_documents(doc_inferable).unwrap();

    let txn = index.read_txn().unwrap();
    let primary_key = index.primary_key(&txn).unwrap().unwrap();
    assert_eq!(primary_key, "id");
}
2231
/// A 3000-character word used as a title must not end up in the words FST.
#[test]
fn long_words_must_be_skipped() {
    let index = TempIndex::new();

    // "lol" repeated 1000 times: this is obviously too long.
    let long_word = "lol".repeat(1000);
    let batch = documents!([{
        "id": "1",
        "title": long_word,
    }]);
    index.add_documents(batch).unwrap();

    let rtxn = index.read_txn().unwrap();
    let is_indexed = index.words_fst(&rtxn).unwrap().contains(&long_word);
    assert!(!is_indexed);
}
2249
/// Adding a document whose filterable `title` is a 3000-character string
/// must succeed.
#[test]
fn long_facet_values_must_not_crash() {
    let index = TempIndex::new();

    // Make `title` filterable before any document is added.
    let filterable = vec![FilterableAttributesRule::Field("title".to_string())];
    index
        .update_settings(|settings| {
            settings.set_filterable_fields(filterable);
        })
        .unwrap();

    // "lol" repeated 1000 times: this is obviously too long.
    let long_word = "lol".repeat(1000);
    let batch = documents!([{
        "id": "1",
        "title": long_word,
    }]);

    index.add_documents(batch).unwrap();
}
2271
/// Replaces three documents and deletes document `2` within the same
/// operation, then snapshots the stored documents.
#[test]
fn add_and_delete_documents_in_single_transform() {
    let mut index = TempIndex::new();
    index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments;

    let mut wtxn = index.write_txn().unwrap();
    let rtxn = index.inner.read_txn().unwrap();
    let db_fields_ids_map = index.inner.fields_ids_map(&rtxn).unwrap();
    let mut new_fields_ids_map = db_fields_ids_map.clone();

    let payload = documents!([
        { "id": 1, "doggo": "kevin" },
        { "id": 2, "doggo": { "name": "bob", "age": 20 } },
        { "id": 3, "name": "jean", "age": 25 },
    ]);

    let indexer_alloc = Bump::new();
    let mut operation = indexer::DocumentOperation::new();
    operation.replace_documents(&payload).unwrap();
    operation.delete_documents(&["2"]);

    let (document_changes, _operation_stats, primary_key) = operation
        .into_changes(
            &indexer_alloc,
            &index.inner,
            &rtxn,
            None,
            &mut new_fields_ids_map,
            &|| false,
            Progress::default(),
        )
        .unwrap();

    let pool = crate::ThreadPoolNoAbortBuilder::new().build().unwrap();
    indexer::index(
        &mut wtxn,
        &index.inner,
        &pool,
        index.indexer_config.grenad_parameters(),
        &db_fields_ids_map,
        new_fields_ids_map,
        primary_key,
        &document_changes,
        EmbeddingConfigs::default(),
        &|| false,
        &Progress::default(),
    )
    .unwrap();
    wtxn.commit().unwrap();

    // Only documents 1 and 3 are expected in the snapshot.
    db_snap!(index, documents, @r###"
    {"id":1,"doggo":"kevin"}
    {"id":3,"name":"jean","age":25}
    "###);
}
2327
/// Registers two update payloads and a deletion of documents `1` and `2` in a
/// single operation, then snapshots the stored documents.
#[test]
fn add_update_and_delete_documents_in_single_transform() {
    let mut index = TempIndex::new();
    index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments;

    let mut wtxn = index.write_txn().unwrap();
    let rtxn = index.inner.read_txn().unwrap();
    let db_fields_ids_map = index.inner.fields_ids_map(&rtxn).unwrap();
    let mut new_fields_ids_map = db_fields_ids_map.clone();

    let initial = documents!([
        { "id": 1, "doggo": "kevin" },
        { "id": 2, "doggo": { "name": "bob", "age": 20 } },
        { "id": 3, "name": "jean", "age": 25 },
    ]);
    let mut operation = indexer::DocumentOperation::new();
    operation.update_documents(&initial).unwrap();

    let additions = documents!([
        { "id": 2, "catto": "jorts" },
        { "id": 3, "legs": 4 },
    ]);
    operation.update_documents(&additions).unwrap();
    operation.delete_documents(&["1", "2"]);

    let indexer_alloc = Bump::new();
    let (document_changes, _operation_stats, primary_key) = operation
        .into_changes(
            &indexer_alloc,
            &index.inner,
            &rtxn,
            None,
            &mut new_fields_ids_map,
            &|| false,
            Progress::default(),
        )
        .unwrap();

    let pool = crate::ThreadPoolNoAbortBuilder::new().build().unwrap();
    indexer::index(
        &mut wtxn,
        &index.inner,
        &pool,
        index.indexer_config.grenad_parameters(),
        &db_fields_ids_map,
        new_fields_ids_map,
        primary_key,
        &document_changes,
        EmbeddingConfigs::default(),
        &|| false,
        &Progress::default(),
    )
    .unwrap();
    wtxn.commit().unwrap();

    // Only document 3 is expected, carrying the fields of both payloads.
    db_snap!(index, documents, @r###"
    {"id":3,"name":"jean","age":25,"legs":4}
    "###);
}
2388
#[test]
fn add_document_and_in_another_transform_update_and_delete_documents() {
    let index = TempIndex::new();

    // --- First indexing run: insert three documents.
    let mut wtxn = index.write_txn().unwrap();
    let config = &index.indexer_config;
    let rtxn = index.inner.read_txn().unwrap();
    let stored_fields_map = index.inner.fields_ids_map(&rtxn).unwrap();
    let mut updated_fields_map = stored_fields_map.clone();

    let base_docs = documents!([
        { "id": 1, "doggo": "kevin" },
        { "id": 2, "doggo": { "name": "bob", "age": 20 } },
        { "id": 3, "name": "jean", "age": 25 },
    ]);
    let bump = Bump::new();
    let embedder_configs = EmbeddingConfigs::default();
    let mut operation = indexer::DocumentOperation::new();
    operation.update_documents(&base_docs).unwrap();

    // The stop callback never requests an abort.
    let never_stop = || false;
    let (changes, _stats, pk) = operation
        .into_changes(
            &bump,
            &index.inner,
            &rtxn,
            None,
            &mut updated_fields_map,
            &never_stop,
            Progress::default(),
        )
        .unwrap();

    let pool = crate::ThreadPoolNoAbortBuilder::new().build().unwrap();
    indexer::index(
        &mut wtxn,
        &index.inner,
        &pool,
        config.grenad_parameters(),
        &stored_fields_map,
        updated_fields_map,
        pk,
        &changes,
        embedder_configs,
        &never_stop,
        &Progress::default(),
    )
    .unwrap();
    wtxn.commit().unwrap();

    db_snap!(index, documents, @r###"
    {"id":1,"doggo":"kevin"}
    {"id":2,"doggo":{"name":"bob","age":20}}
    {"id":3,"name":"jean","age":25}
    "###);

    // --- Second indexing run: update documents 2 and 3, then delete documents 1 and 2.
    let mut wtxn = index.write_txn().unwrap();
    let config = &index.indexer_config;
    let rtxn = index.inner.read_txn().unwrap();
    let stored_fields_map = index.inner.fields_ids_map(&rtxn).unwrap();
    let mut updated_fields_map = stored_fields_map.clone();

    let patch_docs = documents!([
        { "id": 2, "catto": "jorts" },
        { "id": 3, "legs": 4 },
    ]);
    let bump = Bump::new();
    let embedder_configs = EmbeddingConfigs::default();
    let mut operation = indexer::DocumentOperation::new();
    operation.update_documents(&patch_docs).unwrap();
    operation.delete_documents(&["1", "2"]);

    let never_stop = || false;
    let (changes, _stats, pk) = operation
        .into_changes(
            &bump,
            &index.inner,
            &rtxn,
            None,
            &mut updated_fields_map,
            &never_stop,
            Progress::default(),
        )
        .unwrap();

    let pool = crate::ThreadPoolNoAbortBuilder::new().build().unwrap();
    indexer::index(
        &mut wtxn,
        &index.inner,
        &pool,
        config.grenad_parameters(),
        &stored_fields_map,
        updated_fields_map,
        pk,
        &changes,
        embedder_configs,
        &never_stop,
        &Progress::default(),
    )
    .unwrap();
    wtxn.commit().unwrap();

    // Document 3 is expected to be the only one left, with the new `legs` field merged in.
    db_snap!(index, documents, @r###"
    {"id":3,"name":"jean","age":25,"legs":4}
    "###);
}
2493
#[test]
fn delete_document_and_then_add_documents_in_the_same_transform() {
    let index = TempIndex::new();

    let mut wtxn = index.write_txn().unwrap();
    let config = &index.indexer_config;
    let rtxn = index.inner.read_txn().unwrap();
    let stored_fields_map = index.inner.fields_ids_map(&rtxn).unwrap();
    let mut updated_fields_map = stored_fields_map.clone();

    let bump = Bump::new();
    let embedder_configs = EmbeddingConfigs::default();
    let mut operation = indexer::DocumentOperation::new();

    // The deletion is queued first, on an index that is still empty...
    operation.delete_documents(&["1", "2"]);

    // ...and the additions are queued afterwards, on the same operation.
    let added_docs = documents!([
        { "id": 2, "doggo": { "name": "jean", "age": 20 } },
        { "id": 3, "name": "bob", "age": 25 },
    ]);
    operation.update_documents(&added_docs).unwrap();

    // The stop callback never requests an abort.
    let never_stop = || false;
    let (changes, _stats, pk) = operation
        .into_changes(
            &bump,
            &index.inner,
            &rtxn,
            None,
            &mut updated_fields_map,
            &never_stop,
            Progress::default(),
        )
        .unwrap();

    let pool = crate::ThreadPoolNoAbortBuilder::new().build().unwrap();
    indexer::index(
        &mut wtxn,
        &index.inner,
        &pool,
        config.grenad_parameters(),
        &stored_fields_map,
        updated_fields_map,
        pk,
        &changes,
        embedder_configs,
        &never_stop,
        &Progress::default(),
    )
    .unwrap();
    wtxn.commit().unwrap();

    // Both documents added after the deletion are expected to be present.
    db_snap!(index, documents, @r###"
    {"id":2,"doggo":{"name":"jean","age":20}}
    {"id":3,"name":"bob","age":25}
    "###);
}
2548
#[test]
fn delete_the_same_document_multiple_time() {
    let index = TempIndex::new();

    let mut wtxn = index.write_txn().unwrap();
    let config = &index.indexer_config;
    let rtxn = index.inner.read_txn().unwrap();
    let stored_fields_map = index.inner.fields_ids_map(&rtxn).unwrap();
    let mut updated_fields_map = stored_fields_map.clone();

    let bump = Bump::new();
    let embedder_configs = EmbeddingConfigs::default();
    let mut operation = indexer::DocumentOperation::new();

    // Duplicated ids are deleted once before the documents are added...
    operation.delete_documents(&["1", "2", "1", "2"]);

    let added_docs = documents!([
        { "id": 1, "doggo": "kevin" },
        { "id": 2, "doggo": { "name": "jean", "age": 20 } },
        { "id": 3, "name": "bob", "age": 25 },
    ]);
    operation.update_documents(&added_docs).unwrap();

    // ...and once more after they were added.
    operation.delete_documents(&["1", "2", "1", "2"]);

    // The stop callback never requests an abort.
    let never_stop = || false;
    let (changes, _stats, pk) = operation
        .into_changes(
            &bump,
            &index.inner,
            &rtxn,
            None,
            &mut updated_fields_map,
            &never_stop,
            Progress::default(),
        )
        .unwrap();

    let pool = crate::ThreadPoolNoAbortBuilder::new().build().unwrap();
    indexer::index(
        &mut wtxn,
        &index.inner,
        &pool,
        config.grenad_parameters(),
        &stored_fields_map,
        updated_fields_map,
        pk,
        &changes,
        embedder_configs,
        &never_stop,
        &Progress::default(),
    )
    .unwrap();
    wtxn.commit().unwrap();

    // Only the document that was never targeted by a deletion is expected to remain.
    db_snap!(index, documents, @r###"
    {"id":3,"name":"bob","age":25}
    "###);
}
2606
#[test]
fn add_document_and_in_another_transform_delete_the_document_then_add_it_again() {
    let index = TempIndex::new();

    // --- First indexing run: insert document 1.
    let mut wtxn = index.write_txn().unwrap();
    let config = &index.indexer_config;
    let rtxn = index.inner.read_txn().unwrap();
    let stored_fields_map = index.inner.fields_ids_map(&rtxn).unwrap();
    let mut updated_fields_map = stored_fields_map.clone();

    let bump = Bump::new();
    let embedder_configs = EmbeddingConfigs::default();
    let mut operation = indexer::DocumentOperation::new();

    let first_version = documents!([
        { "id": 1, "doggo": "kevin" },
    ]);
    operation.update_documents(&first_version).unwrap();

    // The stop callback never requests an abort.
    let never_stop = || false;
    let (changes, _stats, pk) = operation
        .into_changes(
            &bump,
            &index.inner,
            &rtxn,
            None,
            &mut updated_fields_map,
            &never_stop,
            Progress::default(),
        )
        .unwrap();

    let pool = crate::ThreadPoolNoAbortBuilder::new().build().unwrap();
    indexer::index(
        &mut wtxn,
        &index.inner,
        &pool,
        config.grenad_parameters(),
        &stored_fields_map,
        updated_fields_map,
        pk,
        &changes,
        embedder_configs,
        &never_stop,
        &Progress::default(),
    )
    .unwrap();
    wtxn.commit().unwrap();

    db_snap!(index, documents, @r###"
    {"id":1,"doggo":"kevin"}
    "###);

    // --- Second indexing run: delete document 1, then replace it with a new version.
    let mut wtxn = index.write_txn().unwrap();
    let config = &index.indexer_config;
    let rtxn = index.inner.read_txn().unwrap();
    let stored_fields_map = index.inner.fields_ids_map(&rtxn).unwrap();
    let mut updated_fields_map = stored_fields_map.clone();

    let bump = Bump::new();
    let embedder_configs = EmbeddingConfigs::default();
    let mut operation = indexer::DocumentOperation::new();

    operation.delete_documents(&["1"]);

    let second_version = documents!([
        { "id": 1, "catto": "jorts" },
    ]);

    operation.replace_documents(&second_version).unwrap();

    let never_stop = || false;
    let (changes, _stats, pk) = operation
        .into_changes(
            &bump,
            &index.inner,
            &rtxn,
            None,
            &mut updated_fields_map,
            &never_stop,
            Progress::default(),
        )
        .unwrap();

    let pool = crate::ThreadPoolNoAbortBuilder::new().build().unwrap();
    indexer::index(
        &mut wtxn,
        &index.inner,
        &pool,
        config.grenad_parameters(),
        &stored_fields_map,
        updated_fields_map,
        pk,
        &changes,
        embedder_configs,
        &never_stop,
        &Progress::default(),
    )
    .unwrap();
    wtxn.commit().unwrap();

    // The stored document is expected to contain only the fields of the second version.
    db_snap!(index, documents, @r###"
    {"id":1,"catto":"jorts"}
    "###);
}
2710
#[test]
fn test_word_fid_position() {
    let index = TempIndex::new();

    // First batch: three short texts and one long text made of a single repeated word.
    let first_batch = documents!([
      {"id": 0, "text": "sun flowers are looking at the sun" },
      {"id": 1, "text": "sun flowers are looking at the sun" },
      {"id": 2, "text": "the sun is shining today" },
      {
        "id": 3,
        "text": "a a a a a a a a a a a a a a a a a
                a a a a a a a a a a a a a a a a a a a a a a a a a a
                a a a a a a a a a a a a a a a a a a a a a a a a a a
                a a a a a a a a a a a a a a a a a a a a a a a a a a
                a a a a a a a a a a a a a a a a a a a a a a a a a a
                a a a a a a a a a a a a a a a a a a a a a a a a a a
                a a a a a a a a a a a a a a a a a a a a a "
     }
    ]);
    index.add_documents(first_batch).unwrap();

    db_snap!(index, word_fid_docids, 1, @"bf3355e493330de036c8823ddd1dbbd9");
    db_snap!(index, word_position_docids, 1, @"896d54b29ed79c4c6f14084f326dcf6f");

    // Second batch: the same words, some of them stored under a second field (`text2`).
    let second_batch = documents!([
      {"id": 4, "text": "sun flowers are looking at the sun" },
      {"id": 5, "text2": "sun flowers are looking at the sun" },
      {"id": 6, "text": "b b b" },
      {
        "id": 7,
        "text2": "a a a a"
     }
    ]);
    index.add_documents(second_batch).unwrap();

    db_snap!(index, word_fid_docids, 2, @"a48d3f88db33f94bc23110a673ea49e4");
    db_snap!(index, word_position_docids, 2, @"3c9e66c6768ae2cf42b46b2c46e46a83");

    // Remove a subset of the documents, not all of them.
    let removed_ids = vec!["0".into(), "3".into()];
    index.delete_documents(removed_ids);

    db_snap!(index, word_fid_docids, 3, @"4c2e2a1832e5802796edc1638136d933");
    db_snap!(index, word_position_docids, 3, @"74f556b91d161d997a89468b4da1cb8f");
}
2757
/// Indexes documents that each carry a different number of user-provided vectors
/// (two, one, then three) for the same embedder.
/// All vectors have the same length.
#[test]
fn test_multiple_vectors() {
    use crate::vector::settings::EmbeddingSettings;
    let index = TempIndex::new();

    index
        .update_settings(|settings| {
            // One embedder called "manual": user-provided vectors of dimension 3,
            // every other setting left unset.
            let manual = EmbeddingSettings {
                source: Setting::Set(crate::vector::settings::EmbedderSource::UserProvided),
                model: Setting::NotSet,
                revision: Setting::NotSet,
                pooling: Setting::NotSet,
                api_key: Setting::NotSet,
                dimensions: Setting::Set(3),
                document_template: Setting::NotSet,
                document_template_max_bytes: Setting::NotSet,
                url: Setting::NotSet,
                request: Setting::NotSet,
                response: Setting::NotSet,
                distribution: Setting::NotSet,
                headers: Setting::NotSet,
                search_embedder: Setting::NotSet,
                indexing_embedder: Setting::NotSet,
                binary_quantized: Setting::NotSet,
            };
            let mut embedders = BTreeMap::default();
            embedders.insert("manual".to_string(), Setting::Set(manual));
            settings.set_embedder_settings(embedders);
        })
        .unwrap();

    let two_vectors =
        documents!([{"id": 0, "_vectors": { "manual": [[0, 1, 2], [3, 4, 5]] } }]);
    index.add_documents(two_vectors).unwrap();

    let one_vector = documents!([{"id": 1, "_vectors": { "manual": [6, 7, 8] }}]);
    index.add_documents(one_vector).unwrap();

    let three_vectors = documents!([
        {"id": 2, "_vectors": { "manual": [[9, 10, 11], [12, 13, 14], [15, 16, 17]] }}
    ]);
    index.add_documents(three_vectors).unwrap();

    let rtxn = index.read_txn().unwrap();
    let mut embedding_configs = index.embedding_configs(&rtxn).unwrap();
    let IndexEmbeddingConfig { name: embedder_name, config: embedder_config, user_provided } =
        embedding_configs.pop().unwrap();
    insta::assert_snapshot!(embedder_name, @"manual");
    // The three documents are all recorded as carrying user-provided vectors.
    insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[0, 1, 2]>");

    let embedder = std::sync::Arc::new(
        crate::vector::Embedder::new(embedder_config.embedder_options, 0).unwrap(),
    );
    let query_vector = vec![0.0, 1.0, 2.0];
    let res = index
        .search(&rtxn)
        .semantic(embedder_name, embedder, false, Some(query_vector))
        .execute()
        .unwrap();
    assert_eq!(res.documents_ids.len(), 3);
}
2821
#[test]
fn reproduce_the_bug() {
    /*
        [milli/examples/fuzz.rs:69] &batches = [
        Batch(
            [
                AddDoc(
                    { "id": 1, "doggo": "bernese" }, => internal 0
                ),
            ],
        ),
        Batch(
            [
                DeleteDoc(
                    1, => delete internal 0
                ),
                AddDoc(
                    { "id": 0, "catto": "jorts" }, => internal 1
                ),
            ],
        ),
        Batch(
            [
                AddDoc(
                    { "id": 1, "catto": "jorts" }, => internal 2
                ),
            ],
        ),
    ]
    */
    let index = TempIndex::new();

    // ---------------- BATCH 1: add document 1 ----------------

    println!("--- ENTERING BATCH 1");

    let mut wtxn = index.write_txn().unwrap();
    let config = &index.indexer_config;
    let rtxn = index.inner.read_txn().unwrap();
    let stored_fields_map = index.inner.fields_ids_map(&rtxn).unwrap();
    let mut updated_fields_map = stored_fields_map.clone();

    let bump = Bump::new();
    let embedder_configs = EmbeddingConfigs::default();
    let mut operation = indexer::DocumentOperation::new();

    // Queue the operation of this batch.
    let batch_docs = documents!([
        { "id": 1, "doggo": "bernese" },
    ]);
    operation.replace_documents(&batch_docs).unwrap();

    // Turn the queued operation into changes and index them.
    // The stop callback never requests an abort.
    let never_stop = || false;
    let (changes, _stats, pk) = operation
        .into_changes(
            &bump,
            &index.inner,
            &rtxn,
            None,
            &mut updated_fields_map,
            &never_stop,
            Progress::default(),
        )
        .unwrap();

    let pool = crate::ThreadPoolNoAbortBuilder::new().build().unwrap();
    indexer::index(
        &mut wtxn,
        &index.inner,
        &pool,
        config.grenad_parameters(),
        &stored_fields_map,
        updated_fields_map,
        pk,
        &changes,
        embedder_configs,
        &never_stop,
        &Progress::default(),
    )
    .unwrap();
    wtxn.commit().unwrap();

    db_snap!(index, documents, @r###"
    {"id":1,"doggo":"bernese"}
    "###);
    db_snap!(index, external_documents_ids, @r###"
    docids:
    1                        0
    "###);

    // ---------------- BATCH 2: delete document 1, add document 0 ----------------

    println!("--- ENTERING BATCH 2");

    let mut wtxn = index.write_txn().unwrap();
    let config = &index.indexer_config;
    let rtxn = index.inner.read_txn().unwrap();
    let stored_fields_map = index.inner.fields_ids_map(&rtxn).unwrap();
    let mut updated_fields_map = stored_fields_map.clone();

    let bump = Bump::new();
    let embedder_configs = EmbeddingConfigs::default();
    let mut operation = indexer::DocumentOperation::new();

    operation.delete_documents(&["1"]);

    let batch_docs = documents!([
        { "id": 0, "catto": "jorts" },
    ]);
    operation.replace_documents(&batch_docs).unwrap();

    let never_stop = || false;
    let (changes, _stats, pk) = operation
        .into_changes(
            &bump,
            &index.inner,
            &rtxn,
            None,
            &mut updated_fields_map,
            &never_stop,
            Progress::default(),
        )
        .unwrap();

    let pool = crate::ThreadPoolNoAbortBuilder::new().build().unwrap();
    indexer::index(
        &mut wtxn,
        &index.inner,
        &pool,
        config.grenad_parameters(),
        &stored_fields_map,
        updated_fields_map,
        pk,
        &changes,
        embedder_configs,
        &never_stop,
        &Progress::default(),
    )
    .unwrap();
    wtxn.commit().unwrap();

    db_snap!(index, documents, @r###"
    {"id":0,"catto":"jorts"}
    "###);

    db_snap!(index, external_documents_ids, @r###"
    docids:
    0                        1
    "###);

    // ---------------- BATCH 3: add document 1 again ----------------

    println!("--- ENTERING BATCH 3");

    let mut wtxn = index.write_txn().unwrap();
    let config = &index.indexer_config;
    let rtxn = index.inner.read_txn().unwrap();
    let stored_fields_map = index.inner.fields_ids_map(&rtxn).unwrap();
    let mut updated_fields_map = stored_fields_map.clone();

    let bump = Bump::new();
    let embedder_configs = EmbeddingConfigs::default();
    let mut operation = indexer::DocumentOperation::new();

    let batch_docs = documents!([
        { "id": 1, "catto": "jorts" },
    ]);
    operation.replace_documents(&batch_docs).unwrap();

    let never_stop = || false;
    let (changes, _stats, pk) = operation
        .into_changes(
            &bump,
            &index.inner,
            &rtxn,
            None,
            &mut updated_fields_map,
            &never_stop,
            Progress::default(),
        )
        .unwrap();

    let pool = crate::ThreadPoolNoAbortBuilder::new().build().unwrap();
    indexer::index(
        &mut wtxn,
        &index.inner,
        &pool,
        config.grenad_parameters(),
        &stored_fields_map,
        updated_fields_map,
        pk,
        &changes,
        embedder_configs,
        &never_stop,
        &Progress::default(),
    )
    .unwrap();
    wtxn.commit().unwrap();

    db_snap!(index, documents, @r###"
    {"id":1,"catto":"jorts"}
    {"id":0,"catto":"jorts"}
    "###);

    // Every id returned by a placeholder search must resolve to a stored document.
    let rtxn = index.read_txn().unwrap();
    let res = index.search(&rtxn).execute().unwrap();
    index.documents(&rtxn, res.documents_ids).unwrap();
}
3029
3030    fn delete_documents<'t>(
3031        wtxn: &mut RwTxn<'t>,
3032        index: &'t TempIndex,
3033        external_ids: &[&str],
3034    ) -> Vec<u32> {
3035        let external_document_ids = index.external_documents_ids();
3036        let ids_to_delete: Vec<u32> = external_ids
3037            .iter()
3038            .map(|id| external_document_ids.get(wtxn, id).unwrap().unwrap())
3039            .collect();
3040
3041        // Delete some documents.
3042        index
3043            .delete_documents_using_wtxn(
3044                wtxn,
3045                external_ids.iter().map(ToString::to_string).collect(),
3046            )
3047            .unwrap();
3048
3049        ids_to_delete
3050    }
3051
#[test]
fn delete_documents_with_numbers_as_primary_key() {
    let index = TempIndex::new();

    // Insert three documents whose primary key values are numbers.
    let mut wtxn = index.write_txn().unwrap();
    let inserted = documents!([
        { "id": 0, "name": "kevin", "object": { "key1": "value1", "key2": "value2" } },
        { "id": 1, "name": "kevina", "array": ["I", "am", "fine"] },
        { "id": 2, "name": "benoit", "array_of_object": [{ "wow": "amazing" }] }
    ]);
    index.add_documents_using_wtxn(&mut wtxn, inserted).unwrap();
    wtxn.commit().unwrap();

    // Delete those documents, ids are synchronous therefore 0, 1, and 2.
    let mut wtxn = index.write_txn().unwrap();
    let every_id = vec![S("0"), S("1"), S("2")];
    index.delete_documents_using_wtxn(&mut wtxn, every_id).unwrap();
    wtxn.commit().unwrap();

    // All these snapshots should be empty since the database was cleared
    db_snap!(index, documents_ids);
    db_snap!(index, word_docids);
    db_snap!(index, word_pair_proximity_docids);
    db_snap!(index, facet_id_exists_docids);

    let rtxn = index.read_txn().unwrap();
    let distribution = index.field_distribution(&rtxn).unwrap();

    assert!(distribution.is_empty());
}
3083
#[test]
fn delete_documents_with_strange_primary_key() {
    let index = TempIndex::new();

    // Only the `name` field is searchable.
    index
        .update_settings(|settings| settings.set_searchable_fields(vec!["name".to_string()]))
        .unwrap();

    // Insert three documents identified by the `mysuperid` field.
    let mut wtxn = index.write_txn().unwrap();
    let inserted = documents!([
        { "mysuperid": 0, "name": "kevin" },
        { "mysuperid": 1, "name": "kevina" },
        { "mysuperid": 2, "name": "benoit" }
    ]);
    index.add_documents_using_wtxn(&mut wtxn, inserted).unwrap();
    wtxn.commit().unwrap();

    // Remove two of the three documents, so the index is not emptied.
    let mut wtxn = index.write_txn().unwrap();
    let removed_ids = vec![S("0"), S("1")];
    index.delete_documents_using_wtxn(&mut wtxn, removed_ids).unwrap();

    wtxn.commit().unwrap();

    db_snap!(index, documents_ids);
    db_snap!(index, word_docids);
    db_snap!(index, word_pair_proximity_docids);
}
3115
#[test]
fn filtered_placeholder_search_should_not_return_deleted_documents() {
    let index = TempIndex::new();

    // `docid` is the primary key; `label` and `label2` are filterable.
    let mut wtxn = index.write_txn().unwrap();
    index
        .update_settings_using_wtxn(&mut wtxn, |settings| {
            settings.set_primary_key(S("docid"));
            settings.set_filterable_fields(vec![
                FilterableAttributesRule::Field("label".to_string()),
                FilterableAttributesRule::Field("label2".to_string()),
            ]);
        })
        .unwrap();
    wtxn.commit().unwrap();

    let mut wtxn = index.write_txn().unwrap();
    let labelled_docs = documents!([
        { "docid": "1_4",  "label": ["sign"] },
        { "docid": "1_5",  "label": ["letter"] },
        { "docid": "1_7",  "label": ["abstract","cartoon","design","pattern"] },
        { "docid": "1_36", "label": ["drawing","painting","pattern"] },
        { "docid": "1_37", "label": ["art","drawing","outdoor"] },
        { "docid": "1_38", "label": ["aquarium","art","drawing"] },
        { "docid": "1_39", "label": ["abstract"] },
        { "docid": "1_40", "label": ["cartoon"] },
        { "docid": "1_41", "label": ["art","drawing"] },
        { "docid": "1_42", "label": ["art","pattern"] },
        { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] },
        { "docid": "1_44", "label": ["drawing"] },
        { "docid": "1_45", "label": ["art"] },
        { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] },
        { "docid": "1_47", "label": ["abstract","pattern"] },
        { "docid": "1_52", "label": ["abstract","cartoon"] },
        { "docid": "1_57", "label": ["abstract","drawing","pattern"] },
        { "docid": "1_58", "label": ["abstract","art","cartoon"] },
        { "docid": "1_68", "label": ["design"] },
        { "docid": "1_69", "label": ["geometry"] },
        { "docid": "1_70", "label2": ["geometry", 1.2] },
        { "docid": "1_71", "label2": ["design", 2.2] },
        { "docid": "1_72", "label2": ["geometry", 1.2] }
    ]);
    index.add_documents_using_wtxn(&mut wtxn, labelled_docs).unwrap();

    wtxn.commit().unwrap();

    // "1_4" is the only document labelled `sign`; it is deleted together with two others.
    let mut wtxn = index.write_txn().unwrap();
    delete_documents(&mut wtxn, &index, &["1_4", "1_70", "1_72"]);
    wtxn.commit().unwrap();

    let rtxn = index.read_txn().unwrap();
    // Placeholder search with filter
    let parsed_filter = Filter::from_str("label = sign").unwrap();
    let filter = parsed_filter.unwrap();
    let results = index.search(&rtxn).filter(filter).execute().unwrap();
    assert!(results.documents_ids.is_empty());

    db_snap!(index, word_docids);
    db_snap!(index, facet_id_f64_docids);
    db_snap!(index, word_pair_proximity_docids);
    db_snap!(index, facet_id_exists_docids);
    db_snap!(index, facet_id_string_docids);
}
3182
3183    #[test]
3184    fn placeholder_search_should_not_return_deleted_documents() {
3185        let index = TempIndex::new();
3186
3187        index
3188            .update_settings(|settings| {
3189                settings.set_primary_key(S("docid"));
3190            })
3191            .unwrap();
3192
3193        index
3194            .add_documents(documents!([
3195                { "docid": "1_4",  "label": ["sign"] },
3196                { "docid": "1_5",  "label": ["letter"] },
3197                { "docid": "1_7",  "label": ["abstract","cartoon","design","pattern"] },
3198                { "docid": "1_36", "label": ["drawing","painting","pattern"] },
3199                { "docid": "1_37", "label": ["art","drawing","outdoor"] },
3200                { "docid": "1_38", "label": ["aquarium","art","drawing"] },
3201                { "docid": "1_39", "label": ["abstract"] },
3202                { "docid": "1_40", "label": ["cartoon"] },
3203                { "docid": "1_41", "label": ["art","drawing"] },
3204                { "docid": "1_42", "label": ["art","pattern"] },
3205                { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] },
3206                { "docid": "1_44", "label": ["drawing"] },
3207                { "docid": "1_45", "label": ["art"] },
3208                { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] },
3209                { "docid": "1_47", "label": ["abstract","pattern"] },
3210                { "docid": "1_52", "label": ["abstract","cartoon"] },
3211                { "docid": "1_57", "label": ["abstract","drawing","pattern"] },
3212                { "docid": "1_58", "label": ["abstract","art","cartoon"] },
3213                { "docid": "1_68", "label": ["design"] },
3214                { "docid": "1_69", "label": ["geometry"] },
3215                { "docid": "1_70", "label2": ["geometry", 1.2] },
3216                { "docid": "1_71", "label2": ["design", 2.2] },
3217                { "docid": "1_72", "label2": ["geometry", 1.2] }
3218            ]))
3219            .unwrap();
3220
3221        let mut wtxn = index.write_txn().unwrap();
3222
3223        let deleted_internal_ids = delete_documents(&mut wtxn, &index, &["1_4"]);
3224
3225        wtxn.commit().unwrap();
3226
3227        // Placeholder search
3228        let rtxn = index.static_read_txn().unwrap();
3229
3230        let results = index.search(&rtxn).execute().unwrap();
3231        assert!(!results.documents_ids.is_empty());
3232        for id in results.documents_ids.iter() {
3233            assert!(
3234                !deleted_internal_ids.contains(id),
3235                "The document {} was supposed to be deleted",
3236                id
3237            );
3238        }
3239
3240        drop(rtxn);
3241    }
3242
3243    #[test]
3244    fn search_should_not_return_deleted_documents() {
3245        let index = TempIndex::new();
3246
3247        index
3248            .update_settings(|settings| {
3249                settings.set_primary_key(S("docid"));
3250            })
3251            .unwrap();
3252
3253        index
3254            .add_documents(documents!([
3255                { "docid": "1_4",  "label": ["sign"] },
3256                { "docid": "1_5",  "label": ["letter"] },
3257                { "docid": "1_7",  "label": ["abstract","cartoon","design","pattern"] },
3258                { "docid": "1_36", "label": ["drawing","painting","pattern"] },
3259                { "docid": "1_37", "label": ["art","drawing","outdoor"] },
3260                { "docid": "1_38", "label": ["aquarium","art","drawing"] },
3261                { "docid": "1_39", "label": ["abstract"] },
3262                { "docid": "1_40", "label": ["cartoon"] },
3263                { "docid": "1_41", "label": ["art","drawing"] },
3264                { "docid": "1_42", "label": ["art","pattern"] },
3265                { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] },
3266                { "docid": "1_44", "label": ["drawing"] },
3267                { "docid": "1_45", "label": ["art"] },
3268                { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] },
3269                { "docid": "1_47", "label": ["abstract","pattern"] },
3270                { "docid": "1_52", "label": ["abstract","cartoon"] },
3271                { "docid": "1_57", "label": ["abstract","drawing","pattern"] },
3272                { "docid": "1_58", "label": ["abstract","art","cartoon"] },
3273                { "docid": "1_68", "label": ["design"] },
3274                { "docid": "1_69", "label": ["geometry"] },
3275                { "docid": "1_70", "label2": ["geometry", 1.2] },
3276                { "docid": "1_71", "label2": ["design", 2.2] },
3277                { "docid": "1_72", "label2": ["geometry", 1.2] }
3278            ]))
3279            .unwrap();
3280
3281        let mut wtxn = index.write_txn().unwrap();
3282        let deleted_internal_ids = delete_documents(&mut wtxn, &index, &["1_7", "1_52"]);
3283        wtxn.commit().unwrap();
3284
3285        // search for abstract
3286        let rtxn = index.read_txn().unwrap();
3287        let results = index.search(&rtxn).query("abstract").execute().unwrap();
3288        assert!(!results.documents_ids.is_empty());
3289        for id in results.documents_ids.iter() {
3290            assert!(
3291                !deleted_internal_ids.contains(id),
3292                "The document {} was supposed to be deleted",
3293                id
3294            );
3295        }
3296    }
3297
3298    #[test]
3299    fn geo_filtered_placeholder_search_should_not_return_deleted_documents() {
3300        let index = TempIndex::new();
3301
3302        let mut wtxn = index.write_txn().unwrap();
3303        index
3304            .update_settings_using_wtxn(&mut wtxn, |settings| {
3305                settings.set_primary_key(S("id"));
3306                settings.set_filterable_fields(vec![FilterableAttributesRule::Field(
3307                    RESERVED_GEO_FIELD_NAME.to_string(),
3308                )]);
3309                settings.set_sortable_fields(hashset!(S(RESERVED_GEO_FIELD_NAME)));
3310            })
3311            .unwrap();
3312        wtxn.commit().unwrap();
3313
3314        let mut wtxn = index.write_txn().unwrap();
3315        index.add_documents_using_wtxn(&mut wtxn, documents!([
3316            { "id": "1",  "city": "Lille",             RESERVED_GEO_FIELD_NAME: { "lat": 50.6299, "lng": 3.0569 } },
3317            { "id": "2",  "city": "Mons-en-Barœul",    RESERVED_GEO_FIELD_NAME: { "lat": 50.6415, "lng": 3.1106 } },
3318            { "id": "3",  "city": "Hellemmes",         RESERVED_GEO_FIELD_NAME: { "lat": 50.6312, "lng": 3.1106 } },
3319            { "id": "4",  "city": "Villeneuve-d'Ascq", RESERVED_GEO_FIELD_NAME: { "lat": 50.6224, "lng": 3.1476 } },
3320            { "id": "5",  "city": "Hem",               RESERVED_GEO_FIELD_NAME: { "lat": 50.6552, "lng": 3.1897 } },
3321            { "id": "6",  "city": "Roubaix",           RESERVED_GEO_FIELD_NAME: { "lat": 50.6924, "lng": 3.1763 } },
3322            { "id": "7",  "city": "Tourcoing",         RESERVED_GEO_FIELD_NAME: { "lat": 50.7263, "lng": 3.1541 } },
3323            { "id": "8",  "city": "Mouscron",          RESERVED_GEO_FIELD_NAME: { "lat": 50.7453, "lng": 3.2206 } },
3324            { "id": "9",  "city": "Tournai",           RESERVED_GEO_FIELD_NAME: { "lat": 50.6053, "lng": 3.3758 } },
3325            { "id": "10", "city": "Ghent",             RESERVED_GEO_FIELD_NAME: { "lat": 51.0537, "lng": 3.6957 } },
3326            { "id": "11", "city": "Brussels",          RESERVED_GEO_FIELD_NAME: { "lat": 50.8466, "lng": 4.3370 } },
3327            { "id": "12", "city": "Charleroi",         RESERVED_GEO_FIELD_NAME: { "lat": 50.4095, "lng": 4.4347 } },
3328            { "id": "13", "city": "Mons",              RESERVED_GEO_FIELD_NAME: { "lat": 50.4502, "lng": 3.9623 } },
3329            { "id": "14", "city": "Valenciennes",      RESERVED_GEO_FIELD_NAME: { "lat": 50.3518, "lng": 3.5326 } },
3330            { "id": "15", "city": "Arras",             RESERVED_GEO_FIELD_NAME: { "lat": 50.2844, "lng": 2.7637 } },
3331            { "id": "16", "city": "Cambrai",           RESERVED_GEO_FIELD_NAME: { "lat": 50.1793, "lng": 3.2189 } },
3332            { "id": "17", "city": "Bapaume",           RESERVED_GEO_FIELD_NAME: { "lat": 50.1112, "lng": 2.8547 } },
3333            { "id": "18", "city": "Amiens",            RESERVED_GEO_FIELD_NAME: { "lat": 49.9314, "lng": 2.2710 } },
3334            { "id": "19", "city": "Compiègne",         RESERVED_GEO_FIELD_NAME: { "lat": 49.4449, "lng": 2.7913 } },
3335            { "id": "20", "city": "Paris",             RESERVED_GEO_FIELD_NAME: { "lat": 48.9021, "lng": 2.3708 } }
3336        ])).unwrap();
3337        wtxn.commit().unwrap();
3338
3339        let mut wtxn = index.write_txn().unwrap();
3340        let external_ids_to_delete = ["5", "6", "7", "12", "17", "19"];
3341        let deleted_internal_ids = delete_documents(&mut wtxn, &index, &external_ids_to_delete);
3342
3343        // Placeholder search with geo filter
3344        let filter = Filter::from_str("_geoRadius(50.6924, 3.1763, 20000)").unwrap().unwrap();
3345        let results = index.search(&wtxn).filter(filter).execute().unwrap();
3346        assert!(!results.documents_ids.is_empty());
3347        for id in results.documents_ids.iter() {
3348            assert!(
3349                !deleted_internal_ids.contains(id),
3350                "The document {} was supposed to be deleted",
3351                id
3352            );
3353        }
3354
3355        wtxn.commit().unwrap();
3356
3357        db_snap!(index, facet_id_f64_docids);
3358        db_snap!(index, facet_id_string_docids);
3359    }
3360
3361    #[test]
3362    fn get_documents_should_not_return_deleted_documents() {
3363        let index = TempIndex::new();
3364
3365        let mut wtxn = index.write_txn().unwrap();
3366        index
3367            .update_settings_using_wtxn(&mut wtxn, |settings| {
3368                settings.set_primary_key(S("docid"));
3369            })
3370            .unwrap();
3371
3372        index
3373            .add_documents_using_wtxn(
3374                &mut wtxn,
3375                documents!([
3376                    { "docid": "1_4",  "label": ["sign"] },
3377                    { "docid": "1_5",  "label": ["letter"] },
3378                    { "docid": "1_7",  "label": ["abstract","cartoon","design","pattern"] },
3379                    { "docid": "1_36", "label": ["drawing","painting","pattern"] },
3380                    { "docid": "1_37", "label": ["art","drawing","outdoor"] },
3381                    { "docid": "1_38", "label": ["aquarium","art","drawing"] },
3382                    { "docid": "1_39", "label": ["abstract"] },
3383                    { "docid": "1_40", "label": ["cartoon"] },
3384                    { "docid": "1_41", "label": ["art","drawing"] },
3385                    { "docid": "1_42", "label": ["art","pattern"] },
3386                    { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] },
3387                    { "docid": "1_44", "label": ["drawing"] },
3388                    { "docid": "1_45", "label": ["art"] },
3389                    { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] },
3390                    { "docid": "1_47", "label": ["abstract","pattern"] },
3391                    { "docid": "1_52", "label": ["abstract","cartoon"] },
3392                    { "docid": "1_57", "label": ["abstract","drawing","pattern"] },
3393                    { "docid": "1_58", "label": ["abstract","art","cartoon"] },
3394                    { "docid": "1_68", "label": ["design"] },
3395                    { "docid": "1_69", "label": ["geometry"] },
3396                    { "docid": "1_70", "label2": ["geometry", 1.2] },
3397                    { "docid": "1_71", "label2": ["design", 2.2] },
3398                    { "docid": "1_72", "label2": ["geometry", 1.2] }
3399                ]),
3400            )
3401            .unwrap();
3402        wtxn.commit().unwrap();
3403
3404        let mut wtxn = index.write_txn().unwrap();
3405        let deleted_external_ids = ["1_7", "1_52"];
3406        let deleted_internal_ids = delete_documents(&mut wtxn, &index, &deleted_external_ids);
3407        wtxn.commit().unwrap();
3408
3409        let rtxn = index.read_txn().unwrap();
3410        // list all documents
3411        let results = index.all_documents(&rtxn).unwrap();
3412        for result in results {
3413            let (id, _) = result.unwrap();
3414            assert!(
3415                !deleted_internal_ids.contains(&id),
3416                "The document {} was supposed to be deleted",
3417                id
3418            );
3419        }
3420
3421        // list internal document ids
3422        let results = index.documents_ids(&rtxn).unwrap();
3423        for id in results {
3424            assert!(
3425                !deleted_internal_ids.contains(&id),
3426                "The document {} was supposed to be deleted",
3427                id
3428            );
3429        }
3430
3431        // get internal docids from deleted external document ids
3432        let results = index.external_documents_ids();
3433        for id in deleted_external_ids {
3434            assert!(
3435                results.get(&rtxn, id).unwrap().is_none(),
3436                "The document {} was supposed to be deleted",
3437                id
3438            );
3439        }
3440        drop(rtxn);
3441    }
3442
3443    #[test]
3444    fn stats_should_not_return_deleted_documents() {
3445        let index = TempIndex::new();
3446
3447        index
3448            .update_settings(|settings| {
3449                settings.set_primary_key(S("docid"));
3450            })
3451            .unwrap();
3452
3453        index.add_documents(documents!([
3454            { "docid": "1_4",  "label": ["sign"]},
3455            { "docid": "1_5",  "label": ["letter"]},
3456            { "docid": "1_7",  "label": ["abstract","cartoon","design","pattern"], "title": "Mickey Mouse"},
3457            { "docid": "1_36", "label": ["drawing","painting","pattern"]},
3458            { "docid": "1_37", "label": ["art","drawing","outdoor"]},
3459            { "docid": "1_38", "label": ["aquarium","art","drawing"], "title": "Nemo"},
3460            { "docid": "1_39", "label": ["abstract"]},
3461            { "docid": "1_40", "label": ["cartoon"]},
3462            { "docid": "1_41", "label": ["art","drawing"]},
3463            { "docid": "1_42", "label": ["art","pattern"]},
3464            { "docid": "1_43", "label": ["abstract","art","drawing","pattern"], "number": 32i32},
3465            { "docid": "1_44", "label": ["drawing"], "number": 44i32},
3466            { "docid": "1_45", "label": ["art"]},
3467            { "docid": "1_46", "label": ["abstract","colorfulness","pattern"]},
3468            { "docid": "1_47", "label": ["abstract","pattern"]},
3469            { "docid": "1_52", "label": ["abstract","cartoon"]},
3470            { "docid": "1_57", "label": ["abstract","drawing","pattern"]},
3471            { "docid": "1_58", "label": ["abstract","art","cartoon"]},
3472            { "docid": "1_68", "label": ["design"]},
3473            { "docid": "1_69", "label": ["geometry"]}
3474        ])).unwrap();
3475
3476        let mut wtxn = index.write_txn().unwrap();
3477
3478        delete_documents(&mut wtxn, &index, &["1_7", "1_52"]);
3479        wtxn.commit().unwrap();
3480
3481        let rtxn = index.read_txn().unwrap();
3482
3483        // count internal documents
3484        let results = index.number_of_documents(&rtxn).unwrap();
3485        assert_eq!(18, results);
3486
3487        // count field distribution
3488        let results = index.field_distribution(&rtxn).unwrap();
3489        assert_eq!(Some(&18), results.get("label"));
3490        assert_eq!(Some(&1), results.get("title"));
3491        assert_eq!(Some(&2), results.get("number"));
3492
3493        rtxn.commit().unwrap();
3494    }
3495
3496    #[test]
3497    fn incremental_update_without_changing_facet_distribution() {
3498        let index = TempIndex::new();
3499        index
3500            .add_documents(documents!([
3501                {"id": 0, "some_field": "aaa", "other_field": "aaa" },
3502                {"id": 1, "some_field": "bbb", "other_field": "bbb" },
3503            ]))
3504            .unwrap();
3505        {
3506            let rtxn = index.read_txn().unwrap();
3507            // count field distribution
3508            let results = index.field_distribution(&rtxn).unwrap();
3509            assert_eq!(Some(&2), results.get("id"));
3510            assert_eq!(Some(&2), results.get("some_field"));
3511            assert_eq!(Some(&2), results.get("other_field"));
3512        }
3513
3514        let mut index = index;
3515        index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments;
3516
3517        index
3518            .add_documents(documents!([
3519                {"id": 0, "other_field": "bbb" },
3520                {"id": 1, "some_field": "ccc" },
3521            ]))
3522            .unwrap();
3523
3524        {
3525            let rtxn = index.read_txn().unwrap();
3526            // count field distribution
3527            let results = index.field_distribution(&rtxn).unwrap();
3528            assert_eq!(Some(&2), results.get("id"));
3529            assert_eq!(Some(&2), results.get("some_field"));
3530            assert_eq!(Some(&2), results.get("other_field"));
3531        }
3532    }
3533
3534    #[test]
3535    fn delete_words_exact_attributes() {
3536        let index = TempIndex::new();
3537
3538        index
3539            .update_settings(|settings| {
3540                settings.set_primary_key(S("id"));
3541                settings.set_searchable_fields(vec![S("text"), S("exact")]);
3542                settings.set_exact_attributes(vec![S("exact")].into_iter().collect());
3543            })
3544            .unwrap();
3545
3546        index
3547            .add_documents(documents!([
3548                { "id": 0, "text": "hello" },
3549                { "id": 1, "exact": "hello"}
3550            ]))
3551            .unwrap();
3552        db_snap!(index, word_docids, 1, @r###"
3553        hello            [0, ]
3554        "###);
3555        db_snap!(index, exact_word_docids, 1, @r###"
3556        hello            [1, ]
3557        "###);
3558        db_snap!(index, words_fst, 1, @"300000000000000001084cfcfc2ce1000000016000000090ea47f");
3559
3560        let mut wtxn = index.write_txn().unwrap();
3561        let deleted_internal_ids = delete_documents(&mut wtxn, &index, &["1"]);
3562        wtxn.commit().unwrap();
3563
3564        db_snap!(index, word_docids, 2, @r###"
3565        hello            [0, ]
3566        "###);
3567        db_snap!(index, exact_word_docids, 2, @"");
3568        db_snap!(index, words_fst, 2, @"300000000000000001084cfcfc2ce1000000016000000090ea47f");
3569
3570        insta::assert_snapshot!(format!("{deleted_internal_ids:?}"), @"[1]");
3571        let txn = index.read_txn().unwrap();
3572        let words = index.words_fst(&txn).unwrap().into_stream().into_strs().unwrap();
3573        insta::assert_snapshot!(format!("{words:?}"), @r###"["hello"]"###);
3574
3575        let mut s = Search::new(&txn, &index);
3576        s.query("hello");
3577        let crate::SearchResult { documents_ids, .. } = s.execute().unwrap();
3578        insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]");
3579    }
3580}
milli_core/update/index_documents/mod.rs

milli_core/update/index_documents/
mod.rs