1mod enrich;
2mod extract;
3mod helpers;
4mod transform;
5mod typed_chunk;
6
7use std::collections::HashSet;
8use std::io::{Read, Seek};
9use std::iter;
10use std::num::NonZeroU32;
11use std::sync::Arc;
12
13use crossbeam_channel::{Receiver, Sender};
14use enrich::enrich_documents_batch;
15use grenad::{Merger, MergerBuilder};
16use hashbrown::HashMap;
17use heed::types::Str;
18use heed::Database;
19use rand::SeedableRng as _;
20use roaring::RoaringBitmap;
21use serde::{Deserialize, Serialize};
22use slice_group_by::GroupBy;
23use tracing::debug;
24use typed_chunk::{write_typed_chunk_into_index, ChunkAccumulator, TypedChunk};
25
26pub use self::enrich::{extract_finite_float_from_value, DocumentId};
27pub use self::helpers::*;
28pub use self::transform::{Transform, TransformOutput};
29use super::facet::clear_facet_levels_based_on_settings_diff;
30use super::new::StdResult;
31use crate::database_stats::DatabaseStats;
32use crate::documents::{obkv_to_object, DocumentsBatchReader};
33use crate::error::{Error, InternalError};
34use crate::index::{PrefixSearch, PrefixSettings};
35use crate::progress::Progress;
36use crate::thread_pool_no_abort::ThreadPoolNoAbortBuilder;
37pub use crate::update::index_documents::helpers::CursorClonableMmap;
38use crate::update::{
39 IndexerConfig, UpdateIndexingStep, WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst,
40};
41use crate::vector::{ArroyWrapper, EmbeddingConfigs};
42use crate::{CboRoaringBitmapCodec, Index, Result, UserError};
43
/// Progress baseline for the prefix stage: `execute_prefix_databases` starts
/// its `databases_seen` counter from this value.
const MERGED_DATABASE_COUNT: usize = 7;
/// Number of progress steps reported by `execute_prefix_databases` on top of
/// [`MERGED_DATABASE_COUNT`].
const PREFIX_DATABASE_COUNT: usize = 4;
/// Value reported as `total_databases` in every
/// `UpdateIndexingStep::MergeDataIntoFinalDatabase` progress event.
const TOTAL_POSTING_DATABASE_COUNT: usize = MERGED_DATABASE_COUNT + PREFIX_DATABASE_COUNT;
47
/// Summary returned by [`IndexDocuments::execute`].
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct DocumentAdditionResult {
    /// Number of documents processed by this update (`0` when nothing was added).
    pub indexed_documents: u64,
    /// Value of `Index::number_of_documents` once the update has been applied.
    pub number_of_documents: u64,
}
55
56#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
57#[non_exhaustive]
58pub enum IndexDocumentsMethod {
59 ReplaceDocuments,
62
63 UpdateDocuments,
66}
67
68impl Default for IndexDocumentsMethod {
69 fn default() -> Self {
70 Self::ReplaceDocuments
71 }
72}
73
/// Builder-style handle driving one document addition on an [`Index`].
///
/// Documents are fed through `add_documents` and the accumulated batch is
/// written to the index by `execute`.
pub struct IndexDocuments<'t, 'i, 'a, FP, FA> {
    /// Write transaction used for every read and write performed by the update.
    wtxn: &'t mut heed::RwTxn<'i>,
    /// Index receiving the documents.
    index: &'i Index,
    /// Options of this document addition.
    config: IndexDocumentsConfig,
    /// Indexer-wide settings (thread pool, grenad parameters, chunk size, ...).
    indexer_config: &'a IndexerConfig,
    /// Accumulates the added documents; taken (set to `None`) by `execute`.
    transform: Option<Transform<'a, 'i>>,
    /// Callback invoked with every progress step.
    progress: FP,
    /// Callback polled between steps; returning `true` aborts the indexation.
    should_abort: FA,
    /// Running total of the documents accepted by `add_documents`.
    added_documents: u64,
    /// Initialised to `0`; together with `added_documents` it decides whether
    /// `execute` has any work to do.
    deleted_documents: u64,
    /// Embedder configurations supplied through `with_embedders`.
    embedders: EmbeddingConfigs,
}
86
/// Options of a single document addition.
#[derive(Default, Debug, Clone)]
pub struct IndexDocumentsConfig {
    // NOTE(review): the two `words_positions_*` fields are not read anywhere
    // in this part of the file — confirm where they are consumed.
    pub words_positions_level_group_size: Option<NonZeroU32>,
    pub words_positions_min_level_size: Option<NonZeroU32>,
    /// Forwarded to `Transform::new`; selects replace vs. merge semantics.
    pub update_method: IndexDocumentsMethod,
    /// Forwarded to `Transform::new` and `enrich_documents_batch`.
    pub autogenerate_docids: bool,
}
94
95impl<'t, 'i, 'a, FP, FA> IndexDocuments<'t, 'i, 'a, FP, FA>
96where
97 FP: Fn(UpdateIndexingStep) + Sync + Send,
98 FA: Fn() -> bool + Sync + Send,
99{
100 pub fn new(
101 wtxn: &'t mut heed::RwTxn<'i>,
102 index: &'i Index,
103 indexer_config: &'a IndexerConfig,
104 config: IndexDocumentsConfig,
105 progress: FP,
106 should_abort: FA,
107 ) -> Result<IndexDocuments<'t, 'i, 'a, FP, FA>> {
108 let transform = Some(Transform::new(
109 wtxn,
110 index,
111 indexer_config,
112 config.update_method,
113 config.autogenerate_docids,
114 )?);
115
116 Ok(IndexDocuments {
117 transform,
118 config,
119 indexer_config,
120 progress,
121 should_abort,
122 wtxn,
123 index,
124 added_documents: 0,
125 deleted_documents: 0,
126 embedders: Default::default(),
127 })
128 }
129
130 #[tracing::instrument(level = "trace", skip_all, target = "indexing::documents")]
137 pub fn add_documents<R: Read + Seek>(
138 mut self,
139 reader: DocumentsBatchReader<R>,
140 ) -> Result<(Self, StdResult<u64, UserError>)> {
141 if reader.is_empty() {
143 return Ok((self, Ok(0)));
144 }
145
146 let enriched_documents_reader = match enrich_documents_batch(
150 self.wtxn,
151 self.index,
152 self.config.autogenerate_docids,
153 reader,
154 )? {
155 Ok(reader) => reader,
156 Err(user_error) => return Ok((self, Err(user_error))),
157 };
158
159 let indexed_documents =
160 self.transform.as_mut().expect("Invalid document addition state").read_documents(
161 enriched_documents_reader,
162 self.wtxn,
163 &self.progress,
164 &self.should_abort,
165 )? as u64;
166
167 self.added_documents += indexed_documents;
168
169 Ok((self, Ok(indexed_documents)))
170 }
171
172 pub fn with_embedders(mut self, embedders: EmbeddingConfigs) -> Self {
173 self.embedders = embedders;
174 self
175 }
176
177 #[tracing::instrument(
178 level = "trace"
179 skip_all,
180 target = "indexing::documents",
181 name = "index_documents"
182 )]
183 pub fn execute(mut self) -> Result<DocumentAdditionResult> {
184 if self.added_documents == 0 && self.deleted_documents == 0 {
185 let number_of_documents = self.index.number_of_documents(self.wtxn)?;
186 return Ok(DocumentAdditionResult { indexed_documents: 0, number_of_documents });
187 }
188 let output = self
189 .transform
190 .take()
191 .expect("Invalid document addition state")
192 .output_from_sorter(self.wtxn, &self.progress)?;
193
194 let indexed_documents = output.documents_count as u64;
195 let number_of_documents = self.execute_raw(output)?;
196
197 Ok(DocumentAdditionResult { indexed_documents, number_of_documents })
198 }
199
200 #[tracing::instrument(
202 level = "trace",
203 skip_all,
204 target = "indexing::details",
205 name = "index_documents_raw"
206 )]
207 pub fn execute_raw(self, output: TransformOutput) -> Result<u64>
208 where
209 FP: Fn(UpdateIndexingStep) + Sync,
210 FA: Fn() -> bool + Sync,
211 {
212 let TransformOutput {
213 primary_key,
214 mut settings_diff,
215 field_distribution,
216 documents_count,
217 original_documents,
218 flattened_documents,
219 } = output;
220
221 settings_diff.new.recompute_searchables(self.wtxn, self.index)?;
224
225 let settings_diff = Arc::new(settings_diff);
226 let embedders_configs = Arc::new(self.index.embedding_configs(self.wtxn)?);
227
228 let possible_embedding_mistakes =
229 crate::vector::error::PossibleEmbeddingMistakes::new(&field_distribution);
230
231 let backup_pool;
232 let pool = match self.indexer_config.thread_pool {
233 Some(ref pool) => pool,
234 None => {
235 #[allow(unused_mut)]
238 let mut pool_builder = ThreadPoolNoAbortBuilder::new();
239
240 #[cfg(test)]
241 {
242 pool_builder = pool_builder.num_threads(1);
243 }
244
245 backup_pool = pool_builder.build()?;
246 &backup_pool
247 }
248 };
249
250 let (lmdb_writer_sx, lmdb_writer_rx): (
252 Sender<Result<TypedChunk>>,
253 Receiver<Result<TypedChunk>>,
254 ) = crossbeam_channel::unbounded();
255
256 let primary_key_id = settings_diff.new.fields_ids_map.id(&primary_key).unwrap();
258
259 let pool_params = GrenadParameters {
260 chunk_compression_type: self.indexer_config.chunk_compression_type,
261 chunk_compression_level: self.indexer_config.chunk_compression_level,
262 max_memory: self.indexer_config.max_memory,
263 max_nb_chunks: self.indexer_config.max_nb_chunks, };
265 let documents_chunk_size = match self.indexer_config.documents_chunk_size {
266 Some(chunk_size) => chunk_size,
267 None => {
268 let default_chunk_size = 1024 * 1024 * 4; let min_chunk_size = 1024 * 512; let total_size = match flattened_documents.as_ref() {
273 Some(flattened_documents) => flattened_documents.metadata().map(|m| m.len()),
274 None => Ok(default_chunk_size as u64),
275 };
276 let current_num_threads = pool.current_num_threads();
277 let chunk_count = if current_num_threads > 2 {
279 (current_num_threads * 3 / 4).max(2)
280 } else {
281 current_num_threads
282 };
283 total_size
284 .map_or(default_chunk_size, |size| (size as usize) / chunk_count)
285 .max(min_chunk_size)
286 }
287 };
288
289 let original_documents = match original_documents {
290 Some(original_documents) => Some(grenad::Reader::new(original_documents)?),
291 None => None,
292 };
293 let flattened_documents = match flattened_documents {
294 Some(flattened_documents) => Some(grenad::Reader::new(flattened_documents)?),
295 None => None,
296 };
297
298 let max_positions_per_attributes = self.indexer_config.max_positions_per_attributes;
299
300 let mut final_documents_ids = RoaringBitmap::new();
301 let mut databases_seen = 0;
302 let mut word_position_docids = None;
303 let mut word_fid_docids = None;
304 let mut word_docids = None;
305 let mut exact_word_docids = None;
306 let mut chunk_accumulator = ChunkAccumulator::default();
307 let mut dimension = HashMap::new();
308
309 let current_span = tracing::Span::current();
310
311 let mut modified_docids = RoaringBitmap::new();
313 pool.install(|| {
314 let settings_diff_cloned = settings_diff.clone();
315 rayon::spawn(move || {
316 let child_span = tracing::trace_span!(target: "indexing::details", parent: ¤t_span, "extract_and_send_grenad_chunks");
317 let _enter = child_span.enter();
318
319 let original_chunk_iter = match original_documents {
321 Some(original_documents) => {
322 grenad_obkv_into_chunks(original_documents,pool_params,documents_chunk_size).map(either::Left)
323 },
324 None => Ok(either::Right(iter::empty())),
325 };
326
327 let flattened_chunk_iter = match flattened_documents {
329 Some(flattened_documents) => {
330 grenad_obkv_into_chunks(flattened_documents, pool_params, documents_chunk_size).map(either::Left)
331 },
332 None => Ok(either::Right(iter::empty())),
333 };
334
335 let result = original_chunk_iter.and_then(|original_chunk| {
336 let flattened_chunk = flattened_chunk_iter?;
337 extract::data_from_obkv_documents(
339 original_chunk,
340 flattened_chunk,
341 pool_params,
342 lmdb_writer_sx.clone(),
343 primary_key_id,
344 embedders_configs.clone(),
345 settings_diff_cloned,
346 max_positions_per_attributes,
347 Arc::new(possible_embedding_mistakes)
348 )
349 });
350
351 if let Err(e) = result {
352 let _ = lmdb_writer_sx.send(Err(e));
353 }
354
355 drop(lmdb_writer_sx);
357 });
358
359 (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
360 databases_seen,
361 total_databases: TOTAL_POSTING_DATABASE_COUNT,
362 });
363
364 loop {
365 if (self.should_abort)() {
366 return Err(Error::InternalError(InternalError::AbortedIndexation));
367 }
368
369 match lmdb_writer_rx.clone().recv_timeout(std::time::Duration::from_millis(500)) {
370 Err(status) => {
371 if let Some(typed_chunks) = chunk_accumulator.pop_longest() {
372 let (docids, is_merged_database) =
373 write_typed_chunk_into_index(self.wtxn, self.index, &settings_diff, typed_chunks, &mut modified_docids)?;
374 if !docids.is_empty() {
375 final_documents_ids |= docids;
376 let documents_seen_count = final_documents_ids.len();
377 (self.progress)(UpdateIndexingStep::IndexDocuments {
378 documents_seen: documents_seen_count as usize,
379 total_documents: documents_count,
380 });
381 debug!(documents = documents_seen_count, total = documents_count, "Seen");
382 }
383 if is_merged_database {
384 databases_seen += 1;
385 (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
386 databases_seen,
387 total_databases: TOTAL_POSTING_DATABASE_COUNT,
388 });
389 }
390 } else if status == crossbeam_channel::RecvTimeoutError::Disconnected {
392 break;
393 } else {
394 rayon::yield_now();
395 }
396 }
397 Ok(result) => {
398 let typed_chunk = match result? {
399 TypedChunk::WordDocids {
400 word_docids_reader,
401 exact_word_docids_reader,
402 word_fid_docids_reader,
403 } => {
404 let cloneable_chunk =
405 unsafe { as_cloneable_grenad(&word_docids_reader)? };
406 let word_docids = word_docids.get_or_insert_with(|| {
407 MergerBuilder::new(MergeDeladdCboRoaringBitmaps)
408 });
409 word_docids.push(cloneable_chunk.into_cursor()?);
410 let cloneable_chunk =
411 unsafe { as_cloneable_grenad(&exact_word_docids_reader)? };
412 let exact_word_docids =
413 exact_word_docids.get_or_insert_with(|| {
414 MergerBuilder::new(
415 MergeDeladdCboRoaringBitmaps,
416 )
417 });
418 exact_word_docids.push(cloneable_chunk.into_cursor()?);
419 let cloneable_chunk =
420 unsafe { as_cloneable_grenad(&word_fid_docids_reader)? };
421 let word_fid_docids = word_fid_docids.get_or_insert_with(|| {
422 MergerBuilder::new(MergeDeladdCboRoaringBitmaps)
423 });
424 word_fid_docids.push(cloneable_chunk.into_cursor()?);
425 TypedChunk::WordDocids {
426 word_docids_reader,
427 exact_word_docids_reader,
428 word_fid_docids_reader,
429 }
430 }
431 TypedChunk::WordPositionDocids(chunk) => {
432 let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
433 let word_position_docids =
434 word_position_docids.get_or_insert_with(|| {
435 MergerBuilder::new(
436 MergeDeladdCboRoaringBitmaps,
437 )
438 });
439 word_position_docids.push(cloneable_chunk.into_cursor()?);
440 TypedChunk::WordPositionDocids(chunk)
441 }
442 TypedChunk::VectorPoints {
443 expected_dimension,
444 remove_vectors,
445 embeddings,
446 manual_vectors,
447 embedder_name,
448 add_to_user_provided,
449 remove_from_user_provided,
450 } => {
451 dimension.insert(embedder_name.clone(), expected_dimension);
452 TypedChunk::VectorPoints {
453 remove_vectors,
454 embeddings,
455 expected_dimension,
456 manual_vectors,
457 embedder_name,
458 add_to_user_provided,
459 remove_from_user_provided,
460 }
461 }
462 otherwise => otherwise,
463 };
464
465 chunk_accumulator.insert(typed_chunk);
466 }
467 }
468 }
469
470 if settings_diff.settings_update_only() {
472 clear_facet_levels_based_on_settings_diff(self.wtxn, self.index, &settings_diff)?;
473 }
474
475 Ok(())
476 }).map_err(InternalError::from)??;
477
478 if !settings_diff.settings_update_only {
479 let stats = DatabaseStats::new(self.index.documents.remap_data_type(), self.wtxn)?;
481 self.index.put_documents_stats(self.wtxn, stats)?;
482 }
483 self.index.put_field_distribution(self.wtxn, &field_distribution)?;
485
486 self.index.put_primary_key(self.wtxn, &primary_key)?;
488 let number_of_documents = self.index.number_of_documents(self.wtxn)?;
489 let mut rng = rand::rngs::StdRng::seed_from_u64(42);
490
491 for (name, action) in settings_diff.embedding_config_updates.iter() {
494 if action.is_being_quantized && !dimension.contains_key(name.as_str()) {
495 let index = self.index.embedder_category_id.get(self.wtxn, name)?.ok_or(
496 InternalError::DatabaseMissingEntry {
497 db_name: "embedder_category_id",
498 key: None,
499 },
500 )?;
501 let reader =
502 ArroyWrapper::new(self.index.vector_arroy, index, action.was_quantized);
503 let dim = reader.dimensions(self.wtxn)?;
504 dimension.insert(name.to_string(), dim);
505 }
506 }
507
508 for (embedder_name, dimension) in dimension {
509 let wtxn = &mut *self.wtxn;
510 let vector_arroy = self.index.vector_arroy;
511 let cancel = &self.should_abort;
512
513 let embedder_index = self.index.embedder_category_id.get(wtxn, &embedder_name)?.ok_or(
514 InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None },
515 )?;
516 let embedder_config = settings_diff.embedding_config_updates.get(&embedder_name);
517 let was_quantized =
518 settings_diff.old.embedding_configs.get(&embedder_name).is_some_and(|conf| conf.2);
519 let is_quantizing = embedder_config.is_some_and(|action| action.is_being_quantized);
520
521 pool.install(|| {
522 let mut writer = ArroyWrapper::new(vector_arroy, embedder_index, was_quantized);
523 writer.build_and_quantize(
524 wtxn,
525 &Progress::default(),
527 &mut rng,
528 dimension,
529 is_quantizing,
530 self.indexer_config.max_memory,
531 cancel,
532 )?;
533 Result::Ok(())
534 })
535 .map_err(InternalError::from)??;
536 }
537
538 self.execute_prefix_databases(
539 word_docids.map(MergerBuilder::build),
540 exact_word_docids.map(MergerBuilder::build),
541 word_position_docids.map(MergerBuilder::build),
542 word_fid_docids.map(MergerBuilder::build),
543 )?;
544
545 Ok(number_of_documents)
546 }
547
548 #[tracing::instrument(
549 level = "trace",
550 skip_all,
551 target = "indexing::prefix",
552 name = "index_documents_prefix_databases"
553 )]
554 pub fn execute_prefix_databases(
555 self,
556 word_docids: Option<Merger<CursorClonableMmap, MergeDeladdCboRoaringBitmaps>>,
557 exact_word_docids: Option<Merger<CursorClonableMmap, MergeDeladdCboRoaringBitmaps>>,
558 word_position_docids: Option<Merger<CursorClonableMmap, MergeDeladdCboRoaringBitmaps>>,
559 word_fid_docids: Option<Merger<CursorClonableMmap, MergeDeladdCboRoaringBitmaps>>,
560 ) -> Result<()>
561 where
562 FP: Fn(UpdateIndexingStep) + Sync,
563 FA: Fn() -> bool + Sync,
564 {
565 let mut databases_seen = MERGED_DATABASE_COUNT;
567
568 if (self.should_abort)() {
569 return Err(Error::InternalError(InternalError::AbortedIndexation));
570 }
571
572 databases_seen += 1;
573 (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
574 databases_seen,
575 total_databases: TOTAL_POSTING_DATABASE_COUNT,
576 });
577
578 if (self.should_abort)() {
579 return Err(Error::InternalError(InternalError::AbortedIndexation));
580 }
581
582 let previous_words_prefixes_fst =
583 self.index.words_prefixes_fst(self.wtxn)?.map_data(|cow| cow.into_owned())?;
584
585 let PrefixSettings { prefix_count_threshold, max_prefix_length, compute_prefixes } =
587 self.index.prefix_settings(self.wtxn)?;
588
589 if compute_prefixes == PrefixSearch::IndexingTime {
591 let mut builder = WordsPrefixesFst::new(self.wtxn, self.index);
592 builder.threshold(prefix_count_threshold);
593 builder.max_prefix_length(max_prefix_length);
594 builder.execute()?;
595 } else {
596 self.index.delete_words_prefixes_fst(self.wtxn)?;
599 self.index.word_prefix_docids.clear(self.wtxn)?;
600 self.index.exact_word_prefix_docids.clear(self.wtxn)?;
601 self.index.word_prefix_position_docids.clear(self.wtxn)?;
602 self.index.word_prefix_fid_docids.clear(self.wtxn)?;
603
604 databases_seen += 3;
605 (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
606 databases_seen,
607 total_databases: TOTAL_POSTING_DATABASE_COUNT,
608 });
609
610 return Ok(());
611 }
612
613 if (self.should_abort)() {
614 return Err(Error::InternalError(InternalError::AbortedIndexation));
615 }
616
617 let current_prefix_fst;
618 let common_prefix_fst_words_tmp;
619 let common_prefix_fst_words: Vec<_>;
620 let new_prefix_fst_words;
621 let del_prefix_fst_words;
622
623 {
624 let span = tracing::trace_span!(target: "indexing::details", "compute_prefix_diffs");
625 let _entered = span.enter();
626
627 current_prefix_fst = self.index.words_prefixes_fst(self.wtxn)?;
628
629 common_prefix_fst_words_tmp = fst_stream_into_vec(
631 previous_words_prefixes_fst.op().add(¤t_prefix_fst).intersection(),
632 );
633 common_prefix_fst_words = common_prefix_fst_words_tmp
634 .as_slice()
635 .linear_group_by_key(|x| x.chars().next().unwrap())
636 .collect();
637
638 new_prefix_fst_words = fst_stream_into_vec(
640 current_prefix_fst.op().add(&previous_words_prefixes_fst).difference(),
641 );
642
643 del_prefix_fst_words = fst_stream_into_hashset(
645 previous_words_prefixes_fst.op().add(¤t_prefix_fst).difference(),
646 );
647 }
648
649 databases_seen += 1;
650 (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
651 databases_seen,
652 total_databases: TOTAL_POSTING_DATABASE_COUNT,
653 });
654
655 if (self.should_abort)() {
656 return Err(Error::InternalError(InternalError::AbortedIndexation));
657 }
658
659 if let Some(word_docids) = word_docids {
660 execute_word_prefix_docids(
661 self.wtxn,
662 word_docids,
663 self.index.word_docids,
664 self.index.word_prefix_docids,
665 self.indexer_config,
666 &new_prefix_fst_words,
667 &common_prefix_fst_words,
668 &del_prefix_fst_words,
669 )?;
670 }
671
672 if let Some(exact_word_docids) = exact_word_docids {
673 execute_word_prefix_docids(
674 self.wtxn,
675 exact_word_docids,
676 self.index.exact_word_docids,
677 self.index.exact_word_prefix_docids,
678 self.indexer_config,
679 &new_prefix_fst_words,
680 &common_prefix_fst_words,
681 &del_prefix_fst_words,
682 )?;
683 }
684
685 if (self.should_abort)() {
686 return Err(Error::InternalError(InternalError::AbortedIndexation));
687 }
688
689 databases_seen += 1;
690 (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
691 databases_seen,
692 total_databases: TOTAL_POSTING_DATABASE_COUNT,
693 });
694
695 if let Some(word_position_docids) = word_position_docids {
696 let mut builder = WordPrefixIntegerDocids::new(
698 self.wtxn,
699 self.index.word_prefix_position_docids,
700 self.index.word_position_docids,
701 );
702 builder.chunk_compression_type = self.indexer_config.chunk_compression_type;
703 builder.chunk_compression_level = self.indexer_config.chunk_compression_level;
704 builder.max_nb_chunks = self.indexer_config.max_nb_chunks;
705 builder.max_memory = self.indexer_config.max_memory;
706
707 builder.execute(
708 word_position_docids,
709 &new_prefix_fst_words,
710 &common_prefix_fst_words,
711 &del_prefix_fst_words,
712 )?;
713 }
714 if let Some(word_fid_docids) = word_fid_docids {
715 let mut builder = WordPrefixIntegerDocids::new(
717 self.wtxn,
718 self.index.word_prefix_fid_docids,
719 self.index.word_fid_docids,
720 );
721 builder.chunk_compression_type = self.indexer_config.chunk_compression_type;
722 builder.chunk_compression_level = self.indexer_config.chunk_compression_level;
723 builder.max_nb_chunks = self.indexer_config.max_nb_chunks;
724 builder.max_memory = self.indexer_config.max_memory;
725 builder.execute(
726 word_fid_docids,
727 &new_prefix_fst_words,
728 &common_prefix_fst_words,
729 &del_prefix_fst_words,
730 )?;
731 }
732
733 if (self.should_abort)() {
734 return Err(Error::InternalError(InternalError::AbortedIndexation));
735 }
736
737 databases_seen += 1;
738 (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
739 databases_seen,
740 total_databases: TOTAL_POSTING_DATABASE_COUNT,
741 });
742
743 Ok(())
744 }
745}
746
747#[allow(clippy::too_many_arguments)]
749#[tracing::instrument(
750 level = "trace",
751 skip_all,
752 target = "indexing::prefix",
753 name = "index_documents_word_prefix_docids"
754)]
755fn execute_word_prefix_docids(
756 txn: &mut heed::RwTxn<'_>,
757 merger: Merger<CursorClonableMmap, MergeDeladdCboRoaringBitmaps>,
758 word_docids_db: Database<Str, CboRoaringBitmapCodec>,
759 word_prefix_docids_db: Database<Str, CboRoaringBitmapCodec>,
760 indexer_config: &IndexerConfig,
761 new_prefix_fst_words: &[String],
762 common_prefix_fst_words: &[&[String]],
763 del_prefix_fst_words: &HashSet<Vec<u8>>,
764) -> Result<()> {
765 let mut builder = WordPrefixDocids::new(txn, word_docids_db, word_prefix_docids_db);
766 builder.chunk_compression_type = indexer_config.chunk_compression_type;
767 builder.chunk_compression_level = indexer_config.chunk_compression_level;
768 builder.max_nb_chunks = indexer_config.max_nb_chunks;
769 builder.max_memory = indexer_config.max_memory;
770 builder.execute(merger, new_prefix_fst_words, common_prefix_fst_words, del_prefix_fst_words)?;
771 Ok(())
772}
773
774#[cfg(test)]
775mod tests {
776 use std::collections::BTreeMap;
777
778 use big_s::S;
779 use bumpalo::Bump;
780 use fst::IntoStreamer;
781 use heed::RwTxn;
782 use maplit::hashset;
783
784 use super::*;
785 use crate::constants::RESERVED_GEO_FIELD_NAME;
786 use crate::documents::mmap_from_objects;
787 use crate::index::tests::TempIndex;
788 use crate::index::IndexEmbeddingConfig;
789 use crate::progress::Progress;
790 use crate::search::TermsMatchingStrategy;
791 use crate::update::new::indexer;
792 use crate::update::Setting;
793 use crate::{all_obkv_to_json, db_snap, Filter, FilterableAttributesRule, Search, UserError};
794
795 #[test]
796 fn simple_document_replacement() {
797 let index = TempIndex::new();
798
799 index
801 .add_documents(documents!([
802 { "id": 1, "name": "kevin" },
803 { "id": 2, "name": "kevina" },
804 { "id": 3, "name": "benoit" }
805 ]))
806 .unwrap();
807
808 let rtxn = index.read_txn().unwrap();
810 let count = index.number_of_documents(&rtxn).unwrap();
811 assert_eq!(count, 3);
812 drop(rtxn);
813
814 index.add_documents(documents!([ { "id": 1, "name": "updated kevin" } ])).unwrap();
816
817 let rtxn = index.read_txn().unwrap();
819 let count = index.number_of_documents(&rtxn).unwrap();
820 assert_eq!(count, 3);
821 drop(rtxn);
822
823 index
825 .add_documents(documents!([
826 { "id": 1, "name": "updated second kevin" },
827 { "id": 2, "name": "updated kevina" },
828 { "id": 3, "name": "updated benoit" }
829 ]))
830 .unwrap();
831
832 let rtxn = index.read_txn().unwrap();
834 let count = index.number_of_documents(&rtxn).unwrap();
835 assert_eq!(count, 3);
836 let count = index.all_documents(&rtxn).unwrap().count();
837 assert_eq!(count, 3);
838
839 drop(rtxn);
840 }
841
842 #[test]
843 fn simple_document_merge() {
844 let mut index = TempIndex::new();
845 index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments;
846
847 index
850 .add_documents(documents!([
851 { "id": 1, "name": "kevin" },
852 { "id": 1, "name": "kevina" },
853 { "id": 1, "name": "benoit" }
854 ]))
855 .unwrap();
856
857 let rtxn = index.read_txn().unwrap();
859 let count = index.number_of_documents(&rtxn).unwrap();
860 assert_eq!(count, 1);
861
862 let docs = index.documents(&rtxn, Some(0)).unwrap();
864 assert_eq!(docs.len(), 1);
865 let (id, doc) = docs[0];
866 assert_eq!(id, 0);
867
868 let mut doc_iter = doc.iter();
870 assert_eq!(doc_iter.next(), Some((0, &b"1"[..])));
871 assert_eq!(doc_iter.next(), Some((1, &br#""benoit""#[..])));
872 assert_eq!(doc_iter.next(), None);
873 drop(rtxn);
874
875 index.add_documents(documents!([ { "id": 1, "age": 25 } ])).unwrap();
877
878 let rtxn = index.read_txn().unwrap();
880 let count = index.number_of_documents(&rtxn).unwrap();
881 assert_eq!(count, 1);
882
883 let docs = index.documents(&rtxn, Some(0)).unwrap();
885 assert_eq!(docs.len(), 1);
886 let (id, doc) = docs[0];
887 assert_eq!(id, 0);
888
889 let mut doc_iter = doc.iter();
891 assert_eq!(doc_iter.next(), Some((0, &b"1"[..])));
892 assert_eq!(doc_iter.next(), Some((1, &br#""benoit""#[..])));
893 assert_eq!(doc_iter.next(), Some((2, &b"25"[..])));
894 assert_eq!(doc_iter.next(), None);
895 drop(rtxn);
896 }
897
898 #[test]
899 fn empty_update() {
900 let index = TempIndex::new();
901
902 index.add_documents(documents!([])).unwrap();
904
905 let rtxn = index.read_txn().unwrap();
907 let count = index.number_of_documents(&rtxn).unwrap();
908 assert_eq!(count, 0);
909 drop(rtxn);
910 }
911
912 #[test]
913 fn invalid_documents_ids() {
914 let index = TempIndex::new();
915
916 index.add_documents(documents!([ { "id": "brume bleue", "name": "kevin" } ])).unwrap_err();
919
920 index.add_documents(documents!([ { "id": 32, "name": "kevin" } ])).unwrap();
922
923 let rtxn = index.read_txn().unwrap();
925 let count = index.number_of_documents(&rtxn).unwrap();
926 assert_eq!(count, 1);
927 drop(rtxn);
928 }
929
930 #[test]
931 fn complex_documents() {
932 let index = TempIndex::new();
933
934 index
936 .add_documents(documents!([
937 { "id": 0, "name": "kevin", "object": { "key1": "value1", "key2": "value2" } },
938 { "id": 1, "name": "kevina", "array": ["I", "am", "fine"] },
939 { "id": 2, "name": "benoit", "array_of_object": [{ "wow": "amazing" }] }
940 ]))
941 .unwrap();
942
943 let rtxn = index.read_txn().unwrap();
945
946 let result = index.search(&rtxn).query(r#""value2""#).execute().unwrap();
948 assert_eq!(result.documents_ids, vec![0]);
949
950 let result = index.search(&rtxn).query(r#""fine""#).execute().unwrap();
952 assert_eq!(result.documents_ids, vec![1]);
953
954 let result = index.search(&rtxn).query(r#""amazing""#).execute().unwrap();
956 assert_eq!(result.documents_ids, vec![2]);
957
958 drop(rtxn);
959 }
960
961 #[test]
962 fn simple_documents_replace() {
963 let mut index = TempIndex::new();
964 index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments;
965
966 index.add_documents(documents!([
967 { "id": 2, "title": "Pride and Prejudice", "author": "Jane Austin", "genre": "romance", "price": 3.5, RESERVED_GEO_FIELD_NAME: { "lat": 12, "lng": 42 } },
968 { "id": 456, "title": "Le Petit Prince", "author": "Antoine de Saint-Exupéry", "genre": "adventure" , "price": 10.0 },
969 { "id": 1, "title": "Alice In Wonderland", "author": "Lewis Carroll", "genre": "fantasy", "price": 25.99 },
970 { "id": 1344, "title": "The Hobbit", "author": "J. R. R. Tolkien", "genre": "fantasy" },
971 { "id": 4, "title": "Harry Potter and the Half-Blood Prince", "author": "J. K. Rowling", "genre": "fantasy" },
972 { "id": 42, "title": "The Hitchhiker's Guide to the Galaxy", "author": "Douglas Adams", RESERVED_GEO_FIELD_NAME: { "lat": 35, "lng": 23 } }
973 ])).unwrap();
974
975 db_snap!(index, word_docids, "initial");
976
977 index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments;
978
979 index
980 .add_documents(documents!([
981 {"id":4,"title":"Harry Potter and the Half-Blood Princess"},
982 {"id":456,"title":"The Little Prince"}
983 ]))
984 .unwrap();
985
986 index
987 .add_documents(documents!([
988 { "id": 2, "author": "J. Austen", "date": "1813" }
989 ]))
990 .unwrap();
991
992 let rtxn = index.read_txn().unwrap();
994 let count = index.number_of_documents(&rtxn).unwrap();
995 assert_eq!(count, 6);
996 let count = index.all_documents(&rtxn).unwrap().count();
997 assert_eq!(count, 6);
998
999 db_snap!(index, word_docids, "updated");
1000
1001 drop(rtxn);
1002 }
1003
1004 #[test]
1005 fn mixed_geo_documents() {
1006 let mut index = TempIndex::new();
1007 index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments;
1008
1009 index
1011 .add_documents(documents!([
1012 { "id": 2, "price": 3.5, RESERVED_GEO_FIELD_NAME: { "lat": 12, "lng": 42 } },
1013 { "id": 456 },
1014 { "id": 1 },
1015 { "id": 1344 },
1016 { "id": 4 },
1017 { "id": 42, RESERVED_GEO_FIELD_NAME: { "lat": 35, "lng": 23 } }
1018 ]))
1019 .unwrap();
1020
1021 index
1022 .update_settings(|settings| {
1023 settings.set_filterable_fields(vec![FilterableAttributesRule::Field(
1024 RESERVED_GEO_FIELD_NAME.to_string(),
1025 )]);
1026 })
1027 .unwrap();
1028 }
1029
1030 #[test]
1031 fn geo_error() {
1032 let mut index = TempIndex::new();
1033 index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments;
1034
1035 index
1036 .update_settings(|settings| {
1037 settings.set_filterable_fields(vec![FilterableAttributesRule::Field(
1038 RESERVED_GEO_FIELD_NAME.to_string(),
1039 )]);
1040 })
1041 .unwrap();
1042
1043 let error = index
1044 .add_documents(documents!([
1045 { "id": 0, RESERVED_GEO_FIELD_NAME: { "lng": 42 } }
1046 ]))
1047 .unwrap_err();
1048 assert_eq!(
1049 &error.to_string(),
1050 r#"Could not find latitude in the document with the id: `"0"`. Was expecting a `_geo.lat` field."#
1051 );
1052
1053 let error = index
1054 .add_documents(documents!([
1055 { "id": 0, RESERVED_GEO_FIELD_NAME: { "lat": 42 } }
1056 ]))
1057 .unwrap_err();
1058 assert_eq!(
1059 &error.to_string(),
1060 r#"Could not find longitude in the document with the id: `"0"`. Was expecting a `_geo.lng` field."#
1061 );
1062
1063 let error = index
1064 .add_documents(documents!([
1065 { "id": 0, RESERVED_GEO_FIELD_NAME: { "lat": "lol", "lng": 42 } }
1066 ]))
1067 .unwrap_err();
1068 assert_eq!(
1069 &error.to_string(),
1070 r#"Could not parse latitude in the document with the id: `"0"`. Was expecting a finite number but instead got `"lol"`."#
1071 );
1072
1073 let error = index
1074 .add_documents(documents!([
1075 { "id": 0, RESERVED_GEO_FIELD_NAME: { "lat": [12, 13], "lng": 42 } }
1076 ]))
1077 .unwrap_err();
1078 assert_eq!(
1079 &error.to_string(),
1080 r#"Could not parse latitude in the document with the id: `"0"`. Was expecting a finite number but instead got `[12,13]`."#
1081 );
1082
1083 let error = index
1084 .add_documents(documents!([
1085 { "id": 0, RESERVED_GEO_FIELD_NAME: { "lat": 12, "lng": "hello" } }
1086 ]))
1087 .unwrap_err();
1088 assert_eq!(
1089 &error.to_string(),
1090 r#"Could not parse longitude in the document with the id: `"0"`. Was expecting a finite number but instead got `"hello"`."#
1091 );
1092 }
1093
/// Adds four documents, deletes the one whose external id is `30`, checks that
/// the id is no longer known, then inserts that same document again (twice).
#[test]
fn delete_documents_then_insert() {
    let index = TempIndex::new();

    let library = documents!([
        { "objectId": 123, "title": "Pride and Prejudice", "comment": "A great book" },
        { "objectId": 456, "title": "Le Petit Prince", "comment": "A french book" },
        { "objectId": 1, "title": "Alice In Wonderland", "comment": "A weird book" },
        { "objectId": 30, "title": "Hamlet", RESERVED_GEO_FIELD_NAME: { "lat": 12, "lng": 89 } }
    ]);
    index.add_documents(library).unwrap();

    index.delete_document("30");

    // The primary key was inferred, and the deleted external id is gone.
    let txn = index.read_txn().unwrap();
    let primary_key = index.primary_key(&txn).unwrap();
    assert_eq!(primary_key, Some("objectId"));
    let deleted_entry = index.external_documents_ids().get(&txn, "30").unwrap();
    assert!(deleted_entry.is_none());

    // Builds a fresh batch holding only the document that was just deleted.
    let hamlet_only = || {
        documents!([
            { "objectId": 30, "title": "Hamlet", RESERVED_GEO_FIELD_NAME: { "lat": 12, "lng": 89 } }
        ])
    };

    index.add_documents(hamlet_only()).unwrap();

    // The external id must be registered again after the re-insertion.
    let wtxn = index.write_txn().unwrap();
    let restored_entry = index.external_documents_ids().get(&wtxn, "30").unwrap();
    assert!(restored_entry.is_some());
    wtxn.commit().unwrap();

    // Sending the very same document a second time must succeed as well.
    index.add_documents(hamlet_only()).unwrap();
}
1133
/// Indexes a single document made of an `id` plus 1000 numbered text fields.
#[test]
fn index_more_than_256_fields() {
    let index = TempIndex::new();

    let mut wide_document = serde_json::Map::new();
    wide_document.insert(S("id"), serde_json::Value::from("wow"));
    // Field names are simply the decimal numbers "0" through "999".
    wide_document.extend(
        (0..1000).map(|n| (n.to_string(), serde_json::Value::from("I am a text!"))),
    );

    index.add_documents(mmap_from_objects([wide_document])).unwrap();
}
1148
/// Indexes one field containing every number from 0 to `u16::MAX` as separate
/// words, then checks that a sample of those words made it into `word_docids`.
#[test]
fn index_more_than_1000_positions_in_a_field() {
    let index = TempIndex::new_with_map_size(4096 * 100_000);

    // Space-separated list "0 1 2 ... 65535 ".
    let mut content = String::with_capacity(382101);
    for number in 0..=u16::MAX {
        content.push_str(&number.to_string());
        content.push(' ');
    }

    let document = documents!({
        "id": "wow",
        "content": content
    });
    index.add_documents(document).unwrap();

    let rtxn = index.read_txn().unwrap();

    for word in ["0", "64", "256", "1024", "32768", "65535"] {
        let docids = index.word_docids.get(&rtxn, word).unwrap();
        assert!(docids.is_some());
    }
}
1172
/// Indexes documents whose keys contain unusual characters and whose values
/// contain NUL bytes; the only expectation is that indexing succeeds.
#[test]
fn index_documents_with_zeroes() {
    let index = TempIndex::new();

    let books = documents!([
        {
            "id": 2,
            "title": "Prideand Prejudice",
            "au{hor": "Jane Austin",
            "genre": "romance",
            "price$": "3.5$",
        },
        {
            "id": 456,
            "title": "Le Petit Prince",
            "au{hor": "Antoine de Saint-Exupéry",
            "genre": "adventure",
            "price$": "10.0$",
        },
        {
            "id": 1,
            "title": "Wonderland",
            "au{hor": "Lewis Carroll",
            "genre": "fantasy",
            "price$": "25.99$",
        },
        {
            "id": 4,
            "title": "Harry Potter ing fantasy\0lood Prince",
            "au{hor": "J. K. Rowling",
            "genre": "fantasy\0",
        },
    ]);

    let outcome = index.add_documents(books);
    outcome.unwrap();
}
1209
/// Indexes documents whose `nested` field is absent, an object, an array or a
/// plain string, restricts the searchable/filterable attributes to `title`,
/// `nested.object` and `nested.machin`, then checks word searches and filters.
#[test]
fn index_documents_with_nested_fields() {
    let index = TempIndex::new();

    let nested_documents = documents!([
        {
            "id": 0,
            "title": "The zeroth document",
        },
        {
            "id": 1,
            "title": "The first document",
            "nested": {
                "object": "field",
                "machin": "bidule",
            },
        },
        {
            "id": 2,
            "title": "The second document",
            "nested": [
                "array",
                {
                    "object": "field",
                },
                {
                    "prout": "truc",
                    "machin": "lol",
                },
            ],
        },
        {
            "id": 3,
            "title": "The third document",
            "nested": "I lied",
        },
    ]);
    index.add_documents(nested_documents).unwrap();

    index
        .update_settings(|settings| {
            settings.set_searchable_fields(vec![
                S("title"),
                S("nested.object"),
                S("nested.machin"),
            ]);
            settings.set_filterable_fields(vec![
                FilterableAttributesRule::Field("title".to_string()),
                FilterableAttributesRule::Field("nested.object".to_string()),
                FilterableAttributesRule::Field("nested.machin".to_string()),
            ]);
        })
        .unwrap();

    let rtxn = index.read_txn().unwrap();

    // --- word queries -----------------------------------------------------
    let mut search = crate::Search::new(&rtxn, &index);
    search.query("document");
    search.terms_matching_strategy(TermsMatchingStrategy::default());
    let everything = search.execute().unwrap().documents_ids;
    assert_eq!(everything.len(), 4);

    // Words that live in a searchable attribute and the documents they hit.
    for (word, expected) in [
        ("zeroth", vec![0]),
        ("first", vec![1]),
        ("second", vec![2]),
        ("third", vec![3]),
        ("field", vec![1, 2]),
        ("lol", vec![2]),
    ] {
        search.query(word);
        let found = search.execute().unwrap().documents_ids;
        assert_eq!(found, expected);
    }

    // Words that only appear outside the searchable attributes match nothing.
    for word in ["object", "array", "lied"] {
        search.query(word);
        let found = search.execute().unwrap().documents_ids;
        assert!(found.is_empty());
    }

    // --- filters ----------------------------------------------------------
    let mut search = crate::Search::new(&rtxn, &index);

    for (expression, expected) in [
        (r#"title = "The first document""#, vec![1]),
        (r#"nested.object = field"#, vec![1, 2]),
        (r#"nested.machin = bidule"#, vec![1]),
    ] {
        search.filter(crate::Filter::from_str(expression).unwrap().unwrap());
        let found = search.execute().unwrap().documents_ids;
        assert_eq!(found, expected);
    }

    // `nested` itself is not filterable, so filtering on it is a user error.
    for expression in [r#"nested = array"#, r#"nested = "I lied""#] {
        search.filter(crate::Filter::from_str(expression).unwrap().unwrap());
        let error = search.execute().map(|_| unreachable!()).unwrap_err();
        assert!(matches!(error, crate::Error::UserError(crate::UserError::InvalidFilter(_))));
    }
}
1329
/// Uses `complex.nested.id` as primary key and indexes four documents that
/// each spell that path differently (fully nested, partially dotted, fully
/// dotted); every document must then be retrievable by a word of its title.
#[test]
fn index_documents_with_nested_primary_key() {
    let index = TempIndex::new();

    index
        .update_settings(|settings| {
            settings.set_primary_key(String::from("complex.nested.id"));
        })
        .unwrap();

    let payload = documents!([
        {
            "complex": {
                "nested": {
                    "id": 0,
                },
            },
            "title": "The zeroth document",
        },
        {
            "complex.nested": {
                "id": 1,
            },
            "title": "The first document",
        },
        {
            "complex": {
                "nested.id": 2,
            },
            "title": "The second document",
        },
        {
            "complex.nested.id": 3,
            "title": "The third document",
        },
    ]);
    index.add_documents(payload).unwrap();

    let rtxn = index.read_txn().unwrap();

    let mut search = crate::Search::new(&rtxn, &index);
    search.query("document");
    search.terms_matching_strategy(TermsMatchingStrategy::default());
    let everything = search.execute().unwrap().documents_ids;
    assert_eq!(everything.len(), 4);

    // Each ordinal word appears in exactly one title.
    for (word, docid) in [("zeroth", 0), ("first", 1), ("second", 2), ("third", 3)] {
        search.query(word);
        let found = search.execute().unwrap().documents_ids;
        assert_eq!(found, vec![docid]);
    }
}
1392
/// With `a.b` as primary key, a document whose `a.b` value is itself an object
/// must be rejected.
#[test]
fn retrieve_a_b_nested_document_id() {
    let index = TempIndex::new();

    index
        .update_settings(|settings| {
            settings.set_primary_key(String::from("a.b"));
        })
        .unwrap();

    let document = documents!({ "a" : { "b" : { "c" : 1 }}});
    let outcome = index.add_documents(document);
    outcome.unwrap_err();
}
1406
/// With `a.b.c` as primary key, a document nesting the value `1` under that
/// path is indexed and registered under the external id `"1"`.
#[test]
fn retrieve_a_b_c_nested_document_id() {
    let index = TempIndex::new();

    index
        .update_settings(|settings| {
            settings.set_primary_key(String::from("a.b.c"));
        })
        .unwrap();

    let document = documents!({ "a" : { "b" : { "c" : 1 }}});
    index.add_documents(document).unwrap();

    let rtxn = index.read_txn().unwrap();

    let stored = index.all_documents(&rtxn).unwrap().count();
    assert_eq!(stored, 1);

    let internal_id = index.external_documents_ids().get(&rtxn, "1").unwrap();
    assert!(internal_id.is_some());
}
1424
/// Checks the facet databases for four documents that spell the path
/// `dog.race.bernese mountain` in four different ways: first with `dog`
/// filterable, then with filterable fields reset, then with `dog.race`
/// sortable.
#[test]
fn test_facets_generation() {
    let index = TempIndex::new();

    let dogs = documents!([
        {
            "id": 0,
            "dog": {
                "race": {
                    "bernese mountain": "zeroth",
                },
            },
        },
        {
            "id": 1,
            "dog.race": {
                "bernese mountain": "first",
            },
        },
        {
            "id": 2,
            "dog.race.bernese mountain": "second",
        },
        {
            "id": 3,
            "dog": {
                "race.bernese mountain": "third"
            },
        },
    ]);
    index.add_documents(dogs).unwrap();

    // Step 1: make `dog` filterable.
    index
        .update_settings(|settings| {
            let filterable = vec![FilterableAttributesRule::Field("dog".to_string())];
            settings.set_filterable_fields(filterable);
        })
        .unwrap();

    // Snapshot bodies are kept flush-left exactly as they appear in the source.
    db_snap!(index, facet_id_string_docids, @r###"
3 0 first 1 [1, ]
3 0 second 1 [2, ]
3 0 third 1 [3, ]
3 0 zeroth 1 [0, ]
"###);
    db_snap!(index, field_id_docid_facet_strings, @r###"
3 0 zeroth zeroth
3 1 first first
3 2 second second
3 3 third third
"###);

    let rtxn = index.read_txn().unwrap();

    // Every spelling of the nested path must be reachable through the same filter.
    for (value, docid) in [("zeroth", 0), ("first", 1), ("second", 2), ("third", 3)] {
        let expression = format!(r#""dog.race.bernese mountain" = {}"#, value);
        let mut search = crate::Search::new(&rtxn, &index);
        search.filter(crate::Filter::from_str(&expression).unwrap().unwrap());
        let found = search.execute().unwrap().documents_ids;
        assert_eq!(found, vec![docid]);
    }

    // Step 2: with no filterable field left, the facet databases are empty.
    index
        .update_settings(|settings| {
            settings.reset_filterable_fields();
        })
        .unwrap();

    db_snap!(index, facet_id_string_docids, @"");
    db_snap!(index, field_id_docid_facet_strings, @"");

    // Step 3: making `dog.race` sortable fills the facet databases again.
    index
        .update_settings(|settings| {
            settings.set_sortable_fields(hashset!(S("dog.race")));
        })
        .unwrap();

    db_snap!(index, facet_id_string_docids, @r###"
3 0 first 1 [1, ]
3 0 second 1 [2, ]
3 0 third 1 [3, ]
3 0 zeroth 1 [0, ]
"###);
    db_snap!(index, field_id_docid_facet_strings, @r###"
3 0 zeroth zeroth
3 1 first first
3 2 second second
3 3 third third
"###);

    let rtxn = index.read_txn().unwrap();

    // Ascending sort on the nested field orders the documents by their value.
    let sort_field = crate::Member::Field(S("dog.race.bernese mountain"));
    let mut search = crate::Search::new(&rtxn, &index);
    search.sort_criteria(vec![crate::AscDesc::Asc(sort_field)]);
    let sorted = search.execute().unwrap().documents_ids;
    assert_eq!(sorted, vec![1, 2, 3, 0]);
}
1527
/// Indexes the same four documents twice with an empty batch in between; the
/// document count must be 4 after every step.
#[test]
fn index_2_times_documents_split_by_zero_document_indexation() {
    let index = TempIndex::new();

    // The batch that is sent before and after the empty one.
    let scoreboard = || {
        documents!([
            {"id": 0, "name": "Kerollmops", "score": 78},
            {"id": 1, "name": "ManyTheFish", "score": 75},
            {"id": 2, "name": "Ferdi", "score": 39},
            {"id": 3, "name": "Tommy", "score": 33}
        ])
    };

    index.add_documents(scoreboard()).unwrap();
    let rtxn_after_first = index.read_txn().unwrap();
    assert_eq!(index.number_of_documents(&rtxn_after_first).unwrap(), 4);

    // An empty batch must leave the index untouched.
    index.add_documents(documents!([])).unwrap();
    let rtxn_after_empty = index.read_txn().unwrap();
    assert_eq!(index.number_of_documents(&rtxn_after_empty).unwrap(), 4);

    // Same ids again: the count must not change.
    index.add_documents(scoreboard()).unwrap();
    let rtxn_after_second = index.read_txn().unwrap();
    assert_eq!(index.number_of_documents(&rtxn_after_second).unwrap(), 4);
}
1567
/// Indexes two titles containing Chinese text and checks both the stored
/// words and a search for a Chinese query. Only built with the `chinese`
/// feature.
#[cfg(feature = "chinese")]
#[test]
fn test_meilisearch_1714() {
    let index = TempIndex::new();

    let products = documents!([
        {"id": "123", "title": "小化妆包" },
        {"id": "456", "title": "Ipad 包" }
    ]);
    index.add_documents(products).unwrap();

    let rtxn = index.read_txn().unwrap();

    // Each of these word entries must exist and reference exactly one document.
    for word in ["huàzhuāngbāo", "bāo"] {
        let docids = index.word_docids.get(&rtxn, word).unwrap().unwrap();
        assert_eq!(docids.len(), 1);
    }

    let mut search = crate::Search::new(&rtxn, &index);
    search.query("化妆包");
    search.terms_matching_strategy(TermsMatchingStrategy::default());

    let found = search.execute().unwrap().documents_ids;
    assert_eq!(found.len(), 1);
}
1598
/// Indexes titles made of very long single words (256 and 512 bytes) and of
/// two 250-byte words; indexing is only expected not to fail.
#[test]
fn text_with_too_long_words() {
    let index = TempIndex::new();

    let word_of_256 = "a".repeat(256);
    let word_of_512 = "b".repeat(512);
    let two_words_of_250 = format!("{} {}", "c".repeat(250), "d".repeat(250));

    let payload = documents!([
        {"id": 1, "title": word_of_256 },
        {"id": 2, "title": word_of_512 },
        {"id": 3, "title": two_words_of_250 },
    ]);
    index.add_documents(payload).unwrap();
}
1613
/// Indexes 200 documents sharing one very long URL-like value, then one more
/// document with the same value; both indexing calls must succeed.
#[test]
fn text_with_too_long_keys() {
    let index = TempIndex::new();
    let script = "https://bug.example.com/meilisearch/milli.saml2?ROLE=Programmer-1337&SAMLRequest=Cy1ytcZT1Po%2L2IY2y9Unru8rgnW4qWfPiI0EpT7P8xjJV8PeQikRL%2E8D9A4pj9tmbymbQCQwGmGjPMK7qwXFPX4DH52JO2b7n6TXjuR7zkIFuYdzdY2rwRNBPgCL7ihclEm9zyIjKZQ%2JTqiwfXxWjnI0KEYQYHdwd6Q%2Fx%28BDLNsvmL54CCY2F4RWeRs4eqWfn%2EHqxlhreFzax4AiQ2tgOtV5thOaaWqrhZD%2Py70nuyZWNTKwciGI43AoHg6PThANsQ5rAY5amzN%2ufbs1swETUXlLZuOut5YGpYPZfY6STJWNp4QYSUOUXBZpdElYsH7UHZ7VhJycgyt%28aTK0GW6GbKne2tJM0hgSczOqndg6RFa9WsnSBi4zMcaEfYur4WlSsHDYInF9ROousKqVMZ6H8%2gbUissaLh1eXRGo8KEJbyEHbhVVKGD%28kx4cfKjx9fT3pkeDTdvDrVn25jIzi9wHyt9l1lWc8ICnCvXCVUPP%2BjBG4wILR29gMV9Ux2QOieQm2%2Fycybhr8sBGCl30mHC7blvWt%2T3mrCHQoS3VK49PZNPqBZO9C7vOjOWoszNkJx4QckWV%2FZFvbpzUUkiBiehr9F%2FvQSxz9lzv68GwbTu9fr638p%2FQM%3D&RelayState=https%3A%2F%example.bug.com%2Fde&SigAlg=http%3A%2F%2Fwww.w3.org%2F2000%2F09%2Fxmldsig%23rsa-sha1&Signature=AZFpkhFFII7PodiewTovaGnLQKUVZp0qOCCcBIUkJ6P5by3lE3Lldj9pKaFu4wz4j%2B015HEhDvF0LlAmwwES85vdGh%2FpD%2cIQPRUEjdCbQkQDd3dy1mMXbpXxSe4QYcv9Ni7tqNTQxekpO1gE7rtg6zC66EU55uM9aj9abGQ034Vly%2F6IJ08bvAq%2B%2FB9KruLstuiNWnlXTfNGsOxGLK7%2BXr94LTkat8m%2FMan6Qr95%2KeR5TmmqaQIE4N9H6o4TopT7mXr5CF2Z3";

    // First batch: ids 0..200, all carrying the same long value.
    let bulk = mmap_from_objects((0..200i32).filter_map(|id| {
        if let serde_json::Value::Object(fields) = serde_json::json!({ "id": id, "script": script })
        {
            Some(fields)
        } else {
            None
        }
    }));
    index.add_documents(bulk).unwrap();

    // Second batch: one extra document with the same value.
    let follow_up = documents!([
        {"id": 400, "script": script },
    ]);
    index.add_documents(follow_up).unwrap();
}
1639
/// Sets the primary key by hand, indexes two documents in two separate calls
/// and checks that every external id maps to a distinct internal id.
#[test]
fn index_documents_in_multiple_transforms() {
    let index = TempIndex::new();

    // Store the primary key directly, before any document is sent.
    let mut wtxn = index.write_txn().unwrap();
    index.put_primary_key(&mut wtxn, "id").unwrap();
    wtxn.commit().unwrap();

    let first_batch = documents! {[{
        "id": 228142,
        "title": "asdsad",
        "state": "automated",
        "priority": "normal",
        "public_uid": "37ccf021",
        "project_id": 78207,
        "branch_id_number": 0
    }]};
    index.add_documents(first_batch).unwrap();

    let second_batch = documents! {[{
        "id": 228143,
        "title": "something",
        "state": "automated",
        "priority": "normal",
        "public_uid": "39c6499b",
        "project_id": 78207,
        "branch_id_number": 0
    }]};
    index.add_documents(second_batch).unwrap();

    let rtxn = index.read_txn().unwrap();

    let external_to_internal = index.external_documents_ids().to_hash_map(&rtxn).unwrap();
    let distinct_internal_ids: HashSet<_> = external_to_internal.values().collect();

    // No two external ids may share an internal id.
    assert_eq!(distinct_internal_ids.len(), external_to_internal.len());
}
1680
/// Checks `facet_id_exists_docids` for `colour` and its sub-fields, both when
/// the filterable setting is applied after and before the documents.
#[test]
fn index_documents_check_exists_database() {
    let content = || {
        documents!([
            {
                "id": 0,
                "colour": 0,
            },
            {
                "id": 1,
                "colour": []
            },
            {
                "id": 2,
                "colour": {}
            },
            {
                "id": 3,
                "colour": null
            },
            {
                "id": 4,
                "colour": [1]
            },
            {
                "id": 5
            },
            {
                "id": 6,
                "colour": {
                    "green": 1
                }
            },
            {
                "id": 7,
                "colour": {
                    "green": {
                        "blue": []
                    }
                }
            }
        ])
    };

    // For each field, the documents recorded in the "exists" database.
    let check_ok = |index: &Index| {
        let rtxn = index.read_txn().unwrap();
        let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();

        for (field, expected) in [
            ("colour", vec![0, 1, 2, 3, 4, 6, 7]),
            ("colour.green", vec![6, 7]),
            ("colour.green.blue", vec![7]),
        ] {
            let field_id = fields_ids_map.id(field).unwrap();
            let docids = index.facet_id_exists_docids.get(&rtxn, &field_id).unwrap().unwrap();
            assert_eq!(docids.into_iter().collect::<Vec<_>>(), expected);
        }
    };

    let faceted_fields = vec![FilterableAttributesRule::Field("colour".to_string())];

    // Scenario 1: documents first, settings second.
    let documents_first = TempIndex::new();
    documents_first.add_documents(content()).unwrap();
    documents_first
        .update_settings(|settings| {
            settings.set_filterable_fields(faceted_fields.clone());
        })
        .unwrap();
    check_ok(&documents_first);

    // Scenario 2: settings first, documents second.
    let settings_first = TempIndex::new();
    settings_first
        .update_settings(|settings| {
            settings.set_filterable_fields(faceted_fields.clone());
        })
        .unwrap();
    settings_first.add_documents(content()).unwrap();
    check_ok(&settings_first);
}
1766
/// Checks `facet_id_is_null_docids` for `colour` and its sub-fields, both when
/// the filterable setting is applied after and before the documents.
#[test]
fn index_documents_check_is_null_database() {
    let content = || {
        documents!([
            {
                "id": 0,
                "colour": null,
            },
            {
                "id": 1,
                "colour": [null],
            },
            {
                "id": 6,
                "colour": {
                    "green": null
                }
            },
            {
                "id": 7,
                "colour": {
                    "green": {
                        "blue": null
                    }
                }
            },
            {
                "id": 8,
                "colour": 0,
            },
            {
                "id": 9,
                "colour": []
            },
            {
                "id": 10,
                "colour": {}
            },
            {
                "id": 12,
                "colour": [1]
            },
            {
                "id": 13
            },
            {
                "id": 14,
                "colour": {
                    "green": 1
                }
            },
            {
                "id": 15,
                "colour": {
                    "green": {
                        "blue": []
                    }
                }
            }
        ])
    };

    // For each field, the documents recorded in the "is null" database.
    let check_ok = |index: &Index| {
        let rtxn = index.read_txn().unwrap();
        let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();

        for (field, expected) in [
            ("colour", vec![0]),
            ("colour.green", vec![2]),
            ("colour.green.blue", vec![3]),
        ] {
            let field_id = fields_ids_map.id(field).unwrap();
            let docids = index.facet_id_is_null_docids.get(&rtxn, &field_id).unwrap().unwrap();
            assert_eq!(docids.into_iter().collect::<Vec<_>>(), expected);
        }
    };

    let faceted_fields = vec![FilterableAttributesRule::Field("colour".to_string())];

    // Scenario 1: documents first, settings second.
    let documents_first = TempIndex::new();
    documents_first.add_documents(content()).unwrap();
    documents_first
        .update_settings(|settings| {
            settings.set_filterable_fields(faceted_fields.clone());
        })
        .unwrap();
    check_ok(&documents_first);

    // Scenario 2: settings first, documents second.
    let settings_first = TempIndex::new();
    settings_first
        .update_settings(|settings| {
            settings.set_filterable_fields(faceted_fields.clone());
        })
        .unwrap();
    settings_first.add_documents(content()).unwrap();
    check_ok(&settings_first);
}
1870
/// Checks `facet_id_is_empty_docids` for `tags` and its sub-fields, both when
/// the filterable setting is applied after and before the documents.
#[test]
fn index_documents_check_is_empty_database() {
    let content = || {
        documents!([
            {"id": 0, "tags": null },
            {"id": 1, "tags": [null] },
            {"id": 2, "tags": [] },
            {"id": 3, "tags": ["hello","world"] },
            {"id": 4, "tags": [""] },
            {"id": 5 },
            {"id": 6, "tags": {} },
            {"id": 7, "tags": {"green": "cool"} },
            {"id": 8, "tags": {"green": ""} },
            {"id": 9, "tags": "" },
            {"id": 10, "tags": { "green": null } },
            {"id": 11, "tags": { "green": { "blue": null } } },
            {"id": 12, "tags": { "green": { "blue": [] } } }
        ])
    };

    // For each field, the documents recorded in the "is empty" database.
    let check_ok = |index: &Index| {
        let rtxn = index.read_txn().unwrap();
        let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();

        for (field, expected) in [
            ("tags", vec![2, 6, 9]),
            ("tags.green", vec![8]),
            ("tags.green.blue", vec![12]),
        ] {
            let field_id = fields_ids_map.id(field).unwrap();
            let docids = index.facet_id_is_empty_docids.get(&rtxn, &field_id).unwrap().unwrap();
            assert_eq!(docids.into_iter().collect::<Vec<_>>(), expected);
        }
    };

    let faceted_fields = vec![FilterableAttributesRule::Field("tags".to_string())];

    // Scenario 1: documents first, settings second.
    let documents_first = TempIndex::new();
    documents_first.add_documents(content()).unwrap();
    documents_first
        .update_settings(|settings| {
            settings.set_filterable_fields(faceted_fields.clone());
        })
        .unwrap();
    check_ok(&documents_first);

    // Scenario 2: settings first, documents second.
    let settings_first = TempIndex::new();
    settings_first
        .update_settings(|settings| {
            settings.set_filterable_fields(faceted_fields.clone());
        })
        .unwrap();
    settings_first.add_documents(content()).unwrap();
    check_ok(&settings_first);
}
1931
/// Feeds four replace payloads whose ids are `-228142`, `228143.56`,
/// `-228143.56` and `2.0` to a `DocumentOperation` and checks that exactly one
/// operation is reported without error and three with an error.
#[test]
fn primary_key_must_not_contain_floats() {
    let index = TempIndex::new_with_map_size(4096 * 100);

    let negative_integer = documents! {[{
        "id": -228142,
        "title": "asdsad",
    }]};

    let positive_float = documents! {[{
        "id": 228143.56,
        "title": "something",
    }]};

    let negative_float = documents! {[{
        "id": -228143.56,
        "title": "something",
    }]};

    let round_float = documents! {[{
        "id": 2.0,
        "title": "something",
    }]};

    let rtxn = index.inner.read_txn().unwrap();
    let db_fields_ids_map = index.inner.fields_ids_map(&rtxn).unwrap();
    let mut new_fields_ids_map = db_fields_ids_map.clone();

    let mut indexer = indexer::DocumentOperation::new();
    for payload in [&negative_integer, &positive_float, &negative_float, &round_float] {
        indexer.replace_documents(payload).unwrap();
    }

    let indexer_alloc = Bump::new();
    let (_changes, operation_stats, _primary_key) = indexer
        .into_changes(
            &indexer_alloc,
            &index.inner,
            &rtxn,
            None,
            &mut new_fields_ids_map,
            &|| false,
            Progress::default(),
        )
        .unwrap();

    let accepted = operation_stats.iter().filter(|stats| stats.error.is_none()).count();
    let rejected = operation_stats.iter().filter(|stats| stats.error.is_some()).count();
    assert_eq!(accepted, 1);
    assert_eq!(rejected, 3);
}
1982
/// Queues one replacement followed by three partial updates of document `1`
/// in a single `DocumentOperation`, indexes them, and checks the stored
/// document carries the last `title` and the last `description`.
#[test]
fn mixing_documents_replace_with_updates() {
    let index = TempIndex::new_with_map_size(4096 * 100);

    let initial = documents! {[{
        "id": 1,
        "title": "asdsad",
        "description": "Wat wat wat, wat"
    }]};

    let first_title = documents! {[{
        "id": 1,
        "title": "something",
    }]};

    let second_title = documents! {[{
        "id": 1,
        "title": "another something",
    }]};

    let new_description = documents! {[{
        "id": 1,
        "description": "This is it!",
    }]};

    let rtxn = index.inner.read_txn().unwrap();
    let db_fields_ids_map = index.inner.fields_ids_map(&rtxn).unwrap();
    let mut new_fields_ids_map = db_fields_ids_map.clone();

    // One replace, then three updates, in this exact order.
    let mut indexer = indexer::DocumentOperation::new();
    indexer.replace_documents(&initial).unwrap();
    for partial in [&first_title, &second_title, &new_description] {
        indexer.update_documents(partial).unwrap();
    }

    let indexer_alloc = Bump::new();
    let (document_changes, operation_stats, primary_key) = indexer
        .into_changes(
            &indexer_alloc,
            &index.inner,
            &rtxn,
            None,
            &mut new_fields_ids_map,
            &|| false,
            Progress::default(),
        )
        .unwrap();

    let successes = operation_stats.iter().filter(|stats| stats.error.is_none()).count();
    assert_eq!(successes, 4);

    let mut wtxn = index.write_txn().unwrap();
    indexer::index(
        &mut wtxn,
        &index.inner,
        &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
        index.indexer_config.grenad_parameters(),
        &db_fields_ids_map,
        new_fields_ids_map,
        primary_key,
        &document_changes,
        EmbeddingConfigs::default(),
        &|| false,
        &Progress::default(),
    )
    .unwrap();
    wtxn.commit().unwrap();

    // Read back the document stored under internal id 0.
    let rtxn = index.read_txn().unwrap();
    let stored_obkv = index.document(&rtxn, 0).unwrap();
    let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
    let stored_json = all_obkv_to_json(stored_obkv, &fields_ids_map).unwrap();

    let expected = serde_json::json!({
        "id": 1,
        "title": "another something",
        "description": "This is it!",
    });
    assert_eq!(Some(&stored_json), expected.as_object());
}
2063
/// Queues replace, update, update, replace, update operations on document `1`
/// in a single `DocumentOperation`, indexes them, and checks the stored
/// document carries the second replacement's `title` and the final
/// `description`.
#[test]
fn mixing_documents_replace_with_updates_even_more() {
    let index = TempIndex::new_with_map_size(4096 * 100);

    let initial = documents! {[{
        "id": 1,
        "title": "asdsad",
        "description": "Wat wat wat, wat"
    }]};

    let first_title = documents! {[{
        "id": 1,
        "title": "something",
    }]};

    let second_title = documents! {[{
        "id": 1,
        "title": "another something",
    }]};

    let replacement = documents! {[{
        "id": 1,
        "title": "Woooof",
    }]};

    let new_description = documents! {[{
        "id": 1,
        "description": "This is it!",
    }]};

    let rtxn = index.inner.read_txn().unwrap();
    let db_fields_ids_map = index.inner.fields_ids_map(&rtxn).unwrap();
    let mut new_fields_ids_map = db_fields_ids_map.clone();

    // The order of the operations is what this test is about.
    let mut indexer = indexer::DocumentOperation::new();
    indexer.replace_documents(&initial).unwrap();
    indexer.update_documents(&first_title).unwrap();
    indexer.update_documents(&second_title).unwrap();
    indexer.replace_documents(&replacement).unwrap();
    indexer.update_documents(&new_description).unwrap();

    let indexer_alloc = Bump::new();
    let (document_changes, operation_stats, primary_key) = indexer
        .into_changes(
            &indexer_alloc,
            &index.inner,
            &rtxn,
            None,
            &mut new_fields_ids_map,
            &|| false,
            Progress::default(),
        )
        .unwrap();

    let successes = operation_stats.iter().filter(|stats| stats.error.is_none()).count();
    assert_eq!(successes, 5);

    let mut wtxn = index.write_txn().unwrap();
    indexer::index(
        &mut wtxn,
        &index.inner,
        &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
        index.indexer_config.grenad_parameters(),
        &db_fields_ids_map,
        new_fields_ids_map,
        primary_key,
        &document_changes,
        EmbeddingConfigs::default(),
        &|| false,
        &Progress::default(),
    )
    .unwrap();
    wtxn.commit().unwrap();

    // Read back the document stored under internal id 0.
    let rtxn = index.read_txn().unwrap();
    let stored_obkv = index.document(&rtxn, 0).unwrap();
    let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
    let stored_json = all_obkv_to_json(stored_obkv, &fields_ids_map).unwrap();

    let expected = serde_json::json!({
        "id": 1,
        "title": "Woooof",
        "description": "This is it!",
    });
    assert_eq!(Some(&stored_json), expected.as_object());
}
2150
/// Document ids starting with a space, tab, carriage return or line feed must
/// all be rejected.
#[test]
fn primary_key_must_not_contain_whitespace() {
    let index = TempIndex::new();

    // (id, title) of each document that must be refused.
    let rejected = [
        (" 1", "asdsad"),
        ("\t2", "something"),
        ("\r3", "something"),
        ("\n4", "something"),
    ];

    for (id, title) in rejected {
        let document = documents! {[{
            "id": id,
            "title": title,
        }]};
        index.add_documents(document).unwrap_err();
    }
}
2180
/// Covers the three outcomes of primary-key inference: no candidate, several
/// candidates, and exactly one candidate (`id`).
#[test]
fn primary_key_inference() {
    let index = TempIndex::new();

    // No candidate field: indexing fails with `NoPrimaryKeyCandidateFound`.
    let doc_no_id = documents! {[{
        "title": "asdsad",
        "state": "automated",
        "priority": "normal",
        "branch_id_number": 0
    }]};
    let outcome = index.add_documents(doc_no_id);
    assert!(matches!(outcome, Err(Error::UserError(UserError::NoPrimaryKeyCandidateFound))));

    // Three candidate fields: indexing fails and reports them.
    let doc_multiple_ids = documents! {[{
        "id": 228143,
        "title": "something",
        "state": "automated",
        "priority": "normal",
        "public_uid": "39c6499b",
        "project_id": 78207,
        "branch_id_number": 0
    }]};
    let candidates = match index.add_documents(doc_multiple_ids) {
        Err(Error::UserError(UserError::MultiplePrimaryKeyCandidatesFound { candidates })) => {
            candidates
        }
        _ => panic!("Expected Error::UserError(MultiplePrimaryKeyCandidatesFound)"),
    };
    assert_eq!(candidates, vec![S("id"), S("project_id"), S("public_uid"),]);

    // Only `id` is left as a candidate: it becomes the primary key.
    let doc_inferable = documents! {[{
        "video": "test.mp4",
        "id": 228143,
        "title": "something",
        "state": "automated",
        "priority": "normal",
        "public_uid_": "39c6499b",
        "project_id_": 78207,
        "branch_id_number": 0
    }]};
    index.add_documents(doc_inferable).unwrap();

    let txn = index.read_txn().unwrap();
    let inferred = index.primary_key(&txn).unwrap();
    assert_eq!(inferred, Some("id"));
}
2231
2232 #[test]
2233 fn long_words_must_be_skipped() {
2234 let index = TempIndex::new();
2235
2236 let long_word = "lol".repeat(1000);
2238 let doc1 = documents! {[{
2239 "id": "1",
2240 "title": long_word,
2241 }]};
2242
2243 index.add_documents(doc1).unwrap();
2244
2245 let rtxn = index.read_txn().unwrap();
2246 let words_fst = index.words_fst(&rtxn).unwrap();
2247 assert!(!words_fst.contains(&long_word));
2248 }
2249
2250 #[test]
2251 fn long_facet_values_must_not_crash() {
2252 let index = TempIndex::new();
2253
2254 let long_word = "lol".repeat(1000);
2256 let doc1 = documents! {[{
2257 "id": "1",
2258 "title": long_word,
2259 }]};
2260
2261 index
2262 .update_settings(|settings| {
2263 settings.set_filterable_fields(vec![FilterableAttributesRule::Field(
2264 "title".to_string(),
2265 )]);
2266 })
2267 .unwrap();
2268
2269 index.add_documents(doc1).unwrap();
2270 }
2271
2272 #[test]
2273 fn add_and_delete_documents_in_single_transform() {
2274 let mut index = TempIndex::new();
2275 index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments;
2276
2277 let mut wtxn = index.write_txn().unwrap();
2278 let indexer_config = &index.indexer_config;
2279 let rtxn = index.inner.read_txn().unwrap();
2280 let db_fields_ids_map = index.inner.fields_ids_map(&rtxn).unwrap();
2281 let mut new_fields_ids_map = db_fields_ids_map.clone();
2282
2283 let documents = documents!([
2284 { "id": 1, "doggo": "kevin" },
2285 { "id": 2, "doggo": { "name": "bob", "age": 20 } },
2286 { "id": 3, "name": "jean", "age": 25 },
2287 ]);
2288
2289 let indexer_alloc = Bump::new();
2290 let embedders = EmbeddingConfigs::default();
2291 let mut indexer = indexer::DocumentOperation::new();
2292 indexer.replace_documents(&documents).unwrap();
2293 indexer.delete_documents(&["2"]);
2294 let (document_changes, _operation_stats, primary_key) = indexer
2295 .into_changes(
2296 &indexer_alloc,
2297 &index.inner,
2298 &rtxn,
2299 None,
2300 &mut new_fields_ids_map,
2301 &|| false,
2302 Progress::default(),
2303 )
2304 .unwrap();
2305
2306 indexer::index(
2307 &mut wtxn,
2308 &index.inner,
2309 &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
2310 indexer_config.grenad_parameters(),
2311 &db_fields_ids_map,
2312 new_fields_ids_map,
2313 primary_key,
2314 &document_changes,
2315 embedders,
2316 &|| false,
2317 &Progress::default(),
2318 )
2319 .unwrap();
2320 wtxn.commit().unwrap();
2321
2322 db_snap!(index, documents, @r###"
2323 {"id":1,"doggo":"kevin"}
2324 {"id":3,"name":"jean","age":25}
2325 "###);
2326 }
2327
2328 #[test]
2329 fn add_update_and_delete_documents_in_single_transform() {
2330 let mut index = TempIndex::new();
2331 index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments;
2332
2333 let mut wtxn = index.write_txn().unwrap();
2334 let indexer_config = &index.indexer_config;
2335 let rtxn = index.inner.read_txn().unwrap();
2336 let db_fields_ids_map = index.inner.fields_ids_map(&rtxn).unwrap();
2337 let mut new_fields_ids_map = db_fields_ids_map.clone();
2338
2339 let documents = documents!([
2340 { "id": 1, "doggo": "kevin" },
2341 { "id": 2, "doggo": { "name": "bob", "age": 20 } },
2342 { "id": 3, "name": "jean", "age": 25 },
2343 ]);
2344 let mut indexer = indexer::DocumentOperation::new();
2345 indexer.update_documents(&documents).unwrap();
2346
2347 let documents = documents!([
2348 { "id": 2, "catto": "jorts" },
2349 { "id": 3, "legs": 4 },
2350 ]);
2351 indexer.update_documents(&documents).unwrap();
2352 indexer.delete_documents(&["1", "2"]);
2353
2354 let indexer_alloc = Bump::new();
2355 let embedders = EmbeddingConfigs::default();
2356 let (document_changes, _operation_stats, primary_key) = indexer
2357 .into_changes(
2358 &indexer_alloc,
2359 &index.inner,
2360 &rtxn,
2361 None,
2362 &mut new_fields_ids_map,
2363 &|| false,
2364 Progress::default(),
2365 )
2366 .unwrap();
2367
2368 indexer::index(
2369 &mut wtxn,
2370 &index.inner,
2371 &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
2372 indexer_config.grenad_parameters(),
2373 &db_fields_ids_map,
2374 new_fields_ids_map,
2375 primary_key,
2376 &document_changes,
2377 embedders,
2378 &|| false,
2379 &Progress::default(),
2380 )
2381 .unwrap();
2382 wtxn.commit().unwrap();
2383
2384 db_snap!(index, documents, @r###"
2385 {"id":3,"name":"jean","age":25,"legs":4}
2386 "###);
2387 }
2388
2389 #[test]
2390 fn add_document_and_in_another_transform_update_and_delete_documents() {
2391 let index = TempIndex::new();
2392
2393 let mut wtxn = index.write_txn().unwrap();
2394 let indexer_config = &index.indexer_config;
2395 let rtxn = index.inner.read_txn().unwrap();
2396 let db_fields_ids_map = index.inner.fields_ids_map(&rtxn).unwrap();
2397 let mut new_fields_ids_map = db_fields_ids_map.clone();
2398
2399 let documents = documents!([
2400 { "id": 1, "doggo": "kevin" },
2401 { "id": 2, "doggo": { "name": "bob", "age": 20 } },
2402 { "id": 3, "name": "jean", "age": 25 },
2403 ]);
2404 let indexer_alloc = Bump::new();
2405 let embedders = EmbeddingConfigs::default();
2406 let mut indexer = indexer::DocumentOperation::new();
2407 indexer.update_documents(&documents).unwrap();
2408
2409 let (document_changes, _operation_stats, primary_key) = indexer
2410 .into_changes(
2411 &indexer_alloc,
2412 &index.inner,
2413 &rtxn,
2414 None,
2415 &mut new_fields_ids_map,
2416 &|| false,
2417 Progress::default(),
2418 )
2419 .unwrap();
2420
2421 indexer::index(
2422 &mut wtxn,
2423 &index.inner,
2424 &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
2425 indexer_config.grenad_parameters(),
2426 &db_fields_ids_map,
2427 new_fields_ids_map,
2428 primary_key,
2429 &document_changes,
2430 embedders,
2431 &|| false,
2432 &Progress::default(),
2433 )
2434 .unwrap();
2435 wtxn.commit().unwrap();
2436
2437 db_snap!(index, documents, @r###"
2438 {"id":1,"doggo":"kevin"}
2439 {"id":2,"doggo":{"name":"bob","age":20}}
2440 {"id":3,"name":"jean","age":25}
2441 "###);
2442
2443 let mut wtxn = index.write_txn().unwrap();
2446 let indexer_config = &index.indexer_config;
2447 let rtxn = index.inner.read_txn().unwrap();
2448 let db_fields_ids_map = index.inner.fields_ids_map(&rtxn).unwrap();
2449 let mut new_fields_ids_map = db_fields_ids_map.clone();
2450
2451 let documents = documents!([
2452 { "id": 2, "catto": "jorts" },
2453 { "id": 3, "legs": 4 },
2454 ]);
2455 let indexer_alloc = Bump::new();
2456 let embedders = EmbeddingConfigs::default();
2457 let mut indexer = indexer::DocumentOperation::new();
2458 indexer.update_documents(&documents).unwrap();
2459 indexer.delete_documents(&["1", "2"]);
2460
2461 let (document_changes, _operation_stats, primary_key) = indexer
2462 .into_changes(
2463 &indexer_alloc,
2464 &index.inner,
2465 &rtxn,
2466 None,
2467 &mut new_fields_ids_map,
2468 &|| false,
2469 Progress::default(),
2470 )
2471 .unwrap();
2472
2473 indexer::index(
2474 &mut wtxn,
2475 &index.inner,
2476 &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
2477 indexer_config.grenad_parameters(),
2478 &db_fields_ids_map,
2479 new_fields_ids_map,
2480 primary_key,
2481 &document_changes,
2482 embedders,
2483 &|| false,
2484 &Progress::default(),
2485 )
2486 .unwrap();
2487 wtxn.commit().unwrap();
2488
2489 db_snap!(index, documents, @r###"
2490 {"id":3,"name":"jean","age":25,"legs":4}
2491 "###);
2492 }
2493
2494 #[test]
2495 fn delete_document_and_then_add_documents_in_the_same_transform() {
2496 let index = TempIndex::new();
2497
2498 let mut wtxn = index.write_txn().unwrap();
2499 let indexer_config = &index.indexer_config;
2500 let rtxn = index.inner.read_txn().unwrap();
2501 let db_fields_ids_map = index.inner.fields_ids_map(&rtxn).unwrap();
2502 let mut new_fields_ids_map = db_fields_ids_map.clone();
2503
2504 let indexer_alloc = Bump::new();
2505 let embedders = EmbeddingConfigs::default();
2506 let mut indexer = indexer::DocumentOperation::new();
2507 indexer.delete_documents(&["1", "2"]);
2508
2509 let documents = documents!([
2510 { "id": 2, "doggo": { "name": "jean", "age": 20 } },
2511 { "id": 3, "name": "bob", "age": 25 },
2512 ]);
2513 indexer.update_documents(&documents).unwrap();
2514
2515 let (document_changes, _operation_stats, primary_key) = indexer
2516 .into_changes(
2517 &indexer_alloc,
2518 &index.inner,
2519 &rtxn,
2520 None,
2521 &mut new_fields_ids_map,
2522 &|| false,
2523 Progress::default(),
2524 )
2525 .unwrap();
2526
2527 indexer::index(
2528 &mut wtxn,
2529 &index.inner,
2530 &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
2531 indexer_config.grenad_parameters(),
2532 &db_fields_ids_map,
2533 new_fields_ids_map,
2534 primary_key,
2535 &document_changes,
2536 embedders,
2537 &|| false,
2538 &Progress::default(),
2539 )
2540 .unwrap();
2541 wtxn.commit().unwrap();
2542
2543 db_snap!(index, documents, @r###"
2544 {"id":2,"doggo":{"name":"jean","age":20}}
2545 {"id":3,"name":"bob","age":25}
2546 "###);
2547 }
2548
2549 #[test]
2550 fn delete_the_same_document_multiple_time() {
2551 let index = TempIndex::new();
2552
2553 let mut wtxn = index.write_txn().unwrap();
2554 let indexer_config = &index.indexer_config;
2555 let rtxn = index.inner.read_txn().unwrap();
2556 let db_fields_ids_map = index.inner.fields_ids_map(&rtxn).unwrap();
2557 let mut new_fields_ids_map = db_fields_ids_map.clone();
2558
2559 let indexer_alloc = Bump::new();
2560 let embedders = EmbeddingConfigs::default();
2561 let mut indexer = indexer::DocumentOperation::new();
2562
2563 indexer.delete_documents(&["1", "2", "1", "2"]);
2564
2565 let documents = documents!([
2566 { "id": 1, "doggo": "kevin" },
2567 { "id": 2, "doggo": { "name": "jean", "age": 20 } },
2568 { "id": 3, "name": "bob", "age": 25 },
2569 ]);
2570 indexer.update_documents(&documents).unwrap();
2571
2572 indexer.delete_documents(&["1", "2", "1", "2"]);
2573
2574 let (document_changes, _operation_stats, primary_key) = indexer
2575 .into_changes(
2576 &indexer_alloc,
2577 &index.inner,
2578 &rtxn,
2579 None,
2580 &mut new_fields_ids_map,
2581 &|| false,
2582 Progress::default(),
2583 )
2584 .unwrap();
2585
2586 indexer::index(
2587 &mut wtxn,
2588 &index.inner,
2589 &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
2590 indexer_config.grenad_parameters(),
2591 &db_fields_ids_map,
2592 new_fields_ids_map,
2593 primary_key,
2594 &document_changes,
2595 embedders,
2596 &|| false,
2597 &Progress::default(),
2598 )
2599 .unwrap();
2600 wtxn.commit().unwrap();
2601
2602 db_snap!(index, documents, @r###"
2603 {"id":3,"name":"bob","age":25}
2604 "###);
2605 }
2606
2607 #[test]
2608 fn add_document_and_in_another_transform_delete_the_document_then_add_it_again() {
2609 let index = TempIndex::new();
2610
2611 let mut wtxn = index.write_txn().unwrap();
2612 let indexer_config = &index.indexer_config;
2613 let rtxn = index.inner.read_txn().unwrap();
2614 let db_fields_ids_map = index.inner.fields_ids_map(&rtxn).unwrap();
2615 let mut new_fields_ids_map = db_fields_ids_map.clone();
2616
2617 let indexer_alloc = Bump::new();
2618 let embedders = EmbeddingConfigs::default();
2619 let mut indexer = indexer::DocumentOperation::new();
2620
2621 let documents = documents!([
2622 { "id": 1, "doggo": "kevin" },
2623 ]);
2624 indexer.update_documents(&documents).unwrap();
2625
2626 let (document_changes, _operation_stats, primary_key) = indexer
2627 .into_changes(
2628 &indexer_alloc,
2629 &index.inner,
2630 &rtxn,
2631 None,
2632 &mut new_fields_ids_map,
2633 &|| false,
2634 Progress::default(),
2635 )
2636 .unwrap();
2637
2638 indexer::index(
2639 &mut wtxn,
2640 &index.inner,
2641 &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
2642 indexer_config.grenad_parameters(),
2643 &db_fields_ids_map,
2644 new_fields_ids_map,
2645 primary_key,
2646 &document_changes,
2647 embedders,
2648 &|| false,
2649 &Progress::default(),
2650 )
2651 .unwrap();
2652 wtxn.commit().unwrap();
2653
2654 db_snap!(index, documents, @r###"
2655 {"id":1,"doggo":"kevin"}
2656 "###);
2657
2658 let mut wtxn = index.write_txn().unwrap();
2661 let indexer_config = &index.indexer_config;
2662 let rtxn = index.inner.read_txn().unwrap();
2663 let db_fields_ids_map = index.inner.fields_ids_map(&rtxn).unwrap();
2664 let mut new_fields_ids_map = db_fields_ids_map.clone();
2665
2666 let indexer_alloc = Bump::new();
2667 let embedders = EmbeddingConfigs::default();
2668 let mut indexer = indexer::DocumentOperation::new();
2669
2670 indexer.delete_documents(&["1"]);
2671
2672 let documents = documents!([
2673 { "id": 1, "catto": "jorts" },
2674 ]);
2675
2676 indexer.replace_documents(&documents).unwrap();
2677
2678 let (document_changes, _operation_stats, primary_key) = indexer
2679 .into_changes(
2680 &indexer_alloc,
2681 &index.inner,
2682 &rtxn,
2683 None,
2684 &mut new_fields_ids_map,
2685 &|| false,
2686 Progress::default(),
2687 )
2688 .unwrap();
2689
2690 indexer::index(
2691 &mut wtxn,
2692 &index.inner,
2693 &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
2694 indexer_config.grenad_parameters(),
2695 &db_fields_ids_map,
2696 new_fields_ids_map,
2697 primary_key,
2698 &document_changes,
2699 embedders,
2700 &|| false,
2701 &Progress::default(),
2702 )
2703 .unwrap();
2704 wtxn.commit().unwrap();
2705
2706 db_snap!(index, documents, @r###"
2707 {"id":1,"catto":"jorts"}
2708 "###);
2709 }
2710
    /// Snapshots the `word_fid_docids` and `word_position_docids` databases at three
    /// points: after a first addition, after a second addition, and after a deletion.
    #[test]
    fn test_word_fid_position() {
        let index = TempIndex::new();

        // First batch: three short sentences plus document 3, whose `text` field is a
        // long run of the single word "a" spread over several lines.
        index
            .add_documents(documents!([
                {"id": 0, "text": "sun flowers are looking at the sun" },
                {"id": 1, "text": "sun flowers are looking at the sun" },
                {"id": 2, "text": "the sun is shining today" },
                {
                    "id": 3,
                    "text": "a a a a a a a a a a a a a a a a a
                    a a a a a a a a a a a a a a a a a a a a a a a a a a
                    a a a a a a a a a a a a a a a a a a a a a a a a a a
                    a a a a a a a a a a a a a a a a a a a a a a a a a a
                    a a a a a a a a a a a a a a a a a a a a a a a a a a
                    a a a a a a a a a a a a a a a a a a a a a a a a a a
                    a a a a a a a a a a a a a a a a a a a a a "
                }
            ]))
            .unwrap();

        // Snapshot 1: state of both databases after the first batch.
        db_snap!(index, word_fid_docids, 1, @"bf3355e493330de036c8823ddd1dbbd9");
        db_snap!(index, word_position_docids, 1, @"896d54b29ed79c4c6f14084f326dcf6f");

        // Second batch: documents 5 and 7 use a second field, `text2`.
        index
            .add_documents(documents!([
                {"id": 4, "text": "sun flowers are looking at the sun" },
                {"id": 5, "text2": "sun flowers are looking at the sun" },
                {"id": 6, "text": "b b b" },
                {
                    "id": 7,
                    "text2": "a a a a"
                }
            ]))
            .unwrap();

        // Snapshot 2: state of both databases after the second batch.
        db_snap!(index, word_fid_docids, 2, @"a48d3f88db33f94bc23110a673ea49e4");
        db_snap!(index, word_position_docids, 2, @"3c9e66c6768ae2cf42b46b2c46e46a83");

        // Remove documents 0 and 3 (the latter being the long run of "a").
        index.delete_documents(vec!["0".into(), "3".into()]);

        // Snapshot 3: state of both databases after the deletion.
        db_snap!(index, word_fid_docids, 3, @"4c2e2a1832e5802796edc1638136d933");
        db_snap!(index, word_position_docids, 3, @"74f556b91d161d997a89468b4da1cb8f");
    }
2757
2758 #[test]
2761 fn test_multiple_vectors() {
2762 use crate::vector::settings::EmbeddingSettings;
2763 let index = TempIndex::new();
2764
2765 index
2766 .update_settings(|settings| {
2767 let mut embedders = BTreeMap::default();
2768 embedders.insert(
2769 "manual".to_string(),
2770 Setting::Set(EmbeddingSettings {
2771 source: Setting::Set(crate::vector::settings::EmbedderSource::UserProvided),
2772 model: Setting::NotSet,
2773 revision: Setting::NotSet,
2774 pooling: Setting::NotSet,
2775 api_key: Setting::NotSet,
2776 dimensions: Setting::Set(3),
2777 document_template: Setting::NotSet,
2778 document_template_max_bytes: Setting::NotSet,
2779 url: Setting::NotSet,
2780 request: Setting::NotSet,
2781 response: Setting::NotSet,
2782 distribution: Setting::NotSet,
2783 headers: Setting::NotSet,
2784 search_embedder: Setting::NotSet,
2785 indexing_embedder: Setting::NotSet,
2786 binary_quantized: Setting::NotSet,
2787 }),
2788 );
2789 settings.set_embedder_settings(embedders);
2790 })
2791 .unwrap();
2792
2793 index
2794 .add_documents(
2795 documents!([{"id": 0, "_vectors": { "manual": [[0, 1, 2], [3, 4, 5]] } }]),
2796 )
2797 .unwrap();
2798 index.add_documents(documents!([{"id": 1, "_vectors": { "manual": [6, 7, 8] }}])).unwrap();
2799 index
2800 .add_documents(
2801 documents!([{"id": 2, "_vectors": { "manual": [[9, 10, 11], [12, 13, 14], [15, 16, 17]] }}]),
2802 )
2803 .unwrap();
2804
2805 let rtxn = index.read_txn().unwrap();
2806 let mut embedding_configs = index.embedding_configs(&rtxn).unwrap();
2807 let IndexEmbeddingConfig { name: embedder_name, config: embedder, user_provided } =
2808 embedding_configs.pop().unwrap();
2809 insta::assert_snapshot!(embedder_name, @"manual");
2810 insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[0, 1, 2]>");
2811 let embedder = std::sync::Arc::new(
2812 crate::vector::Embedder::new(embedder.embedder_options, 0).unwrap(),
2813 );
2814 let res = index
2815 .search(&rtxn)
2816 .semantic(embedder_name, embedder, false, Some([0.0, 1.0, 2.0].to_vec()))
2817 .execute()
2818 .unwrap();
2819 assert_eq!(res.documents_ids.len(), 3);
2820 }
2821
    /// Runs three successive indexing batches on the same index, snapshotting the stored
    /// documents (and, for the first two batches, the external ids mapping) after each
    /// one, then runs a placeholder search and fetches the returned documents.
    ///
    /// Batch 1 adds document 1, batch 2 deletes document 1 and adds document 0, and
    /// batch 3 adds document 1 again.
    #[test]
    fn reproduce_the_bug() {
        let index = TempIndex::new();

        // Batch 1: replace-add document 1.
        println!("--- ENTERING BATCH 1");

        let mut wtxn = index.write_txn().unwrap();
        let indexer_config = &index.indexer_config;
        let rtxn = index.inner.read_txn().unwrap();
        let db_fields_ids_map = index.inner.fields_ids_map(&rtxn).unwrap();
        let mut new_fields_ids_map = db_fields_ids_map.clone();

        let indexer_alloc = Bump::new();
        let embedders = EmbeddingConfigs::default();
        let mut indexer = indexer::DocumentOperation::new();

        let documents = documents!([
            { "id": 1, "doggo": "bernese" },
        ]);
        indexer.replace_documents(&documents).unwrap();

        // Turn the queued operation into document changes...
        let (document_changes, _operation_stats, primary_key) = indexer
            .into_changes(
                &indexer_alloc,
                &index.inner,
                &rtxn,
                None,
                &mut new_fields_ids_map,
                &|| false,
                Progress::default(),
            )
            .unwrap();

        // ...and apply them through the write transaction.
        indexer::index(
            &mut wtxn,
            &index.inner,
            &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
            indexer_config.grenad_parameters(),
            &db_fields_ids_map,
            new_fields_ids_map,
            primary_key,
            &document_changes,
            embedders,
            &|| false,
            &Progress::default(),
        )
        .unwrap();
        wtxn.commit().unwrap();

        db_snap!(index, documents, @r###"
        {"id":1,"doggo":"bernese"}
        "###);
        // External id 1 is listed with value 0 (presumably its internal docid).
        db_snap!(index, external_documents_ids, @r###"
        docids:
        1 0
        "###);

        // Batch 2: delete document 1 and replace-add document 0 in one operation.
        println!("--- ENTERING BATCH 2");

        let mut wtxn = index.write_txn().unwrap();
        let indexer_config = &index.indexer_config;
        let rtxn = index.inner.read_txn().unwrap();
        let db_fields_ids_map = index.inner.fields_ids_map(&rtxn).unwrap();
        let mut new_fields_ids_map = db_fields_ids_map.clone();

        let indexer_alloc = Bump::new();
        let embedders = EmbeddingConfigs::default();
        let mut indexer = indexer::DocumentOperation::new();

        indexer.delete_documents(&["1"]);

        let documents = documents!([
            { "id": 0, "catto": "jorts" },
        ]);
        indexer.replace_documents(&documents).unwrap();

        let (document_changes, _operation_stats, primary_key) = indexer
            .into_changes(
                &indexer_alloc,
                &index.inner,
                &rtxn,
                None,
                &mut new_fields_ids_map,
                &|| false,
                Progress::default(),
            )
            .unwrap();

        indexer::index(
            &mut wtxn,
            &index.inner,
            &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
            indexer_config.grenad_parameters(),
            &db_fields_ids_map,
            new_fields_ids_map,
            primary_key,
            &document_changes,
            embedders,
            &|| false,
            &Progress::default(),
        )
        .unwrap();
        wtxn.commit().unwrap();

        db_snap!(index, documents, @r###"
        {"id":0,"catto":"jorts"}
        "###);

        // External id 0 is now listed with value 1; external id 1 is gone.
        db_snap!(index, external_documents_ids, @r###"
        docids:
        0 1
        "###);

        // Batch 3: replace-add document 1 again.
        println!("--- ENTERING BATCH 3");

        let mut wtxn = index.write_txn().unwrap();
        let indexer_config = &index.indexer_config;
        let rtxn = index.inner.read_txn().unwrap();
        let db_fields_ids_map = index.inner.fields_ids_map(&rtxn).unwrap();
        let mut new_fields_ids_map = db_fields_ids_map.clone();

        let indexer_alloc = Bump::new();
        let embedders = EmbeddingConfigs::default();
        let mut indexer = indexer::DocumentOperation::new();

        let documents = documents!([
            { "id": 1, "catto": "jorts" },
        ]);
        indexer.replace_documents(&documents).unwrap();

        let (document_changes, _operation_stats, primary_key) = indexer
            .into_changes(
                &indexer_alloc,
                &index.inner,
                &rtxn,
                None,
                &mut new_fields_ids_map,
                &|| false,
                Progress::default(),
            )
            .unwrap();

        indexer::index(
            &mut wtxn,
            &index.inner,
            &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
            indexer_config.grenad_parameters(),
            &db_fields_ids_map,
            new_fields_ids_map,
            primary_key,
            &document_changes,
            embedders,
            &|| false,
            &Progress::default(),
        )
        .unwrap();
        wtxn.commit().unwrap();

        db_snap!(index, documents, @r###"
        {"id":1,"catto":"jorts"}
        {"id":0,"catto":"jorts"}
        "###);

        // A placeholder search followed by a fetch of every returned id must succeed.
        let rtxn = index.read_txn().unwrap();
        let res = index.search(&rtxn).execute().unwrap();
        index.documents(&rtxn, res.documents_ids).unwrap();
    }
3029
3030 fn delete_documents<'t>(
3031 wtxn: &mut RwTxn<'t>,
3032 index: &'t TempIndex,
3033 external_ids: &[&str],
3034 ) -> Vec<u32> {
3035 let external_document_ids = index.external_documents_ids();
3036 let ids_to_delete: Vec<u32> = external_ids
3037 .iter()
3038 .map(|id| external_document_ids.get(wtxn, id).unwrap().unwrap())
3039 .collect();
3040
3041 index
3043 .delete_documents_using_wtxn(
3044 wtxn,
3045 external_ids.iter().map(ToString::to_string).collect(),
3046 )
3047 .unwrap();
3048
3049 ids_to_delete
3050 }
3051
3052 #[test]
3053 fn delete_documents_with_numbers_as_primary_key() {
3054 let index = TempIndex::new();
3055
3056 let mut wtxn = index.write_txn().unwrap();
3057 index
3058 .add_documents_using_wtxn(
3059 &mut wtxn,
3060 documents!([
3061 { "id": 0, "name": "kevin", "object": { "key1": "value1", "key2": "value2" } },
3062 { "id": 1, "name": "kevina", "array": ["I", "am", "fine"] },
3063 { "id": 2, "name": "benoit", "array_of_object": [{ "wow": "amazing" }] }
3064 ]),
3065 )
3066 .unwrap();
3067 wtxn.commit().unwrap();
3068
3069 let mut wtxn = index.write_txn().unwrap(); index.delete_documents_using_wtxn(&mut wtxn, vec![S("0"), S("1"), S("2")]).unwrap();
3071 wtxn.commit().unwrap();
3072
3073 db_snap!(index, documents_ids);
3075 db_snap!(index, word_docids);
3076 db_snap!(index, word_pair_proximity_docids);
3077 db_snap!(index, facet_id_exists_docids);
3078
3079 let rtxn = index.read_txn().unwrap();
3080
3081 assert!(index.field_distribution(&rtxn).unwrap().is_empty());
3082 }
3083
3084 #[test]
3085 fn delete_documents_with_strange_primary_key() {
3086 let index = TempIndex::new();
3087
3088 index
3089 .update_settings(|settings| settings.set_searchable_fields(vec!["name".to_string()]))
3090 .unwrap();
3091
3092 let mut wtxn = index.write_txn().unwrap();
3093 index
3094 .add_documents_using_wtxn(
3095 &mut wtxn,
3096 documents!([
3097 { "mysuperid": 0, "name": "kevin" },
3098 { "mysuperid": 1, "name": "kevina" },
3099 { "mysuperid": 2, "name": "benoit" }
3100 ]),
3101 )
3102 .unwrap();
3103 wtxn.commit().unwrap();
3104
3105 let mut wtxn = index.write_txn().unwrap();
3106 index.delete_documents_using_wtxn(&mut wtxn, vec![S("0"), S("1")]).unwrap();
3108
3109 wtxn.commit().unwrap();
3110
3111 db_snap!(index, documents_ids);
3112 db_snap!(index, word_docids);
3113 db_snap!(index, word_pair_proximity_docids);
3114 }
3115
3116 #[test]
3117 fn filtered_placeholder_search_should_not_return_deleted_documents() {
3118 let index = TempIndex::new();
3119
3120 let mut wtxn = index.write_txn().unwrap();
3121 index
3122 .update_settings_using_wtxn(&mut wtxn, |settings| {
3123 settings.set_primary_key(S("docid"));
3124 settings.set_filterable_fields(vec![
3125 FilterableAttributesRule::Field("label".to_string()),
3126 FilterableAttributesRule::Field("label2".to_string()),
3127 ]);
3128 })
3129 .unwrap();
3130 wtxn.commit().unwrap();
3131
3132 let mut wtxn = index.write_txn().unwrap();
3133 index
3134 .add_documents_using_wtxn(
3135 &mut wtxn,
3136 documents!([
3137 { "docid": "1_4", "label": ["sign"] },
3138 { "docid": "1_5", "label": ["letter"] },
3139 { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] },
3140 { "docid": "1_36", "label": ["drawing","painting","pattern"] },
3141 { "docid": "1_37", "label": ["art","drawing","outdoor"] },
3142 { "docid": "1_38", "label": ["aquarium","art","drawing"] },
3143 { "docid": "1_39", "label": ["abstract"] },
3144 { "docid": "1_40", "label": ["cartoon"] },
3145 { "docid": "1_41", "label": ["art","drawing"] },
3146 { "docid": "1_42", "label": ["art","pattern"] },
3147 { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] },
3148 { "docid": "1_44", "label": ["drawing"] },
3149 { "docid": "1_45", "label": ["art"] },
3150 { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] },
3151 { "docid": "1_47", "label": ["abstract","pattern"] },
3152 { "docid": "1_52", "label": ["abstract","cartoon"] },
3153 { "docid": "1_57", "label": ["abstract","drawing","pattern"] },
3154 { "docid": "1_58", "label": ["abstract","art","cartoon"] },
3155 { "docid": "1_68", "label": ["design"] },
3156 { "docid": "1_69", "label": ["geometry"] },
3157 { "docid": "1_70", "label2": ["geometry", 1.2] },
3158 { "docid": "1_71", "label2": ["design", 2.2] },
3159 { "docid": "1_72", "label2": ["geometry", 1.2] }
3160 ]),
3161 )
3162 .unwrap();
3163
3164 wtxn.commit().unwrap();
3165
3166 let mut wtxn = index.write_txn().unwrap();
3167 delete_documents(&mut wtxn, &index, &["1_4", "1_70", "1_72"]);
3168 wtxn.commit().unwrap();
3169
3170 let rtxn = index.read_txn().unwrap();
3171 let filter = Filter::from_str("label = sign").unwrap().unwrap();
3173 let results = index.search(&rtxn).filter(filter).execute().unwrap();
3174 assert!(results.documents_ids.is_empty());
3175
3176 db_snap!(index, word_docids);
3177 db_snap!(index, facet_id_f64_docids);
3178 db_snap!(index, word_pair_proximity_docids);
3179 db_snap!(index, facet_id_exists_docids);
3180 db_snap!(index, facet_id_string_docids);
3181 }
3182
3183 #[test]
3184 fn placeholder_search_should_not_return_deleted_documents() {
3185 let index = TempIndex::new();
3186
3187 index
3188 .update_settings(|settings| {
3189 settings.set_primary_key(S("docid"));
3190 })
3191 .unwrap();
3192
3193 index
3194 .add_documents(documents!([
3195 { "docid": "1_4", "label": ["sign"] },
3196 { "docid": "1_5", "label": ["letter"] },
3197 { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] },
3198 { "docid": "1_36", "label": ["drawing","painting","pattern"] },
3199 { "docid": "1_37", "label": ["art","drawing","outdoor"] },
3200 { "docid": "1_38", "label": ["aquarium","art","drawing"] },
3201 { "docid": "1_39", "label": ["abstract"] },
3202 { "docid": "1_40", "label": ["cartoon"] },
3203 { "docid": "1_41", "label": ["art","drawing"] },
3204 { "docid": "1_42", "label": ["art","pattern"] },
3205 { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] },
3206 { "docid": "1_44", "label": ["drawing"] },
3207 { "docid": "1_45", "label": ["art"] },
3208 { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] },
3209 { "docid": "1_47", "label": ["abstract","pattern"] },
3210 { "docid": "1_52", "label": ["abstract","cartoon"] },
3211 { "docid": "1_57", "label": ["abstract","drawing","pattern"] },
3212 { "docid": "1_58", "label": ["abstract","art","cartoon"] },
3213 { "docid": "1_68", "label": ["design"] },
3214 { "docid": "1_69", "label": ["geometry"] },
3215 { "docid": "1_70", "label2": ["geometry", 1.2] },
3216 { "docid": "1_71", "label2": ["design", 2.2] },
3217 { "docid": "1_72", "label2": ["geometry", 1.2] }
3218 ]))
3219 .unwrap();
3220
3221 let mut wtxn = index.write_txn().unwrap();
3222
3223 let deleted_internal_ids = delete_documents(&mut wtxn, &index, &["1_4"]);
3224
3225 wtxn.commit().unwrap();
3226
3227 let rtxn = index.static_read_txn().unwrap();
3229
3230 let results = index.search(&rtxn).execute().unwrap();
3231 assert!(!results.documents_ids.is_empty());
3232 for id in results.documents_ids.iter() {
3233 assert!(
3234 !deleted_internal_ids.contains(id),
3235 "The document {} was supposed to be deleted",
3236 id
3237 );
3238 }
3239
3240 drop(rtxn);
3241 }
3242
3243 #[test]
3244 fn search_should_not_return_deleted_documents() {
3245 let index = TempIndex::new();
3246
3247 index
3248 .update_settings(|settings| {
3249 settings.set_primary_key(S("docid"));
3250 })
3251 .unwrap();
3252
3253 index
3254 .add_documents(documents!([
3255 { "docid": "1_4", "label": ["sign"] },
3256 { "docid": "1_5", "label": ["letter"] },
3257 { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] },
3258 { "docid": "1_36", "label": ["drawing","painting","pattern"] },
3259 { "docid": "1_37", "label": ["art","drawing","outdoor"] },
3260 { "docid": "1_38", "label": ["aquarium","art","drawing"] },
3261 { "docid": "1_39", "label": ["abstract"] },
3262 { "docid": "1_40", "label": ["cartoon"] },
3263 { "docid": "1_41", "label": ["art","drawing"] },
3264 { "docid": "1_42", "label": ["art","pattern"] },
3265 { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] },
3266 { "docid": "1_44", "label": ["drawing"] },
3267 { "docid": "1_45", "label": ["art"] },
3268 { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] },
3269 { "docid": "1_47", "label": ["abstract","pattern"] },
3270 { "docid": "1_52", "label": ["abstract","cartoon"] },
3271 { "docid": "1_57", "label": ["abstract","drawing","pattern"] },
3272 { "docid": "1_58", "label": ["abstract","art","cartoon"] },
3273 { "docid": "1_68", "label": ["design"] },
3274 { "docid": "1_69", "label": ["geometry"] },
3275 { "docid": "1_70", "label2": ["geometry", 1.2] },
3276 { "docid": "1_71", "label2": ["design", 2.2] },
3277 { "docid": "1_72", "label2": ["geometry", 1.2] }
3278 ]))
3279 .unwrap();
3280
3281 let mut wtxn = index.write_txn().unwrap();
3282 let deleted_internal_ids = delete_documents(&mut wtxn, &index, &["1_7", "1_52"]);
3283 wtxn.commit().unwrap();
3284
3285 let rtxn = index.read_txn().unwrap();
3287 let results = index.search(&rtxn).query("abstract").execute().unwrap();
3288 assert!(!results.documents_ids.is_empty());
3289 for id in results.documents_ids.iter() {
3290 assert!(
3291 !deleted_internal_ids.contains(id),
3292 "The document {} was supposed to be deleted",
3293 id
3294 );
3295 }
3296 }
3297
/// A geo-radius filtered placeholder search must never surface documents that were deleted
/// earlier in the same, still uncommitted, write transaction.
#[test]
fn geo_filtered_placeholder_search_should_not_return_deleted_documents() {
    let index = TempIndex::new();

    // Declare the primary key and make the reserved geo field filterable and sortable.
    {
        let mut settings_txn = index.write_txn().unwrap();
        index
            .update_settings_using_wtxn(&mut settings_txn, |settings| {
                settings.set_primary_key(S("id"));
                settings.set_filterable_fields(vec![FilterableAttributesRule::Field(
                    RESERVED_GEO_FIELD_NAME.to_string(),
                )]);
                settings.set_sortable_fields(hashset!(S(RESERVED_GEO_FIELD_NAME)));
            })
            .unwrap();
        settings_txn.commit().unwrap();
    }

    // Index twenty cities, each one carrying a geo point.
    {
        let mut insertion_txn = index.write_txn().unwrap();
        index.add_documents_using_wtxn(&mut insertion_txn, documents!([
            { "id": "1", "city": "Lille", RESERVED_GEO_FIELD_NAME: { "lat": 50.6299, "lng": 3.0569 } },
            { "id": "2", "city": "Mons-en-Barœul", RESERVED_GEO_FIELD_NAME: { "lat": 50.6415, "lng": 3.1106 } },
            { "id": "3", "city": "Hellemmes", RESERVED_GEO_FIELD_NAME: { "lat": 50.6312, "lng": 3.1106 } },
            { "id": "4", "city": "Villeneuve-d'Ascq", RESERVED_GEO_FIELD_NAME: { "lat": 50.6224, "lng": 3.1476 } },
            { "id": "5", "city": "Hem", RESERVED_GEO_FIELD_NAME: { "lat": 50.6552, "lng": 3.1897 } },
            { "id": "6", "city": "Roubaix", RESERVED_GEO_FIELD_NAME: { "lat": 50.6924, "lng": 3.1763 } },
            { "id": "7", "city": "Tourcoing", RESERVED_GEO_FIELD_NAME: { "lat": 50.7263, "lng": 3.1541 } },
            { "id": "8", "city": "Mouscron", RESERVED_GEO_FIELD_NAME: { "lat": 50.7453, "lng": 3.2206 } },
            { "id": "9", "city": "Tournai", RESERVED_GEO_FIELD_NAME: { "lat": 50.6053, "lng": 3.3758 } },
            { "id": "10", "city": "Ghent", RESERVED_GEO_FIELD_NAME: { "lat": 51.0537, "lng": 3.6957 } },
            { "id": "11", "city": "Brussels", RESERVED_GEO_FIELD_NAME: { "lat": 50.8466, "lng": 4.3370 } },
            { "id": "12", "city": "Charleroi", RESERVED_GEO_FIELD_NAME: { "lat": 50.4095, "lng": 4.4347 } },
            { "id": "13", "city": "Mons", RESERVED_GEO_FIELD_NAME: { "lat": 50.4502, "lng": 3.9623 } },
            { "id": "14", "city": "Valenciennes", RESERVED_GEO_FIELD_NAME: { "lat": 50.3518, "lng": 3.5326 } },
            { "id": "15", "city": "Arras", RESERVED_GEO_FIELD_NAME: { "lat": 50.2844, "lng": 2.7637 } },
            { "id": "16", "city": "Cambrai", RESERVED_GEO_FIELD_NAME: { "lat": 50.1793, "lng": 3.2189 } },
            { "id": "17", "city": "Bapaume", RESERVED_GEO_FIELD_NAME: { "lat": 50.1112, "lng": 2.8547 } },
            { "id": "18", "city": "Amiens", RESERVED_GEO_FIELD_NAME: { "lat": 49.9314, "lng": 2.2710 } },
            { "id": "19", "city": "Compiègne", RESERVED_GEO_FIELD_NAME: { "lat": 49.4449, "lng": 2.7913 } },
            { "id": "20", "city": "Paris", RESERVED_GEO_FIELD_NAME: { "lat": 48.9021, "lng": 2.3708 } }
        ])).unwrap();
        insertion_txn.commit().unwrap();
    }

    // Delete a handful of documents, then search through the very same write transaction
    // before it gets committed.
    let mut wtxn = index.write_txn().unwrap();
    let deleted_internal_ids =
        delete_documents(&mut wtxn, &index, &["5", "6", "7", "12", "17", "19"]);

    // The radius is centred on the coordinates of document "6" (Roubaix), which is one of the
    // deleted documents.
    let geo_filter = Filter::from_str("_geoRadius(50.6924, 3.1763, 20000)").unwrap().unwrap();
    let found_ids = index.search(&wtxn).filter(geo_filter).execute().unwrap().documents_ids;

    // The search must still match something, otherwise the check below proves nothing.
    assert!(!found_ids.is_empty());
    found_ids.iter().for_each(|id| {
        assert!(
            !deleted_internal_ids.contains(id),
            "The document {} was supposed to be deleted",
            id
        );
    });

    wtxn.commit().unwrap();

    db_snap!(index, facet_id_f64_docids);
    db_snap!(index, facet_id_string_docids);
}
3360
/// Deleted documents must be invisible through every document accessor of the index:
/// `all_documents`, `documents_ids` and the external-to-internal ids mapping.
///
/// Each accessor that is iterated is also checked to return at least one document, so the
/// "not deleted" assertions cannot pass vacuously on an empty result.
#[test]
fn get_documents_should_not_return_deleted_documents() {
    let index = TempIndex::new();

    let mut wtxn = index.write_txn().unwrap();
    index
        .update_settings_using_wtxn(&mut wtxn, |settings| {
            settings.set_primary_key(S("docid"));
        })
        .unwrap();

    index
        .add_documents_using_wtxn(
            &mut wtxn,
            documents!([
                { "docid": "1_4", "label": ["sign"] },
                { "docid": "1_5", "label": ["letter"] },
                { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] },
                { "docid": "1_36", "label": ["drawing","painting","pattern"] },
                { "docid": "1_37", "label": ["art","drawing","outdoor"] },
                { "docid": "1_38", "label": ["aquarium","art","drawing"] },
                { "docid": "1_39", "label": ["abstract"] },
                { "docid": "1_40", "label": ["cartoon"] },
                { "docid": "1_41", "label": ["art","drawing"] },
                { "docid": "1_42", "label": ["art","pattern"] },
                { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] },
                { "docid": "1_44", "label": ["drawing"] },
                { "docid": "1_45", "label": ["art"] },
                { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] },
                { "docid": "1_47", "label": ["abstract","pattern"] },
                { "docid": "1_52", "label": ["abstract","cartoon"] },
                { "docid": "1_57", "label": ["abstract","drawing","pattern"] },
                { "docid": "1_58", "label": ["abstract","art","cartoon"] },
                { "docid": "1_68", "label": ["design"] },
                { "docid": "1_69", "label": ["geometry"] },
                { "docid": "1_70", "label2": ["geometry", 1.2] },
                { "docid": "1_71", "label2": ["design", 2.2] },
                { "docid": "1_72", "label2": ["geometry", 1.2] }
            ]),
        )
        .unwrap();
    wtxn.commit().unwrap();

    let mut wtxn = index.write_txn().unwrap();
    let deleted_external_ids = ["1_7", "1_52"];
    let deleted_internal_ids = delete_documents(&mut wtxn, &index, &deleted_external_ids);
    wtxn.commit().unwrap();

    let rtxn = index.read_txn().unwrap();

    // Walk every stored document: none of them may carry a deleted internal id.
    let mut returned_documents = 0usize;
    for result in index.all_documents(&rtxn).unwrap() {
        let (id, _) = result.unwrap();
        assert!(
            !deleted_internal_ids.contains(&id),
            "The document {} was supposed to be deleted",
            id
        );
        returned_documents += 1;
    }
    // Only two documents were deleted, so an empty iteration would mean the loop above
    // checked nothing at all.
    assert!(returned_documents > 0, "all_documents returned no document at all");

    // Same check on the set of internal document ids.
    let results = index.documents_ids(&rtxn).unwrap();
    assert!(!results.is_empty(), "documents_ids returned no document id at all");
    for id in results {
        assert!(
            !deleted_internal_ids.contains(&id),
            "The document {} was supposed to be deleted",
            id
        );
    }

    // The deleted external ids must no longer resolve to anything.
    let results = index.external_documents_ids();
    for id in deleted_external_ids {
        assert!(
            results.get(&rtxn, id).unwrap().is_none(),
            "The document {} was supposed to be deleted",
            id
        );
    }
}
3442
/// The document count and the field distribution must both stop accounting for documents
/// once they have been deleted.
#[test]
fn stats_should_not_return_deleted_documents() {
    let index = TempIndex::new();

    index
        .update_settings(|settings| {
            settings.set_primary_key(S("docid"));
        })
        .unwrap();

    // Twenty documents: all have a "label", two have a "title" and two have a "number".
    index.add_documents(documents!([
        { "docid": "1_4", "label": ["sign"]},
        { "docid": "1_5", "label": ["letter"]},
        { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"], "title": "Mickey Mouse"},
        { "docid": "1_36", "label": ["drawing","painting","pattern"]},
        { "docid": "1_37", "label": ["art","drawing","outdoor"]},
        { "docid": "1_38", "label": ["aquarium","art","drawing"], "title": "Nemo"},
        { "docid": "1_39", "label": ["abstract"]},
        { "docid": "1_40", "label": ["cartoon"]},
        { "docid": "1_41", "label": ["art","drawing"]},
        { "docid": "1_42", "label": ["art","pattern"]},
        { "docid": "1_43", "label": ["abstract","art","drawing","pattern"], "number": 32i32},
        { "docid": "1_44", "label": ["drawing"], "number": 44i32},
        { "docid": "1_45", "label": ["art"]},
        { "docid": "1_46", "label": ["abstract","colorfulness","pattern"]},
        { "docid": "1_47", "label": ["abstract","pattern"]},
        { "docid": "1_52", "label": ["abstract","cartoon"]},
        { "docid": "1_57", "label": ["abstract","drawing","pattern"]},
        { "docid": "1_58", "label": ["abstract","art","cartoon"]},
        { "docid": "1_68", "label": ["design"]},
        { "docid": "1_69", "label": ["geometry"]}
    ])).unwrap();

    // Remove two documents; "1_7" is one of the two that carry a "title".
    {
        let mut deletion_txn = index.write_txn().unwrap();
        delete_documents(&mut deletion_txn, &index, &["1_7", "1_52"]);
        deletion_txn.commit().unwrap();
    }

    let rtxn = index.read_txn().unwrap();

    // Twenty documents minus the two deleted ones.
    let document_count = index.number_of_documents(&rtxn).unwrap();
    assert_eq!(18, document_count);

    // Every per-field counter must only reflect the surviving documents.
    let distribution = index.field_distribution(&rtxn).unwrap();
    assert_eq!(Some(&18), distribution.get("label"));
    assert_eq!(Some(&1), distribution.get("title"));
    assert_eq!(Some(&2), distribution.get("number"));

    rtxn.commit().unwrap();
}
3495
/// Partially updating existing documents (`UpdateDocuments` method) must leave the field
/// distribution untouched when no field is added to or removed from any document.
#[test]
fn incremental_update_without_changing_facet_distribution() {
    // Every field is expected to be present in exactly two documents, before and after
    // the partial update.
    let assert_each_field_counted_twice = |index: &TempIndex| {
        let rtxn = index.read_txn().unwrap();
        let distribution = index.field_distribution(&rtxn).unwrap();
        assert_eq!(Some(&2), distribution.get("id"));
        assert_eq!(Some(&2), distribution.get("some_field"));
        assert_eq!(Some(&2), distribution.get("other_field"));
    };

    let mut index = TempIndex::new();
    index
        .add_documents(documents!([
            {"id": 0, "some_field": "aaa", "other_field": "aaa" },
            {"id": 1, "some_field": "bbb", "other_field": "bbb" },
        ]))
        .unwrap();
    assert_each_field_counted_twice(&index);

    // From now on, incoming documents are merged into the existing ones.
    index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments;

    // Each payload only overwrites one field that the targeted document already has.
    index
        .add_documents(documents!([
            {"id": 0, "other_field": "bbb" },
            {"id": 1, "some_field": "ccc" },
        ]))
        .unwrap();
    assert_each_field_counted_twice(&index);
}
3533
/// Deleting the only document that holds a word in an exact attribute must empty the exact
/// word database while leaving the regular word database and the words FST untouched, since
/// another document still contains that word in a non-exact attribute.
#[test]
fn delete_words_exact_attributes() {
    let index = TempIndex::new();

    // Both fields are searchable, but only "exact" is declared as an exact attribute.
    index
        .update_settings(|settings| {
            settings.set_primary_key(S("id"));
            settings.set_searchable_fields(vec![S("text"), S("exact")]);
            settings.set_exact_attributes(std::iter::once(S("exact")).collect());
        })
        .unwrap();

    // The same word lives in the regular attribute of document 0 and in the exact attribute
    // of document 1.
    index
        .add_documents(documents!([
            { "id": 0, "text": "hello" },
            { "id": 1, "exact": "hello"}
        ]))
        .unwrap();
    db_snap!(index, word_docids, 1, @r###"
    hello [0, ]
    "###);
    db_snap!(index, exact_word_docids, 1, @r###"
    hello [1, ]
    "###);
    db_snap!(index, words_fst, 1, @"300000000000000001084cfcfc2ce1000000016000000090ea47f");

    // Remove the document that holds the exact occurrence.
    let deleted_internal_ids = {
        let mut deletion_txn = index.write_txn().unwrap();
        let removed = delete_documents(&mut deletion_txn, &index, &["1"]);
        deletion_txn.commit().unwrap();
        removed
    };

    db_snap!(index, word_docids, 2, @r###"
    hello [0, ]
    "###);
    db_snap!(index, exact_word_docids, 2, @"");
    db_snap!(index, words_fst, 2, @"300000000000000001084cfcfc2ce1000000016000000090ea47f");

    insta::assert_snapshot!(format!("{deleted_internal_ids:?}"), @"[1]");

    // The word is still known to the FST thanks to document 0.
    let txn = index.read_txn().unwrap();
    let words_fst = index.words_fst(&txn).unwrap();
    let words = words_fst.into_stream().into_strs().unwrap();
    insta::assert_snapshot!(format!("{words:?}"), @r###"["hello"]"###);

    // Searching the word only returns the surviving document.
    let mut s = Search::new(&txn, &index);
    s.query("hello");
    let documents_ids = s.execute().unwrap().documents_ids;
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]");
}
3580}