use std::sync::Arc;

use rustc_hash::FxHashMap;

use crate::dsl::Schema;
use crate::error::Result;
use crate::structures::{CoarseCentroids, PQCodebook};

#[cfg(feature = "native")]
mod reader;
#[cfg(feature = "native")]
mod vector_builder;
#[cfg(feature = "native")]
mod writer;
#[cfg(feature = "native")]
pub use reader::{IndexReader, Searcher};
#[cfg(feature = "native")]
pub use writer::IndexWriter;

mod metadata;
pub use metadata::{FieldVectorMeta, INDEX_META_FILENAME, IndexMetadata, VectorIndexState};

#[cfg(feature = "native")]
mod helpers;
#[cfg(feature = "native")]
pub use helpers::{
    IndexingStats, SchemaConfig, SchemaFieldConfig, create_index_at_path, create_index_from_sdl,
    index_documents_from_reader, index_json_document, parse_schema,
};

pub const SLICE_CACHE_FILENAME: &str = "index.slicecache";
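/// Options controlling indexing concurrency, cache sizes, the in-memory indexing budget,
/// and segment merging.
///
/// A minimal construction sketch (the override shown is illustrative, not a recommendation):
///
/// ```ignore
/// let config = IndexConfig {
///     // A smaller budget flushes in-memory segments to the directory more often.
///     max_indexing_memory_bytes: 64 * 1024 * 1024,
///     ..IndexConfig::default()
/// };
/// ```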
#[derive(Debug, Clone)]
pub struct IndexConfig {
    /// Number of general worker threads (defaults to the number of CPUs).
    pub num_threads: usize,
    /// Number of threads used for indexing (defaults to 1).
    pub num_indexing_threads: usize,
    /// Number of threads used for compression (defaults to the number of CPUs).
    pub num_compression_threads: usize,
    /// Term dictionary cache size, in blocks.
    pub term_cache_blocks: usize,
    /// Document store cache size, in blocks.
    pub store_cache_blocks: usize,
    /// In-memory indexing budget, in bytes, before a segment is flushed.
    pub max_indexing_memory_bytes: usize,
    /// Policy used to select segments for merging.
    pub merge_policy: Box<dyn crate::merge::MergePolicy>,
    /// Index optimization settings.
    pub optimization: crate::structures::IndexOptimization,
}

impl Default for IndexConfig {
    fn default() -> Self {
        #[cfg(feature = "native")]
        let cpus = num_cpus::get().max(1);
        #[cfg(not(feature = "native"))]
        let cpus = 1;

        Self {
            num_threads: cpus,
            num_indexing_threads: 1,
            num_compression_threads: cpus,
            term_cache_blocks: 256,
            store_cache_blocks: 32,
            max_indexing_memory_bytes: 2 * 1024 * 1024 * 1024,
            merge_policy: Box::new(crate::merge::TieredMergePolicy::default()),
            optimization: crate::structures::IndexOptimization::default(),
        }
    }
}

#[cfg(feature = "native")]
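/// Top-level handle over a directory-backed index: it owns the schema, the segment manager,
/// and any trained vector structures, and hands out writers and readers.
///
/// An end-to-end sketch modeled on the tests at the bottom of this module (a `RamDirectory`
/// and the default config are assumed):
///
/// ```ignore
/// let mut builder = SchemaBuilder::default();
/// let title = builder.add_text_field("title", true, true);
/// let schema = builder.build();
///
/// let dir = RamDirectory::new();
/// let config = IndexConfig::default();
///
/// let writer = IndexWriter::create(dir.clone(), schema, config.clone()).await?;
/// let mut doc = Document::new();
/// doc.add_text(title, "hello world");
/// writer.add_document(doc)?;
/// writer.commit().await?;
///
/// let index = Index::open(dir, config).await?;
/// let response = index.query("title:hello", 10).await?;
/// assert!(!response.hits.is_empty());
/// ```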
pub struct Index<D: crate::directories::DirectoryWriter + 'static> {
    directory: Arc<D>,
    schema: Arc<Schema>,
    config: IndexConfig,
    segment_manager: Arc<crate::merge::SegmentManager<D>>,
    /// Trained coarse centroids per dense-vector field, keyed by field id.
    trained_centroids: FxHashMap<u32, Arc<CoarseCentroids>>,
    /// Trained product-quantization codebooks per dense-vector field, keyed by field id.
    trained_codebooks: FxHashMap<u32, Arc<PQCodebook>>,
}

#[cfg(feature = "native")]
impl<D: crate::directories::DirectoryWriter + 'static> Index<D> {
    pub async fn create(directory: D, schema: Schema, config: IndexConfig) -> Result<Self> {
        let directory = Arc::new(directory);
        let schema = Arc::new(schema);
        let metadata = IndexMetadata::new((*schema).clone());

        let segment_manager = Arc::new(crate::merge::SegmentManager::new(
            Arc::clone(&directory),
            Arc::clone(&schema),
            metadata,
            config.merge_policy.clone_box(),
            config.term_cache_blocks,
        ));

        // Persist the initial metadata through the segment manager.
        segment_manager.update_metadata(|_| {}).await?;

        Ok(Self {
            directory,
            schema,
            config,
            segment_manager,
            trained_centroids: FxHashMap::default(),
            trained_codebooks: FxHashMap::default(),
        })
    }

    pub async fn open(directory: D, config: IndexConfig) -> Result<Self> {
        let directory = Arc::new(directory);

        // Load the persisted metadata, which carries the schema.
        let metadata = IndexMetadata::load(directory.as_ref()).await?;
        let schema = Arc::new(metadata.schema.clone());

        // Load any trained vector structures (coarse centroids and PQ codebooks).
        let (trained_centroids, trained_codebooks) =
            metadata.load_trained_structures(directory.as_ref()).await;

        let segment_manager = Arc::new(crate::merge::SegmentManager::new(
            Arc::clone(&directory),
            Arc::clone(&schema),
            metadata,
            config.merge_policy.clone_box(),
            config.term_cache_blocks,
        ));

        Ok(Self {
            directory,
            schema,
            config,
            segment_manager,
            trained_centroids,
            trained_codebooks,
        })
    }

    pub fn schema(&self) -> &Schema {
        &self.schema
    }

    pub fn directory(&self) -> &D {
        &self.directory
    }

    pub fn segment_manager(&self) -> &Arc<crate::merge::SegmentManager<D>> {
        &self.segment_manager
    }

    pub fn writer(&self) -> writer::IndexWriter<D> {
        writer::IndexWriter::from_index(self)
    }

    pub async fn reader(&self) -> Result<IndexReader<D>> {
        IndexReader::from_segment_manager(
            Arc::clone(&self.schema),
            Arc::clone(&self.segment_manager),
            self.trained_centroids.clone(),
            self.trained_codebooks.clone(),
            self.config.term_cache_blocks,
        )
        .await
    }

    pub fn config(&self) -> &IndexConfig {
        &self.config
    }

    pub fn trained_centroids(&self) -> &FxHashMap<u32, Arc<CoarseCentroids>> {
        &self.trained_centroids
    }

    pub fn trained_codebooks(&self) -> &FxHashMap<u32, Arc<PQCodebook>> {
        &self.trained_codebooks
    }

    pub async fn segment_readers(&self) -> Result<Vec<Arc<crate::segment::SegmentReader>>> {
        let reader = self.reader().await?;
        let searcher = reader.searcher().await?;
        Ok(searcher.segment_readers().to_vec())
    }

    pub async fn num_docs(&self) -> Result<u32> {
        let reader = self.reader().await?;
        let searcher = reader.searcher().await?;
        Ok(searcher.num_docs())
    }

    pub async fn doc(&self, doc_id: crate::DocId) -> Result<Option<crate::dsl::Document>> {
        let reader = self.reader().await?;
        let searcher = reader.searcher().await?;
        searcher.doc(doc_id).await
    }

    pub fn default_fields(&self) -> Vec<crate::Field> {
        // Use the schema's configured default fields, falling back to all indexed text fields.
        if !self.schema.default_fields().is_empty() {
            self.schema.default_fields().to_vec()
        } else {
            self.schema
                .fields()
                .filter(|(_, entry)| {
                    entry.indexed && entry.field_type == crate::dsl::FieldType::Text
                })
                .map(|(field, _)| field)
                .collect()
        }
    }

    pub fn tokenizers(&self) -> Arc<crate::tokenizer::TokenizerRegistry> {
        Arc::new(crate::tokenizer::TokenizerRegistry::default())
    }

    pub fn query_parser(&self) -> crate::dsl::QueryLanguageParser {
        let default_fields = self.default_fields();
        let tokenizers = self.tokenizers();

        // Prefer a router-aware parser when the schema defines query routing rules.
        let query_routers = self.schema.query_routers();
        if !query_routers.is_empty()
            && let Ok(router) = crate::dsl::QueryFieldRouter::from_rules(query_routers)
        {
            return crate::dsl::QueryLanguageParser::with_router(
                Arc::clone(&self.schema),
                default_fields,
                tokenizers,
                router,
            );
        }

        crate::dsl::QueryLanguageParser::new(Arc::clone(&self.schema), default_fields, tokenizers)
    }

    pub async fn query(
        &self,
        query_str: &str,
        limit: usize,
    ) -> Result<crate::query::SearchResponse> {
        self.query_offset(query_str, limit, 0).await
    }
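    /// Paging sketch: hits are merged from all segments and sorted by score, then `offset`
    /// hits are skipped before up to `limit` hits are returned.
    ///
    /// ```ignore
    /// let page1 = index.query_offset("title:rust", 10, 0).await?;
    /// let page2 = index.query_offset("title:rust", 10, 10).await?;
    /// ```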
    pub async fn query_offset(
        &self,
        query_str: &str,
        limit: usize,
        offset: usize,
    ) -> Result<crate::query::SearchResponse> {
        let parser = self.query_parser();
        let query = parser
            .parse(query_str)
            .map_err(crate::error::Error::Query)?;
        self.search_offset(query.as_ref(), limit, offset).await
    }

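    /// Programmatic-query sketch, mirroring the boolean-query tests below (`content` is
    /// assumed to be an indexed text field; `BooleanQuery` and `TermQuery` come from
    /// `crate::query`):
    ///
    /// ```ignore
    /// let q = BooleanQuery::new()
    ///     .should(TermQuery::text(content, "rust"))
    ///     .should(TermQuery::text(content, "programming"));
    /// let response = index.search(&q, 10).await?;
    /// ```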
    pub async fn search(
        &self,
        query: &dyn crate::query::Query,
        limit: usize,
    ) -> Result<crate::query::SearchResponse> {
        self.search_offset(query, limit, 0).await
    }

    pub async fn search_offset(
        &self,
        query: &dyn crate::query::Query,
        limit: usize,
        offset: usize,
    ) -> Result<crate::query::SearchResponse> {
        let reader = self.reader().await?;
        let searcher = reader.searcher().await?;
        let segments = searcher.segment_readers();

        // Collect up to `offset + limit` hits from every segment.
        let mut all_results: Vec<(u128, crate::query::SearchResult)> = Vec::new();
        let fetch_limit = offset + limit;

        for segment in segments {
            let segment_id = segment.meta().id;
            let results =
                crate::query::search_segment(segment.as_ref(), query, fetch_limit).await?;
            for result in results {
                all_results.push((segment_id, result));
            }
        }

        // Merge across segments by descending score, then apply offset and limit.
        all_results.sort_by(|a, b| {
            b.1.score
                .partial_cmp(&a.1.score)
                .unwrap_or(std::cmp::Ordering::Equal)
        });

        let total_hits = all_results.len() as u32;

        let hits: Vec<crate::query::SearchHit> = all_results
            .into_iter()
            .skip(offset)
            .take(limit)
            .map(|(segment_id, result)| crate::query::SearchHit {
                address: crate::query::DocAddress::new(segment_id, result.doc_id),
                score: result.score,
                matched_fields: result.extract_ordinals(),
            })
            .collect();

        Ok(crate::query::SearchResponse { hits, total_hits })
    }

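    /// Address-based lookup sketch: resolve a hit from a previous search back to its document.
    ///
    /// ```ignore
    /// let response = index.query("title:rust", 10).await?;
    /// if let Some(hit) = response.hits.first() {
    ///     let doc = index.get_document(&hit.address).await?;
    /// }
    /// ```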
    pub async fn get_document(
        &self,
        address: &crate::query::DocAddress,
    ) -> Result<Option<crate::dsl::Document>> {
        let segment_id = address.segment_id_u128().ok_or_else(|| {
            crate::error::Error::Query(format!("Invalid segment ID: {}", address.segment_id))
        })?;

        let reader = self.reader().await?;
        let searcher = reader.searcher().await?;

        for segment in searcher.segment_readers() {
            if segment.meta().id == segment_id {
                return segment.doc(address.doc_id).await;
            }
        }

        Ok(None)
    }

    pub async fn reload(&self) -> Result<()> {
        // Nothing to refresh eagerly: readers are built on demand from the segment manager.
        Ok(())
    }

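    /// Low-level postings sketch (assuming `title` is an indexed text field; the term bytes
    /// are the tokenized form, e.g. lowercase for the default tokenizer):
    ///
    /// ```ignore
    /// for (segment, postings) in index.get_postings(title, b"world").await? {
    ///     println!("segment {}: {} matching docs", segment.meta().id, postings.doc_count());
    /// }
    /// ```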
    pub async fn get_postings(
        &self,
        field: crate::Field,
        term: &[u8],
    ) -> Result<
        Vec<(
            Arc<crate::segment::SegmentReader>,
            crate::structures::BlockPostingList,
        )>,
    > {
        let segments = self.segment_readers().await?;
        let mut results = Vec::new();

        for segment in segments {
            if let Some(postings) = segment.get_postings(field, term).await? {
                results.push((segment, postings));
            }
        }

        Ok(results)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::directories::RamDirectory;
    use crate::dsl::{Document, SchemaBuilder};

    #[tokio::test]
    async fn test_index_create_and_search() {
        let mut schema_builder = SchemaBuilder::default();
        let title = schema_builder.add_text_field("title", true, true);
        let body = schema_builder.add_text_field("body", true, true);
        let schema = schema_builder.build();

        let dir = RamDirectory::new();
        let config = IndexConfig::default();

        let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
            .await
            .unwrap();

        let mut doc1 = Document::new();
        doc1.add_text(title, "Hello World");
        doc1.add_text(body, "This is the first document");
        writer.add_document(doc1).unwrap();

        let mut doc2 = Document::new();
        doc2.add_text(title, "Goodbye World");
        doc2.add_text(body, "This is the second document");
        writer.add_document(doc2).unwrap();

        writer.commit().await.unwrap();

        let index = Index::open(dir, config).await.unwrap();
        assert_eq!(index.num_docs().await.unwrap(), 2);

        // "world" appears in both titles, and both documents live in a single segment.
        let postings = index.get_postings(title, b"world").await.unwrap();
        assert_eq!(postings.len(), 1);
        assert_eq!(postings[0].1.doc_count(), 2);

        let doc = index.doc(0).await.unwrap().unwrap();
        assert_eq!(doc.get_first(title).unwrap().as_text(), Some("Hello World"));
    }

    #[tokio::test]
    async fn test_multiple_segments() {
        let mut schema_builder = SchemaBuilder::default();
        let title = schema_builder.add_text_field("title", true, true);
        let schema = schema_builder.build();

        let dir = RamDirectory::new();
        let config = IndexConfig {
            max_indexing_memory_bytes: 1024, // Tiny budget to force multiple segments.
            ..Default::default()
        };

        let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
            .await
            .unwrap();

        for batch in 0..3 {
            for i in 0..5 {
                let mut doc = Document::new();
                doc.add_text(title, format!("Document {} batch {}", i, batch));
                writer.add_document(doc).unwrap();
            }
            writer.commit().await.unwrap();
        }

        let index = Index::open(dir, config).await.unwrap();
        assert_eq!(index.num_docs().await.unwrap(), 15);
        assert!(
            index.segment_readers().await.unwrap().len() >= 2,
            "Expected multiple segments"
        );
    }

    #[tokio::test]
    async fn test_segment_merge() {
        let mut schema_builder = SchemaBuilder::default();
        let title = schema_builder.add_text_field("title", true, true);
        let schema = schema_builder.build();

        let dir = RamDirectory::new();
        let config = IndexConfig {
            max_indexing_memory_bytes: 512, // Tiny budget to force multiple segments.
            ..Default::default()
        };

        let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
            .await
            .unwrap();

        for batch in 0..3 {
            for i in 0..3 {
                let mut doc = Document::new();
                doc.add_text(title, format!("Document {} batch {}", i, batch));
                writer.add_document(doc).unwrap();
            }
            writer.flush().await.unwrap();
        }
        writer.commit().await.unwrap();

        let index = Index::open(dir.clone(), config.clone()).await.unwrap();
        assert!(
            index.segment_readers().await.unwrap().len() >= 2,
            "Expected multiple segments"
        );

        // Force-merge everything down to a single segment.
        let writer = IndexWriter::open(dir.clone(), config.clone())
            .await
            .unwrap();
        writer.force_merge().await.unwrap();

        let index = Index::open(dir, config).await.unwrap();
        assert_eq!(index.segment_readers().await.unwrap().len(), 1);
        assert_eq!(index.num_docs().await.unwrap(), 9);

        // Every document should still be retrievable after the merge.
        let mut found_docs = 0;
        for i in 0..9 {
            if index.doc(i).await.unwrap().is_some() {
                found_docs += 1;
            }
        }
        assert_eq!(found_docs, 9);
    }

    #[tokio::test]
    async fn test_match_query() {
        let mut schema_builder = SchemaBuilder::default();
        let title = schema_builder.add_text_field("title", true, true);
        let body = schema_builder.add_text_field("body", true, true);
        let schema = schema_builder.build();

        let dir = RamDirectory::new();
        let config = IndexConfig::default();

        let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
            .await
            .unwrap();

        let mut doc1 = Document::new();
        doc1.add_text(title, "rust programming");
        doc1.add_text(body, "Learn rust language");
        writer.add_document(doc1).unwrap();

        let mut doc2 = Document::new();
        doc2.add_text(title, "python programming");
        doc2.add_text(body, "Learn python language");
        writer.add_document(doc2).unwrap();

        writer.commit().await.unwrap();

        let index = Index::open(dir, config).await.unwrap();

        let results = index.query("rust", 10).await.unwrap();
        assert_eq!(results.hits.len(), 1);

        let results = index.query("rust programming", 10).await.unwrap();
        assert!(!results.hits.is_empty());

        let hit = &results.hits[0];
        assert!(!hit.address.segment_id.is_empty(), "Should have segment_id");

        let doc = index.get_document(&hit.address).await.unwrap().unwrap();
        assert!(
            !doc.field_values().is_empty(),
            "Doc should have field values"
        );

        let doc = index.doc(0).await.unwrap().unwrap();
        assert!(
            !doc.field_values().is_empty(),
            "Doc should have field values"
        );
    }

    #[tokio::test]
    async fn test_slice_cache_warmup_and_load() {
        use crate::directories::SliceCachingDirectory;

        let mut schema_builder = SchemaBuilder::default();
        let title = schema_builder.add_text_field("title", true, true);
        let body = schema_builder.add_text_field("body", true, true);
        let schema = schema_builder.build();

        let dir = RamDirectory::new();
        let config = IndexConfig::default();

        let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
            .await
            .unwrap();

        for i in 0..10 {
            let mut doc = Document::new();
            doc.add_text(title, format!("Document {} about rust", i));
            doc.add_text(body, format!("This is body text number {}", i));
            writer.add_document(doc).unwrap();
        }
        writer.commit().await.unwrap();

        let caching_dir = SliceCachingDirectory::new(dir.clone(), 1024 * 1024);
        let index = Index::open(caching_dir, config.clone()).await.unwrap();

        let results = index.query("rust", 10).await.unwrap();
        assert!(!results.hits.is_empty());

        let stats = index.directory.stats();
        assert!(stats.total_bytes > 0, "Cache should have data after search");
    }

    #[tokio::test]
    async fn test_multivalue_field_indexing_and_search() {
        let mut schema_builder = SchemaBuilder::default();
        let uris = schema_builder.add_text_field("uris", true, true);
        let title = schema_builder.add_text_field("title", true, true);
        let schema = schema_builder.build();

        let dir = RamDirectory::new();
        let config = IndexConfig::default();

        let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
            .await
            .unwrap();

        let mut doc = Document::new();
        doc.add_text(uris, "one");
        doc.add_text(uris, "two");
        doc.add_text(title, "Test Document");
        writer.add_document(doc).unwrap();

        let mut doc2 = Document::new();
        doc2.add_text(uris, "three");
        doc2.add_text(title, "Another Document");
        writer.add_document(doc2).unwrap();

        writer.commit().await.unwrap();

        let index = Index::open(dir, config).await.unwrap();
        assert_eq!(index.num_docs().await.unwrap(), 2);

        let doc = index.doc(0).await.unwrap().unwrap();
        let all_uris: Vec<_> = doc.get_all(uris).collect();
        assert_eq!(all_uris.len(), 2, "Should have 2 uris values");
        assert_eq!(all_uris[0].as_text(), Some("one"));
        assert_eq!(all_uris[1].as_text(), Some("two"));

        let json = doc.to_json(index.schema());
        let uris_json = json.get("uris").unwrap();
        assert!(uris_json.is_array(), "Multi-value field should be an array");
        let uris_arr = uris_json.as_array().unwrap();
        assert_eq!(uris_arr.len(), 2);
        assert_eq!(uris_arr[0].as_str(), Some("one"));
        assert_eq!(uris_arr[1].as_str(), Some("two"));

        let results = index.query("uris:one", 10).await.unwrap();
        assert_eq!(results.hits.len(), 1, "Should find doc with 'one'");
        assert_eq!(results.hits[0].address.doc_id, 0);

        let results = index.query("uris:two", 10).await.unwrap();
        assert_eq!(results.hits.len(), 1, "Should find doc with 'two'");
        assert_eq!(results.hits[0].address.doc_id, 0);

        let results = index.query("uris:three", 10).await.unwrap();
        assert_eq!(results.hits.len(), 1, "Should find doc with 'three'");
        assert_eq!(results.hits[0].address.doc_id, 1);

        let results = index.query("uris:nonexistent", 10).await.unwrap();
        assert_eq!(results.hits.len(), 0, "Should not find non-existent value");
    }

    #[tokio::test]
    async fn test_wand_optimization_for_or_queries() {
        use crate::query::{BooleanQuery, TermQuery};

        let mut schema_builder = SchemaBuilder::default();
        let content = schema_builder.add_text_field("content", true, true);
        let schema = schema_builder.build();

        let dir = RamDirectory::new();
        let config = IndexConfig::default();

        let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
            .await
            .unwrap();

        let mut doc = Document::new();
        doc.add_text(content, "rust programming language is fast");
        writer.add_document(doc).unwrap();

        let mut doc = Document::new();
        doc.add_text(content, "rust is a systems language");
        writer.add_document(doc).unwrap();

        let mut doc = Document::new();
        doc.add_text(content, "programming is fun");
        writer.add_document(doc).unwrap();

        let mut doc = Document::new();
        doc.add_text(content, "python is easy to learn");
        writer.add_document(doc).unwrap();

        let mut doc = Document::new();
        doc.add_text(content, "rust rust programming programming systems");
        writer.add_document(doc).unwrap();

        writer.commit().await.unwrap();

        let index = Index::open(dir.clone(), config.clone()).await.unwrap();

        let or_query = BooleanQuery::new()
            .should(TermQuery::text(content, "rust"))
            .should(TermQuery::text(content, "programming"));

        let results = index.search(&or_query, 10).await.unwrap();

        assert_eq!(results.hits.len(), 4, "Should find exactly 4 documents");

        let doc_ids: Vec<u32> = results.hits.iter().map(|h| h.address.doc_id).collect();
        assert!(doc_ids.contains(&0), "Should find doc 0");
        assert!(doc_ids.contains(&1), "Should find doc 1");
        assert!(doc_ids.contains(&2), "Should find doc 2");
        assert!(doc_ids.contains(&4), "Should find doc 4");
        assert!(
            !doc_ids.contains(&3),
            "Should NOT find doc 3 (only has 'python')"
        );

        let single_query = BooleanQuery::new().should(TermQuery::text(content, "rust"));

        let results = index.search(&single_query, 10).await.unwrap();
        assert_eq!(results.hits.len(), 3, "Should find 3 documents with 'rust'");

        let must_query = BooleanQuery::new()
            .must(TermQuery::text(content, "rust"))
            .should(TermQuery::text(content, "programming"));

        let results = index.search(&must_query, 10).await.unwrap();
        assert_eq!(results.hits.len(), 3, "Should find 3 documents with 'rust'");

        let must_not_query = BooleanQuery::new()
            .should(TermQuery::text(content, "rust"))
            .should(TermQuery::text(content, "programming"))
            .must_not(TermQuery::text(content, "systems"));

        let results = index.search(&must_not_query, 10).await.unwrap();
        let doc_ids: Vec<u32> = results.hits.iter().map(|h| h.address.doc_id).collect();
        assert!(
            !doc_ids.contains(&1),
            "Should NOT find doc 1 (has 'systems')"
        );
        assert!(
            !doc_ids.contains(&4),
            "Should NOT find doc 4 (has 'systems')"
        );

        let or_query = BooleanQuery::new()
            .should(TermQuery::text(content, "rust"))
            .should(TermQuery::text(content, "programming"));

        let results = index.search(&or_query, 2).await.unwrap();
        assert_eq!(results.hits.len(), 2, "Should return only top 2 results");
    }

    #[tokio::test]
    async fn test_wand_results_match_standard_boolean() {
        use crate::query::{BooleanQuery, TermQuery, WandOrQuery};

        let mut schema_builder = SchemaBuilder::default();
        let content = schema_builder.add_text_field("content", true, true);
        let schema = schema_builder.build();

        let dir = RamDirectory::new();
        let config = IndexConfig::default();

        let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
            .await
            .unwrap();

        for i in 0..10 {
            let mut doc = Document::new();
            let text = match i % 4 {
                0 => "apple banana cherry",
                1 => "apple orange",
                2 => "banana grape",
                _ => "cherry date",
            };
            doc.add_text(content, text);
            writer.add_document(doc).unwrap();
        }

        writer.commit().await.unwrap();
        let index = Index::open(dir.clone(), config.clone()).await.unwrap();

        let wand_query = WandOrQuery::new(content).term("apple").term("banana");

        let bool_query = BooleanQuery::new()
            .should(TermQuery::text(content, "apple"))
            .should(TermQuery::text(content, "banana"));

        let wand_results = index.search(&wand_query, 10).await.unwrap();
        let bool_results = index.search(&bool_query, 10).await.unwrap();

        assert_eq!(
            wand_results.hits.len(),
            bool_results.hits.len(),
            "WAND and Boolean should find same number of docs"
        );

        let wand_docs: std::collections::HashSet<u32> =
            wand_results.hits.iter().map(|h| h.address.doc_id).collect();
        let bool_docs: std::collections::HashSet<u32> =
            bool_results.hits.iter().map(|h| h.address.doc_id).collect();

        assert_eq!(
            wand_docs, bool_docs,
            "WAND and Boolean should find same documents"
        );
    }

    #[tokio::test]
    async fn test_vector_index_threshold_switch() {
        use crate::dsl::{DenseVectorConfig, VectorIndexType};

        let mut schema_builder = SchemaBuilder::default();
        let title = schema_builder.add_text_field("title", true, true);
        let embedding = schema_builder.add_dense_vector_field_with_config(
            "embedding",
            true,
            true,
            DenseVectorConfig {
                dim: 8,
                index_type: VectorIndexType::IvfRaBitQ,
                store_raw: true,
                num_clusters: Some(4),
                nprobe: 2,
                mrl_dim: None,
                build_threshold: Some(50),
            },
        );
        let schema = schema_builder.build();

        let dir = RamDirectory::new();
        let config = IndexConfig::default();

        let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
            .await
            .unwrap();

        // 30 documents stay below the build threshold of 50.
        for i in 0..30 {
            let mut doc = Document::new();
            doc.add_text(title, format!("Document {}", i));
            let vec: Vec<f32> = (0..8).map(|_| (i as f32) / 30.0).collect();
            doc.add_dense_vector(embedding, vec);
            writer.add_document(doc).unwrap();
        }
        writer.commit().await.unwrap();

        let index = Index::open(dir.clone(), config.clone()).await.unwrap();
        assert!(
            index.trained_centroids.is_empty(),
            "Should not have trained centroids below threshold"
        );

        let query_vec: Vec<f32> = vec![0.5; 8];
        let segments = index.segment_readers().await.unwrap();
        assert!(!segments.is_empty());

        let results = segments[0]
            .search_dense_vector(
                embedding,
                &query_vec,
                5,
                1,
                crate::query::MultiValueCombiner::Max,
            )
            .unwrap();
        assert!(!results.is_empty(), "Flat search should return results");

        let writer = IndexWriter::open(dir.clone(), config.clone())
            .await
            .unwrap();

        // Adding 30 more documents crosses the build threshold.
        for i in 30..60 {
            let mut doc = Document::new();
            doc.add_text(title, format!("Document {}", i));
            let vec: Vec<f32> = (0..8).map(|_| (i as f32) / 60.0).collect();
            doc.add_dense_vector(embedding, vec);
            writer.add_document(doc).unwrap();
        }
        writer.commit().await.unwrap();

        assert!(
            writer.is_vector_index_built(embedding).await,
            "Vector index should be built after crossing threshold"
        );

        let index = Index::open(dir.clone(), config.clone()).await.unwrap();
        assert!(
            index.trained_centroids.contains_key(&embedding.0),
            "Should have loaded trained centroids for embedding field"
        );

        let segments = index.segment_readers().await.unwrap();
        let results = segments[0]
            .search_dense_vector(
                embedding,
                &query_vec,
                5,
                1,
                crate::query::MultiValueCombiner::Max,
            )
            .unwrap();
        assert!(
            !results.is_empty(),
            "Search should return results after build"
        );

        // An explicit build after the automatic one should still report the index as built.
        let writer = IndexWriter::open(dir.clone(), config.clone())
            .await
            .unwrap();
        writer.build_vector_index().await.unwrap();
        assert!(writer.is_vector_index_built(embedding).await);
    }
}