1#[cfg(feature = "native")]
11use std::sync::Arc;
12
13#[cfg(feature = "native")]
14use rustc_hash::FxHashMap;
15
16#[cfg(feature = "native")]
17use crate::dsl::Schema;
18#[cfg(feature = "native")]
19use crate::error::Result;
20#[cfg(feature = "native")]
21use crate::structures::{CoarseCentroids, PQCodebook};
22
23#[cfg(feature = "native")]
24mod reader;
25#[cfg(feature = "native")]
26mod vector_builder;
27#[cfg(feature = "native")]
28mod writer;
29#[cfg(feature = "native")]
30pub use reader::{IndexReader, Searcher};
31#[cfg(feature = "native")]
32pub use writer::IndexWriter;
33
34mod metadata;
35pub use metadata::{FieldVectorMeta, INDEX_META_FILENAME, IndexMetadata, VectorIndexState};
36
37#[cfg(feature = "native")]
38mod helpers;
39#[cfg(feature = "native")]
40pub use helpers::{
41 IndexingStats, SchemaConfig, SchemaFieldConfig, create_index_at_path, create_index_from_sdl,
42 index_documents_from_reader, index_json_document, parse_schema,
43};
44
/// File name `index.slicecache`.
///
/// NOTE(review): presumably the name of the file in which a slice cache is persisted
/// inside an index directory; nothing in this part of the file reads or writes it,
/// so confirm against the callers.
pub const SLICE_CACHE_FILENAME: &str = "index.slicecache";
47
/// Tunable settings for an index.
///
/// The `Default` implementation in this file fills every field; the per-field notes
/// below quote those defaults. Field meanings that are not visible in this file are
/// marked as assumptions.
#[derive(Debug, Clone)]
pub struct IndexConfig {
    /// Defaults to the detected CPU count with the `native` feature, otherwise 1.
    /// NOTE(review): presumably the general worker thread count — not read in this file.
    pub num_threads: usize,
    /// Defaults to 1.
    /// NOTE(review): presumably the number of indexing threads — confirm in the writer.
    pub num_indexing_threads: usize,
    /// Defaults to the same CPU-derived value as `num_threads`.
    /// NOTE(review): presumably the number of compression threads — confirm in the writer.
    pub num_compression_threads: usize,
    /// Passed to the segment manager and to readers created by `Index`. Defaults to 256.
    /// NOTE(review): presumably the number of cached term-dictionary blocks.
    pub term_cache_blocks: usize,
    /// Defaults to 32.
    /// NOTE(review): presumably the number of cached document-store blocks — not read here.
    pub store_cache_blocks: usize,
    /// Defaults to 2 GiB (`2 * 1024 * 1024 * 1024`).
    /// NOTE(review): presumably the indexing memory budget in bytes — confirm in the writer.
    pub max_indexing_memory_bytes: usize,
    /// Merge policy; duplicated with `clone_box()` when a segment manager is built.
    /// Defaults to `TieredMergePolicy::default()`.
    pub merge_policy: Box<dyn crate::merge::MergePolicy>,
    /// Defaults to `IndexOptimization::default()`; not read in this file.
    pub optimization: crate::structures::IndexOptimization,
}
68
69impl Default for IndexConfig {
70 fn default() -> Self {
71 #[cfg(feature = "native")]
72 let cpus = num_cpus::get().max(1);
73 #[cfg(not(feature = "native"))]
74 let cpus = 1;
75
76 Self {
77 num_threads: cpus,
78 num_indexing_threads: 1,
79 num_compression_threads: cpus,
80 term_cache_blocks: 256,
81 store_cache_blocks: 32,
82 max_indexing_memory_bytes: 2 * 1024 * 1024 * 1024, merge_policy: Box::new(crate::merge::TieredMergePolicy::default()),
84 optimization: crate::structures::IndexOptimization::default(),
85 }
86 }
87}
88
/// Handle to an index stored in a directory of type `D`.
///
/// Bundles the shared directory, schema, configuration and segment manager together
/// with the trained vector structures that were loaded when the index was opened.
/// Only compiled with the `native` feature.
#[cfg(feature = "native")]
pub struct Index<D: crate::directories::DirectoryWriter + 'static> {
    /// Directory backing the index, shared via `Arc`.
    directory: Arc<D>,
    /// Schema of the index, shared with the segment manager and readers.
    schema: Arc<Schema>,
    /// Configuration supplied by the caller of `create` / `open`.
    config: IndexConfig,
    /// Segment manager built from the directory, schema, metadata and merge policy.
    segment_manager: Arc<crate::merge::SegmentManager<D>>,
    /// Trained coarse centroids keyed by `u32`; empty for a freshly created index.
    /// NOTE(review): the key looks like a field id (tests look it up with `field.0`).
    trained_centroids: FxHashMap<u32, Arc<CoarseCentroids>>,
    /// Trained PQ codebooks keyed by `u32`; empty for a freshly created index.
    /// NOTE(review): presumably keyed the same way as `trained_centroids` — confirm.
    trained_codebooks: FxHashMap<u32, Arc<PQCodebook>>,
}
109
110#[cfg(feature = "native")]
111impl<D: crate::directories::DirectoryWriter + 'static> Index<D> {
112 pub async fn create(directory: D, schema: Schema, config: IndexConfig) -> Result<Self> {
114 let directory = Arc::new(directory);
115 let schema = Arc::new(schema);
116 let metadata = IndexMetadata::new((*schema).clone());
117
118 let segment_manager = Arc::new(crate::merge::SegmentManager::new(
119 Arc::clone(&directory),
120 Arc::clone(&schema),
121 metadata,
122 config.merge_policy.clone_box(),
123 config.term_cache_blocks,
124 ));
125
126 segment_manager.update_metadata(|_| {}).await?;
128
129 Ok(Self {
130 directory,
131 schema,
132 config,
133 segment_manager,
134 trained_centroids: FxHashMap::default(),
135 trained_codebooks: FxHashMap::default(),
136 })
137 }
138
139 pub async fn open(directory: D, config: IndexConfig) -> Result<Self> {
141 let directory = Arc::new(directory);
142
143 let metadata = IndexMetadata::load(directory.as_ref()).await?;
145 let schema = Arc::new(metadata.schema.clone());
146
147 let (trained_centroids, trained_codebooks) =
149 metadata.load_trained_structures(directory.as_ref()).await;
150
151 let segment_manager = Arc::new(crate::merge::SegmentManager::new(
152 Arc::clone(&directory),
153 Arc::clone(&schema),
154 metadata,
155 config.merge_policy.clone_box(),
156 config.term_cache_blocks,
157 ));
158
159 Ok(Self {
160 directory,
161 schema,
162 config,
163 segment_manager,
164 trained_centroids,
165 trained_codebooks,
166 })
167 }
168
169 pub fn schema(&self) -> &Schema {
171 &self.schema
172 }
173
174 pub fn directory(&self) -> &D {
176 &self.directory
177 }
178
179 pub fn segment_manager(&self) -> &Arc<crate::merge::SegmentManager<D>> {
181 &self.segment_manager
182 }
183
184 pub fn writer(&self) -> writer::IndexWriter<D> {
186 writer::IndexWriter::from_index(self)
187 }
188
189 pub async fn reader(&self) -> Result<IndexReader<D>> {
191 IndexReader::from_segment_manager(
192 Arc::clone(&self.schema),
193 Arc::clone(&self.segment_manager),
194 self.trained_centroids.clone(),
195 self.trained_codebooks.clone(),
196 self.config.term_cache_blocks,
197 )
198 .await
199 }
200
201 pub fn config(&self) -> &IndexConfig {
203 &self.config
204 }
205
206 pub fn trained_centroids(&self) -> &FxHashMap<u32, Arc<CoarseCentroids>> {
208 &self.trained_centroids
209 }
210
211 pub fn trained_codebooks(&self) -> &FxHashMap<u32, Arc<PQCodebook>> {
213 &self.trained_codebooks
214 }
215
216 pub async fn segment_readers(&self) -> Result<Vec<Arc<crate::segment::SegmentReader>>> {
220 let reader = self.reader().await?;
221 let searcher = reader.searcher().await?;
222 Ok(searcher.segment_readers().to_vec())
223 }
224
225 pub async fn num_docs(&self) -> Result<u32> {
227 let reader = self.reader().await?;
228 let searcher = reader.searcher().await?;
229 Ok(searcher.num_docs())
230 }
231
232 pub async fn doc(&self, doc_id: crate::DocId) -> Result<Option<crate::dsl::Document>> {
234 let reader = self.reader().await?;
235 let searcher = reader.searcher().await?;
236 searcher.doc(doc_id).await
237 }
238
239 pub fn default_fields(&self) -> Vec<crate::Field> {
241 if !self.schema.default_fields().is_empty() {
242 self.schema.default_fields().to_vec()
243 } else {
244 self.schema
245 .fields()
246 .filter(|(_, entry)| {
247 entry.indexed && entry.field_type == crate::dsl::FieldType::Text
248 })
249 .map(|(field, _)| field)
250 .collect()
251 }
252 }
253
254 pub fn tokenizers(&self) -> Arc<crate::tokenizer::TokenizerRegistry> {
256 Arc::new(crate::tokenizer::TokenizerRegistry::default())
257 }
258
259 pub fn query_parser(&self) -> crate::dsl::QueryLanguageParser {
261 let default_fields = self.default_fields();
262 let tokenizers = self.tokenizers();
263
264 let query_routers = self.schema.query_routers();
265 if !query_routers.is_empty()
266 && let Ok(router) = crate::dsl::QueryFieldRouter::from_rules(query_routers)
267 {
268 return crate::dsl::QueryLanguageParser::with_router(
269 Arc::clone(&self.schema),
270 default_fields,
271 tokenizers,
272 router,
273 );
274 }
275
276 crate::dsl::QueryLanguageParser::new(Arc::clone(&self.schema), default_fields, tokenizers)
277 }
278
279 pub async fn query(
281 &self,
282 query_str: &str,
283 limit: usize,
284 ) -> Result<crate::query::SearchResponse> {
285 self.query_offset(query_str, limit, 0).await
286 }
287
288 pub async fn query_offset(
290 &self,
291 query_str: &str,
292 limit: usize,
293 offset: usize,
294 ) -> Result<crate::query::SearchResponse> {
295 let parser = self.query_parser();
296 let query = parser
297 .parse(query_str)
298 .map_err(crate::error::Error::Query)?;
299 self.search_offset(query.as_ref(), limit, offset).await
300 }
301
302 pub async fn search(
304 &self,
305 query: &dyn crate::query::Query,
306 limit: usize,
307 ) -> Result<crate::query::SearchResponse> {
308 self.search_offset(query, limit, 0).await
309 }
310
311 pub async fn search_offset(
313 &self,
314 query: &dyn crate::query::Query,
315 limit: usize,
316 offset: usize,
317 ) -> Result<crate::query::SearchResponse> {
318 let reader = self.reader().await?;
319 let searcher = reader.searcher().await?;
320 let segments = searcher.segment_readers();
321
322 let mut all_results: Vec<(u128, crate::query::SearchResult)> = Vec::new();
323 let fetch_limit = offset + limit;
324
325 for segment in segments {
326 let segment_id = segment.meta().id;
327 let results =
328 crate::query::search_segment(segment.as_ref(), query, fetch_limit).await?;
329 for result in results {
330 all_results.push((segment_id, result));
331 }
332 }
333
334 all_results.sort_by(|a, b| {
335 b.1.score
336 .partial_cmp(&a.1.score)
337 .unwrap_or(std::cmp::Ordering::Equal)
338 });
339
340 let total_hits = all_results.len() as u32;
341
342 let hits: Vec<crate::query::SearchHit> = all_results
343 .into_iter()
344 .skip(offset)
345 .take(limit)
346 .map(|(segment_id, result)| crate::query::SearchHit {
347 address: crate::query::DocAddress::new(segment_id, result.doc_id),
348 score: result.score,
349 matched_fields: result.extract_ordinals(),
350 })
351 .collect();
352
353 Ok(crate::query::SearchResponse { hits, total_hits })
354 }
355
356 pub async fn get_document(
358 &self,
359 address: &crate::query::DocAddress,
360 ) -> Result<Option<crate::dsl::Document>> {
361 let segment_id = address.segment_id_u128().ok_or_else(|| {
362 crate::error::Error::Query(format!("Invalid segment ID: {}", address.segment_id))
363 })?;
364
365 let reader = self.reader().await?;
366 let searcher = reader.searcher().await?;
367
368 for segment in searcher.segment_readers() {
369 if segment.meta().id == segment_id {
370 return segment.doc(address.doc_id).await;
371 }
372 }
373
374 Ok(None)
375 }
376
 /// Does nothing and always returns `Ok(())`.
 ///
 /// NOTE(review): presumably kept for API compatibility. The other accessors in this
 /// `impl` build a fresh reader on every call, so there is no cached state to refresh.
 pub async fn reload(&self) -> Result<()> {
     Ok(())
 }
382
383 pub async fn get_postings(
385 &self,
386 field: crate::Field,
387 term: &[u8],
388 ) -> Result<
389 Vec<(
390 Arc<crate::segment::SegmentReader>,
391 crate::structures::BlockPostingList,
392 )>,
393 > {
394 let segments = self.segment_readers().await?;
395 let mut results = Vec::new();
396
397 for segment in segments {
398 if let Some(postings) = segment.get_postings(field, term).await? {
399 results.push((segment, postings));
400 }
401 }
402
403 Ok(results)
404 }
405}
406
407#[cfg(test)]
410mod tests {
411 use super::*;
412 use crate::directories::RamDirectory;
413 use crate::dsl::{Document, SchemaBuilder};
414
415 #[tokio::test]
416 async fn test_index_create_and_search() {
417 let mut schema_builder = SchemaBuilder::default();
418 let title = schema_builder.add_text_field("title", true, true);
419 let body = schema_builder.add_text_field("body", true, true);
420 let schema = schema_builder.build();
421
422 let dir = RamDirectory::new();
423 let config = IndexConfig::default();
424
425 let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
427 .await
428 .unwrap();
429
430 let mut doc1 = Document::new();
431 doc1.add_text(title, "Hello World");
432 doc1.add_text(body, "This is the first document");
433 writer.add_document(doc1).unwrap();
434
435 let mut doc2 = Document::new();
436 doc2.add_text(title, "Goodbye World");
437 doc2.add_text(body, "This is the second document");
438 writer.add_document(doc2).unwrap();
439
440 writer.commit().await.unwrap();
441
442 let index = Index::open(dir, config).await.unwrap();
444 assert_eq!(index.num_docs().await.unwrap(), 2);
445
446 let postings = index.get_postings(title, b"world").await.unwrap();
448 assert_eq!(postings.len(), 1); assert_eq!(postings[0].1.doc_count(), 2); let doc = index.doc(0).await.unwrap().unwrap();
453 assert_eq!(doc.get_first(title).unwrap().as_text(), Some("Hello World"));
454 }
455
456 #[tokio::test]
457 async fn test_multiple_segments() {
458 let mut schema_builder = SchemaBuilder::default();
459 let title = schema_builder.add_text_field("title", true, true);
460 let schema = schema_builder.build();
461
462 let dir = RamDirectory::new();
463 let config = IndexConfig {
464 max_indexing_memory_bytes: 1024, ..Default::default()
466 };
467
468 let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
469 .await
470 .unwrap();
471
472 for batch in 0..3 {
474 for i in 0..5 {
475 let mut doc = Document::new();
476 doc.add_text(title, format!("Document {} batch {}", i, batch));
477 writer.add_document(doc).unwrap();
478 }
479 writer.commit().await.unwrap();
480 }
481
482 let index = Index::open(dir, config).await.unwrap();
484 assert_eq!(index.num_docs().await.unwrap(), 15);
485 assert!(
487 index.segment_readers().await.unwrap().len() >= 2,
488 "Expected multiple segments"
489 );
490 }
491
492 #[tokio::test]
493 async fn test_segment_merge() {
494 let mut schema_builder = SchemaBuilder::default();
495 let title = schema_builder.add_text_field("title", true, true);
496 let schema = schema_builder.build();
497
498 let dir = RamDirectory::new();
499 let config = IndexConfig {
500 max_indexing_memory_bytes: 512, ..Default::default()
502 };
503
504 let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
505 .await
506 .unwrap();
507
508 for batch in 0..3 {
510 for i in 0..3 {
511 let mut doc = Document::new();
512 doc.add_text(title, format!("Document {} batch {}", i, batch));
513 writer.add_document(doc).unwrap();
514 }
515 writer.flush().await.unwrap();
516 }
517 writer.commit().await.unwrap();
518
519 let index = Index::open(dir.clone(), config.clone()).await.unwrap();
521 assert!(
522 index.segment_readers().await.unwrap().len() >= 2,
523 "Expected multiple segments"
524 );
525
526 let writer = IndexWriter::open(dir.clone(), config.clone())
528 .await
529 .unwrap();
530 writer.force_merge().await.unwrap();
531
532 let index = Index::open(dir, config).await.unwrap();
534 assert_eq!(index.segment_readers().await.unwrap().len(), 1);
535 assert_eq!(index.num_docs().await.unwrap(), 9);
536
537 let mut found_docs = 0;
539 for i in 0..9 {
540 if index.doc(i).await.unwrap().is_some() {
541 found_docs += 1;
542 }
543 }
544 assert_eq!(found_docs, 9);
545 }
546
547 #[tokio::test]
548 async fn test_match_query() {
549 let mut schema_builder = SchemaBuilder::default();
550 let title = schema_builder.add_text_field("title", true, true);
551 let body = schema_builder.add_text_field("body", true, true);
552 let schema = schema_builder.build();
553
554 let dir = RamDirectory::new();
555 let config = IndexConfig::default();
556
557 let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
558 .await
559 .unwrap();
560
561 let mut doc1 = Document::new();
562 doc1.add_text(title, "rust programming");
563 doc1.add_text(body, "Learn rust language");
564 writer.add_document(doc1).unwrap();
565
566 let mut doc2 = Document::new();
567 doc2.add_text(title, "python programming");
568 doc2.add_text(body, "Learn python language");
569 writer.add_document(doc2).unwrap();
570
571 writer.commit().await.unwrap();
572
573 let index = Index::open(dir, config).await.unwrap();
574
575 let results = index.query("rust", 10).await.unwrap();
577 assert_eq!(results.hits.len(), 1);
578
579 let results = index.query("rust programming", 10).await.unwrap();
581 assert!(!results.hits.is_empty());
582
583 let hit = &results.hits[0];
585 assert!(!hit.address.segment_id.is_empty(), "Should have segment_id");
586
587 let doc = index.get_document(&hit.address).await.unwrap().unwrap();
589 assert!(
590 !doc.field_values().is_empty(),
591 "Doc should have field values"
592 );
593
594 let doc = index.doc(0).await.unwrap().unwrap();
596 assert!(
597 !doc.field_values().is_empty(),
598 "Doc should have field values"
599 );
600 }
601
602 #[tokio::test]
603 async fn test_slice_cache_warmup_and_load() {
604 use crate::directories::SliceCachingDirectory;
605
606 let mut schema_builder = SchemaBuilder::default();
607 let title = schema_builder.add_text_field("title", true, true);
608 let body = schema_builder.add_text_field("body", true, true);
609 let schema = schema_builder.build();
610
611 let dir = RamDirectory::new();
612 let config = IndexConfig::default();
613
614 let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
616 .await
617 .unwrap();
618
619 for i in 0..10 {
620 let mut doc = Document::new();
621 doc.add_text(title, format!("Document {} about rust", i));
622 doc.add_text(body, format!("This is body text number {}", i));
623 writer.add_document(doc).unwrap();
624 }
625 writer.commit().await.unwrap();
626
627 let caching_dir = SliceCachingDirectory::new(dir.clone(), 1024 * 1024);
629 let index = Index::open(caching_dir, config.clone()).await.unwrap();
630
631 let results = index.query("rust", 10).await.unwrap();
633 assert!(!results.hits.is_empty());
634
635 let stats = index.directory.stats();
637 assert!(stats.total_bytes > 0, "Cache should have data after search");
638 }
639
640 #[tokio::test]
641 async fn test_multivalue_field_indexing_and_search() {
642 let mut schema_builder = SchemaBuilder::default();
643 let uris = schema_builder.add_text_field("uris", true, true);
644 let title = schema_builder.add_text_field("title", true, true);
645 let schema = schema_builder.build();
646
647 let dir = RamDirectory::new();
648 let config = IndexConfig::default();
649
650 let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
652 .await
653 .unwrap();
654
655 let mut doc = Document::new();
656 doc.add_text(uris, "one");
657 doc.add_text(uris, "two");
658 doc.add_text(title, "Test Document");
659 writer.add_document(doc).unwrap();
660
661 let mut doc2 = Document::new();
663 doc2.add_text(uris, "three");
664 doc2.add_text(title, "Another Document");
665 writer.add_document(doc2).unwrap();
666
667 writer.commit().await.unwrap();
668
669 let index = Index::open(dir, config).await.unwrap();
671 assert_eq!(index.num_docs().await.unwrap(), 2);
672
673 let doc = index.doc(0).await.unwrap().unwrap();
675 let all_uris: Vec<_> = doc.get_all(uris).collect();
676 assert_eq!(all_uris.len(), 2, "Should have 2 uris values");
677 assert_eq!(all_uris[0].as_text(), Some("one"));
678 assert_eq!(all_uris[1].as_text(), Some("two"));
679
680 let json = doc.to_json(index.schema());
682 let uris_json = json.get("uris").unwrap();
683 assert!(uris_json.is_array(), "Multi-value field should be an array");
684 let uris_arr = uris_json.as_array().unwrap();
685 assert_eq!(uris_arr.len(), 2);
686 assert_eq!(uris_arr[0].as_str(), Some("one"));
687 assert_eq!(uris_arr[1].as_str(), Some("two"));
688
689 let results = index.query("uris:one", 10).await.unwrap();
691 assert_eq!(results.hits.len(), 1, "Should find doc with 'one'");
692 assert_eq!(results.hits[0].address.doc_id, 0);
693
694 let results = index.query("uris:two", 10).await.unwrap();
695 assert_eq!(results.hits.len(), 1, "Should find doc with 'two'");
696 assert_eq!(results.hits[0].address.doc_id, 0);
697
698 let results = index.query("uris:three", 10).await.unwrap();
699 assert_eq!(results.hits.len(), 1, "Should find doc with 'three'");
700 assert_eq!(results.hits[0].address.doc_id, 1);
701
702 let results = index.query("uris:nonexistent", 10).await.unwrap();
704 assert_eq!(results.hits.len(), 0, "Should not find non-existent value");
705 }
706
707 #[tokio::test]
714 async fn test_wand_optimization_for_or_queries() {
715 use crate::query::{BooleanQuery, TermQuery};
716
717 let mut schema_builder = SchemaBuilder::default();
718 let content = schema_builder.add_text_field("content", true, true);
719 let schema = schema_builder.build();
720
721 let dir = RamDirectory::new();
722 let config = IndexConfig::default();
723
724 let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
726 .await
727 .unwrap();
728
729 let mut doc = Document::new();
731 doc.add_text(content, "rust programming language is fast");
732 writer.add_document(doc).unwrap();
733
734 let mut doc = Document::new();
736 doc.add_text(content, "rust is a systems language");
737 writer.add_document(doc).unwrap();
738
739 let mut doc = Document::new();
741 doc.add_text(content, "programming is fun");
742 writer.add_document(doc).unwrap();
743
744 let mut doc = Document::new();
746 doc.add_text(content, "python is easy to learn");
747 writer.add_document(doc).unwrap();
748
749 let mut doc = Document::new();
751 doc.add_text(content, "rust rust programming programming systems");
752 writer.add_document(doc).unwrap();
753
754 writer.commit().await.unwrap();
755
756 let index = Index::open(dir.clone(), config.clone()).await.unwrap();
758
759 let or_query = BooleanQuery::new()
761 .should(TermQuery::text(content, "rust"))
762 .should(TermQuery::text(content, "programming"));
763
764 let results = index.search(&or_query, 10).await.unwrap();
765
766 assert_eq!(results.hits.len(), 4, "Should find exactly 4 documents");
768
769 let doc_ids: Vec<u32> = results.hits.iter().map(|h| h.address.doc_id).collect();
770 assert!(doc_ids.contains(&0), "Should find doc 0");
771 assert!(doc_ids.contains(&1), "Should find doc 1");
772 assert!(doc_ids.contains(&2), "Should find doc 2");
773 assert!(doc_ids.contains(&4), "Should find doc 4");
774 assert!(
775 !doc_ids.contains(&3),
776 "Should NOT find doc 3 (only has 'python')"
777 );
778
779 let single_query = BooleanQuery::new().should(TermQuery::text(content, "rust"));
781
782 let results = index.search(&single_query, 10).await.unwrap();
783 assert_eq!(results.hits.len(), 3, "Should find 3 documents with 'rust'");
784
785 let must_query = BooleanQuery::new()
787 .must(TermQuery::text(content, "rust"))
788 .should(TermQuery::text(content, "programming"));
789
790 let results = index.search(&must_query, 10).await.unwrap();
791 assert_eq!(results.hits.len(), 3, "Should find 3 documents with 'rust'");
793
794 let must_not_query = BooleanQuery::new()
796 .should(TermQuery::text(content, "rust"))
797 .should(TermQuery::text(content, "programming"))
798 .must_not(TermQuery::text(content, "systems"));
799
800 let results = index.search(&must_not_query, 10).await.unwrap();
801 let doc_ids: Vec<u32> = results.hits.iter().map(|h| h.address.doc_id).collect();
803 assert!(
804 !doc_ids.contains(&1),
805 "Should NOT find doc 1 (has 'systems')"
806 );
807 assert!(
808 !doc_ids.contains(&4),
809 "Should NOT find doc 4 (has 'systems')"
810 );
811
812 let or_query = BooleanQuery::new()
814 .should(TermQuery::text(content, "rust"))
815 .should(TermQuery::text(content, "programming"));
816
817 let results = index.search(&or_query, 2).await.unwrap();
818 assert_eq!(results.hits.len(), 2, "Should return only top 2 results");
819
820 }
823
824 #[tokio::test]
826 async fn test_wand_results_match_standard_boolean() {
827 use crate::query::{BooleanQuery, TermQuery, WandOrQuery};
828
829 let mut schema_builder = SchemaBuilder::default();
830 let content = schema_builder.add_text_field("content", true, true);
831 let schema = schema_builder.build();
832
833 let dir = RamDirectory::new();
834 let config = IndexConfig::default();
835
836 let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
837 .await
838 .unwrap();
839
840 for i in 0..10 {
842 let mut doc = Document::new();
843 let text = match i % 4 {
844 0 => "apple banana cherry",
845 1 => "apple orange",
846 2 => "banana grape",
847 _ => "cherry date",
848 };
849 doc.add_text(content, text);
850 writer.add_document(doc).unwrap();
851 }
852
853 writer.commit().await.unwrap();
854 let index = Index::open(dir.clone(), config.clone()).await.unwrap();
855
856 let wand_query = WandOrQuery::new(content).term("apple").term("banana");
858
859 let bool_query = BooleanQuery::new()
860 .should(TermQuery::text(content, "apple"))
861 .should(TermQuery::text(content, "banana"));
862
863 let wand_results = index.search(&wand_query, 10).await.unwrap();
864 let bool_results = index.search(&bool_query, 10).await.unwrap();
865
866 assert_eq!(
868 wand_results.hits.len(),
869 bool_results.hits.len(),
870 "WAND and Boolean should find same number of docs"
871 );
872
873 let wand_docs: std::collections::HashSet<u32> =
874 wand_results.hits.iter().map(|h| h.address.doc_id).collect();
875 let bool_docs: std::collections::HashSet<u32> =
876 bool_results.hits.iter().map(|h| h.address.doc_id).collect();
877
878 assert_eq!(
879 wand_docs, bool_docs,
880 "WAND and Boolean should find same documents"
881 );
882 }
883
884 #[tokio::test]
885 async fn test_vector_index_threshold_switch() {
886 use crate::dsl::{DenseVectorConfig, VectorIndexType};
887
888 let mut schema_builder = SchemaBuilder::default();
890 let title = schema_builder.add_text_field("title", true, true);
891 let embedding = schema_builder.add_dense_vector_field_with_config(
892 "embedding",
893 true, true, DenseVectorConfig {
896 dim: 8,
897 index_type: VectorIndexType::IvfRaBitQ,
898 store_raw: true,
899 num_clusters: Some(4), nprobe: 2,
901 mrl_dim: None,
902 build_threshold: Some(50), },
904 );
905 let schema = schema_builder.build();
906
907 let dir = RamDirectory::new();
908 let config = IndexConfig::default();
909
910 let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
912 .await
913 .unwrap();
914
915 for i in 0..30 {
917 let mut doc = Document::new();
918 doc.add_text(title, format!("Document {}", i));
919 let vec: Vec<f32> = (0..8).map(|_| (i as f32) / 30.0).collect();
921 doc.add_dense_vector(embedding, vec);
922 writer.add_document(doc).unwrap();
923 }
924 writer.commit().await.unwrap();
925
926 let index = Index::open(dir.clone(), config.clone()).await.unwrap();
928 assert!(
929 index.trained_centroids.is_empty(),
930 "Should not have trained centroids below threshold"
931 );
932
933 let query_vec: Vec<f32> = vec![0.5; 8];
935 let segments = index.segment_readers().await.unwrap();
936 assert!(!segments.is_empty());
937
938 let results = segments[0]
939 .search_dense_vector(
940 embedding,
941 &query_vec,
942 5,
943 1,
944 crate::query::MultiValueCombiner::Max,
945 )
946 .unwrap();
947 assert!(!results.is_empty(), "Flat search should return results");
948
949 let writer = IndexWriter::open(dir.clone(), config.clone())
951 .await
952 .unwrap();
953
954 for i in 30..60 {
956 let mut doc = Document::new();
957 doc.add_text(title, format!("Document {}", i));
958 let vec: Vec<f32> = (0..8).map(|_| (i as f32) / 60.0).collect();
959 doc.add_dense_vector(embedding, vec);
960 writer.add_document(doc).unwrap();
961 }
962 writer.commit().await.unwrap();
964
965 assert!(
967 writer.is_vector_index_built(embedding).await,
968 "Vector index should be built after crossing threshold"
969 );
970
971 let index = Index::open(dir.clone(), config.clone()).await.unwrap();
973 assert!(
974 index.trained_centroids.contains_key(&embedding.0),
975 "Should have loaded trained centroids for embedding field"
976 );
977
978 let segments = index.segment_readers().await.unwrap();
980 let results = segments[0]
981 .search_dense_vector(
982 embedding,
983 &query_vec,
984 5,
985 1,
986 crate::query::MultiValueCombiner::Max,
987 )
988 .unwrap();
989 assert!(
990 !results.is_empty(),
991 "Search should return results after build"
992 );
993
994 let writer = IndexWriter::open(dir.clone(), config.clone())
996 .await
997 .unwrap();
998 writer.build_vector_index().await.unwrap(); assert!(writer.is_vector_index_built(embedding).await);
1002 }
1003}