#[cfg(feature = "native")]
use crate::dsl::Schema;
#[cfg(feature = "native")]
use crate::error::Result;
#[cfg(feature = "native")]
use crate::structures::{CoarseCentroids, PQCodebook};
#[cfg(feature = "native")]
use rustc_hash::FxHashMap;
#[cfg(feature = "native")]
use std::sync::Arc;

mod searcher;
pub use searcher::Searcher;

#[cfg(feature = "native")]
mod reader;
#[cfg(feature = "native")]
mod vector_builder;
#[cfg(feature = "native")]
mod writer;
#[cfg(feature = "native")]
pub use reader::IndexReader;
#[cfg(feature = "native")]
pub use writer::IndexWriter;

mod metadata;
pub use metadata::{FieldVectorMeta, INDEX_META_FILENAME, IndexMetadata, VectorIndexState};

#[cfg(feature = "native")]
mod helpers;
#[cfg(feature = "native")]
pub use helpers::{
    IndexingStats, SchemaConfig, SchemaFieldConfig, create_index_at_path, create_index_from_sdl,
    index_documents_from_reader, index_json_document, parse_schema,
};

pub const SLICE_CACHE_FILENAME: &str = "index.slicecache";

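/// Configuration used when creating or opening an [`Index`].
///
/// A sketch of overriding a couple of fields while keeping the rest at their
/// defaults (the same struct-update pattern the tests below use):
///
/// ```ignore
/// let config = IndexConfig {
///     num_indexing_threads: 4,
///     max_indexing_memory_bytes: 512 * 1024 * 1024,
///     ..Default::default()
/// };
/// ```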
#[derive(Debug, Clone)]
pub struct IndexConfig {
    /// Total number of worker threads available to index operations.
    pub num_threads: usize,
    /// Number of threads dedicated to indexing documents.
    pub num_indexing_threads: usize,
    /// Number of threads dedicated to compressing segment data.
    pub num_compression_threads: usize,
    /// Number of cache blocks for the term dictionary.
    pub term_cache_blocks: usize,
    /// Number of cache blocks for the document store.
    pub store_cache_blocks: usize,
    /// Maximum memory the indexer may buffer before flushing a segment.
    pub max_indexing_memory_bytes: usize,
    /// Policy deciding when and how segments are merged.
    pub merge_policy: Box<dyn crate::merge::MergePolicy>,
    /// Optimization profile for on-disk index structures.
    pub optimization: crate::structures::IndexOptimization,
}

impl Default for IndexConfig {
    fn default() -> Self {
        #[cfg(feature = "native")]
        let cpus = num_cpus::get().max(1);
        #[cfg(not(feature = "native"))]
        let cpus = 1;

        Self {
            num_threads: cpus,
            num_indexing_threads: 1,
            num_compression_threads: cpus,
            term_cache_blocks: 256,
            store_cache_blocks: 32,
            max_indexing_memory_bytes: 2 * 1024 * 1024 * 1024,
            merge_policy: Box::new(crate::merge::TieredMergePolicy::default()),
            optimization: crate::structures::IndexOptimization::default(),
        }
    }
}

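/// A search index stored in a directory.
///
/// An `Index` ties together the schema, the [`IndexConfig`], the segment
/// manager, and any trained vector-quantization structures loaded from the
/// index metadata.
///
/// A minimal end-to-end sketch (not compiled as a doctest; it assumes a
/// `RamDirectory`, a schema built with `SchemaBuilder`, and a populated
/// `Document`, as in the tests at the bottom of this file):
///
/// ```ignore
/// let index = Index::create(RamDirectory::new(), schema, IndexConfig::default()).await?;
///
/// let writer = index.writer();
/// writer.add_document(doc)?;
/// writer.commit().await?;
///
/// let response = index.query("title:hello", 10).await?;
/// println!("{} hits", response.total_hits);
/// ```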
#[cfg(feature = "native")]
pub struct Index<D: crate::directories::DirectoryWriter + 'static> {
    directory: Arc<D>,
    schema: Arc<Schema>,
    config: IndexConfig,
    segment_manager: Arc<crate::merge::SegmentManager<D>>,
    trained_centroids: FxHashMap<u32, Arc<CoarseCentroids>>,
    trained_codebooks: FxHashMap<u32, Arc<PQCodebook>>,
}

#[cfg(feature = "native")]
impl<D: crate::directories::DirectoryWriter + 'static> Index<D> {
    pub async fn create(directory: D, schema: Schema, config: IndexConfig) -> Result<Self> {
        let directory = Arc::new(directory);
        let schema = Arc::new(schema);
        let metadata = IndexMetadata::new((*schema).clone());

        let segment_manager = Arc::new(crate::merge::SegmentManager::new(
            Arc::clone(&directory),
            Arc::clone(&schema),
            metadata,
            config.merge_policy.clone_box(),
            config.term_cache_blocks,
        ));

        segment_manager.update_metadata(|_| {}).await?;

        Ok(Self {
            directory,
            schema,
            config,
            segment_manager,
            trained_centroids: FxHashMap::default(),
            trained_codebooks: FxHashMap::default(),
        })
    }

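    /// Opens an existing index, loading its metadata, schema, and any trained
    /// vector structures from `directory`.
    ///
    /// A small sketch (not compiled as a doctest), mirroring the tests below:
    ///
    /// ```ignore
    /// let index = Index::open(dir, IndexConfig::default()).await?;
    /// let num_docs = index.num_docs().await?;
    /// ```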
    pub async fn open(directory: D, config: IndexConfig) -> Result<Self> {
        let directory = Arc::new(directory);

        let metadata = IndexMetadata::load(directory.as_ref()).await?;
        let schema = Arc::new(metadata.schema.clone());

        let (trained_centroids, trained_codebooks) =
            metadata.load_trained_structures(directory.as_ref()).await;

        let segment_manager = Arc::new(crate::merge::SegmentManager::new(
            Arc::clone(&directory),
            Arc::clone(&schema),
            metadata,
            config.merge_policy.clone_box(),
            config.term_cache_blocks,
        ));

        Ok(Self {
            directory,
            schema,
            config,
            segment_manager,
            trained_centroids,
            trained_codebooks,
        })
    }

    pub fn schema(&self) -> &Schema {
        &self.schema
    }

    pub fn directory(&self) -> &D {
        &self.directory
    }

    pub fn segment_manager(&self) -> &Arc<crate::merge::SegmentManager<D>> {
        &self.segment_manager
    }

    pub async fn reader(&self) -> Result<IndexReader<D>> {
        IndexReader::from_segment_manager(
            Arc::clone(&self.schema),
            Arc::clone(&self.segment_manager),
            self.trained_centroids.clone(),
            self.trained_codebooks.clone(),
            self.config.term_cache_blocks,
        )
        .await
    }

    pub fn config(&self) -> &IndexConfig {
        &self.config
    }

    pub fn trained_centroids(&self) -> &FxHashMap<u32, Arc<CoarseCentroids>> {
        &self.trained_centroids
    }

    pub fn trained_codebooks(&self) -> &FxHashMap<u32, Arc<PQCodebook>> {
        &self.trained_codebooks
    }

    pub async fn segment_readers(&self) -> Result<Vec<Arc<crate::segment::SegmentReader>>> {
        let reader = self.reader().await?;
        let searcher = reader.searcher().await?;
        Ok(searcher.segment_readers().to_vec())
    }

    pub async fn num_docs(&self) -> Result<u32> {
        let reader = self.reader().await?;
        let searcher = reader.searcher().await?;
        Ok(searcher.num_docs())
    }

    pub async fn doc(&self, doc_id: crate::DocId) -> Result<Option<crate::dsl::Document>> {
        let reader = self.reader().await?;
        let searcher = reader.searcher().await?;
        searcher.doc(doc_id).await
    }

    pub fn default_fields(&self) -> Vec<crate::Field> {
        if !self.schema.default_fields().is_empty() {
            self.schema.default_fields().to_vec()
        } else {
            self.schema
                .fields()
                .filter(|(_, entry)| {
                    entry.indexed && entry.field_type == crate::dsl::FieldType::Text
                })
                .map(|(field, _)| field)
                .collect()
        }
    }

    pub fn tokenizers(&self) -> Arc<crate::tokenizer::TokenizerRegistry> {
        Arc::new(crate::tokenizer::TokenizerRegistry::default())
    }

    pub fn query_parser(&self) -> crate::dsl::QueryLanguageParser {
        let default_fields = self.default_fields();
        let tokenizers = self.tokenizers();

        let query_routers = self.schema.query_routers();
        if !query_routers.is_empty()
            && let Ok(router) = crate::dsl::QueryFieldRouter::from_rules(query_routers)
        {
            return crate::dsl::QueryLanguageParser::with_router(
                Arc::clone(&self.schema),
                default_fields,
                tokenizers,
                router,
            );
        }

        crate::dsl::QueryLanguageParser::new(Arc::clone(&self.schema), default_fields, tokenizers)
    }

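    /// Parses `query_str` with [`Self::query_parser`] and searches all
    /// segments, returning at most `limit` hits.
    ///
    /// A sketch of the query-string form exercised by the tests below
    /// (`field:term`, or a bare term against the default fields):
    ///
    /// ```ignore
    /// let response = index.query("uris:one", 10).await?;
    /// for hit in &response.hits {
    ///     let doc = index.get_document(&hit.address).await?;
    /// }
    /// ```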
    pub async fn query(
        &self,
        query_str: &str,
        limit: usize,
    ) -> Result<crate::query::SearchResponse> {
        self.query_offset(query_str, limit, 0).await
    }

    pub async fn query_offset(
        &self,
        query_str: &str,
        limit: usize,
        offset: usize,
    ) -> Result<crate::query::SearchResponse> {
        let parser = self.query_parser();
        let query = parser
            .parse(query_str)
            .map_err(crate::error::Error::Query)?;
        self.search_offset(query.as_ref(), limit, offset).await
    }

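    /// Runs an already-constructed [`Query`](crate::query::Query) against all
    /// segments, returning at most `limit` hits.
    ///
    /// A sketch using the boolean/term combinators exercised by the tests
    /// below (not compiled as a doctest):
    ///
    /// ```ignore
    /// use crate::query::{BooleanQuery, TermQuery};
    ///
    /// let query = BooleanQuery::new()
    ///     .should(TermQuery::text(content, "rust"))
    ///     .should(TermQuery::text(content, "programming"));
    /// let response = index.search(&query, 10).await?;
    /// ```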
    pub async fn search(
        &self,
        query: &dyn crate::query::Query,
        limit: usize,
    ) -> Result<crate::query::SearchResponse> {
        self.search_offset(query, limit, 0).await
    }

    pub async fn search_offset(
        &self,
        query: &dyn crate::query::Query,
        limit: usize,
        offset: usize,
    ) -> Result<crate::query::SearchResponse> {
        let reader = self.reader().await?;
        let searcher = reader.searcher().await?;
        let segments = searcher.segment_readers();

        let mut all_results: Vec<(u128, crate::query::SearchResult)> = Vec::new();
        let fetch_limit = offset + limit;

        for segment in segments {
            let segment_id = segment.meta().id;
            let results =
                crate::query::search_segment(segment.as_ref(), query, fetch_limit).await?;
            for result in results {
                all_results.push((segment_id, result));
            }
        }

        all_results.sort_by(|a, b| {
            b.1.score
                .partial_cmp(&a.1.score)
                .unwrap_or(std::cmp::Ordering::Equal)
        });

        let total_hits = all_results.len() as u32;

        let hits: Vec<crate::query::SearchHit> = all_results
            .into_iter()
            .skip(offset)
            .take(limit)
            .map(|(segment_id, result)| crate::query::SearchHit {
                address: crate::query::DocAddress::new(segment_id, result.doc_id),
                score: result.score,
                matched_fields: result.extract_ordinals(),
            })
            .collect();

        Ok(crate::query::SearchResponse { hits, total_hits })
    }

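    /// Fetches the stored document addressed by a search hit.
    ///
    /// A sketch of the usual hit-to-document round trip (as in the tests
    /// below):
    ///
    /// ```ignore
    /// let response = index.query("rust", 10).await?;
    /// if let Some(hit) = response.hits.first() {
    ///     let doc = index.get_document(&hit.address).await?;
    /// }
    /// ```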
    pub async fn get_document(
        &self,
        address: &crate::query::DocAddress,
    ) -> Result<Option<crate::dsl::Document>> {
        let segment_id = address.segment_id_u128().ok_or_else(|| {
            crate::error::Error::Query(format!("Invalid segment ID: {}", address.segment_id))
        })?;

        let reader = self.reader().await?;
        let searcher = reader.searcher().await?;

        for segment in searcher.segment_readers() {
            if segment.meta().id == segment_id {
                return segment.doc(address.doc_id).await;
            }
        }

        Ok(None)
    }

    pub async fn reload(&self) -> Result<()> {
        // Readers are rebuilt from the segment manager on every `reader()` call,
        // so there is no cached state to refresh here.
        Ok(())
    }

    pub async fn get_postings(
        &self,
        field: crate::Field,
        term: &[u8],
    ) -> Result<
        Vec<(
            Arc<crate::segment::SegmentReader>,
            crate::structures::BlockPostingList,
        )>,
    > {
        let segments = self.segment_readers().await?;
        let mut results = Vec::new();

        for segment in segments {
            if let Some(postings) = segment.get_postings(field, term).await? {
                results.push((segment, postings));
            }
        }

        Ok(results)
    }
}

#[cfg(feature = "native")]
impl<D: crate::directories::DirectoryWriter + 'static> Index<D> {
    pub fn writer(&self) -> writer::IndexWriter<D> {
        writer::IndexWriter::from_index(self)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::directories::RamDirectory;
    use crate::dsl::{Document, SchemaBuilder};

    #[tokio::test]
    async fn test_index_create_and_search() {
        let mut schema_builder = SchemaBuilder::default();
        let title = schema_builder.add_text_field("title", true, true);
        let body = schema_builder.add_text_field("body", true, true);
        let schema = schema_builder.build();

        let dir = RamDirectory::new();
        let config = IndexConfig::default();

        let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
            .await
            .unwrap();

        let mut doc1 = Document::new();
        doc1.add_text(title, "Hello World");
        doc1.add_text(body, "This is the first document");
        writer.add_document(doc1).unwrap();

        let mut doc2 = Document::new();
        doc2.add_text(title, "Goodbye World");
        doc2.add_text(body, "This is the second document");
        writer.add_document(doc2).unwrap();

        writer.commit().await.unwrap();

        let index = Index::open(dir, config).await.unwrap();
        assert_eq!(index.num_docs().await.unwrap(), 2);

        let postings = index.get_postings(title, b"world").await.unwrap();
        assert_eq!(postings.len(), 1);
        assert_eq!(postings[0].1.doc_count(), 2);

        let doc = index.doc(0).await.unwrap().unwrap();
        assert_eq!(doc.get_first(title).unwrap().as_text(), Some("Hello World"));
    }

    #[tokio::test]
    async fn test_multiple_segments() {
        let mut schema_builder = SchemaBuilder::default();
        let title = schema_builder.add_text_field("title", true, true);
        let schema = schema_builder.build();

        let dir = RamDirectory::new();
        let config = IndexConfig {
            max_indexing_memory_bytes: 1024,
            ..Default::default()
        };

        let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
            .await
            .unwrap();

        for batch in 0..3 {
            for i in 0..5 {
                let mut doc = Document::new();
                doc.add_text(title, format!("Document {} batch {}", i, batch));
                writer.add_document(doc).unwrap();
            }
            writer.commit().await.unwrap();
        }

        let index = Index::open(dir, config).await.unwrap();
        assert_eq!(index.num_docs().await.unwrap(), 15);
        assert!(
            index.segment_readers().await.unwrap().len() >= 2,
            "Expected multiple segments"
        );
    }

    #[tokio::test]
    async fn test_segment_merge() {
        let mut schema_builder = SchemaBuilder::default();
        let title = schema_builder.add_text_field("title", true, true);
        let schema = schema_builder.build();

        let dir = RamDirectory::new();
        let config = IndexConfig {
            max_indexing_memory_bytes: 512,
            ..Default::default()
        };

        let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
            .await
            .unwrap();

        for batch in 0..3 {
            for i in 0..3 {
                let mut doc = Document::new();
                doc.add_text(title, format!("Document {} batch {}", i, batch));
                writer.add_document(doc).unwrap();
            }
            writer.flush().await.unwrap();
        }
        writer.commit().await.unwrap();

        let index = Index::open(dir.clone(), config.clone()).await.unwrap();
        assert!(
            index.segment_readers().await.unwrap().len() >= 2,
            "Expected multiple segments"
        );

        let writer = IndexWriter::open(dir.clone(), config.clone())
            .await
            .unwrap();
        writer.force_merge().await.unwrap();

        let index = Index::open(dir, config).await.unwrap();
        assert_eq!(index.segment_readers().await.unwrap().len(), 1);
        assert_eq!(index.num_docs().await.unwrap(), 9);

        let mut found_docs = 0;
        for i in 0..9 {
            if index.doc(i).await.unwrap().is_some() {
                found_docs += 1;
            }
        }
        assert_eq!(found_docs, 9);
    }

    #[tokio::test]
    async fn test_match_query() {
        let mut schema_builder = SchemaBuilder::default();
        let title = schema_builder.add_text_field("title", true, true);
        let body = schema_builder.add_text_field("body", true, true);
        let schema = schema_builder.build();

        let dir = RamDirectory::new();
        let config = IndexConfig::default();

        let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
            .await
            .unwrap();

        let mut doc1 = Document::new();
        doc1.add_text(title, "rust programming");
        doc1.add_text(body, "Learn rust language");
        writer.add_document(doc1).unwrap();

        let mut doc2 = Document::new();
        doc2.add_text(title, "python programming");
        doc2.add_text(body, "Learn python language");
        writer.add_document(doc2).unwrap();

        writer.commit().await.unwrap();

        let index = Index::open(dir, config).await.unwrap();

        let results = index.query("rust", 10).await.unwrap();
        assert_eq!(results.hits.len(), 1);

        let results = index.query("rust programming", 10).await.unwrap();
        assert!(!results.hits.is_empty());

        let hit = &results.hits[0];
        assert!(!hit.address.segment_id.is_empty(), "Should have segment_id");

        let doc = index.get_document(&hit.address).await.unwrap().unwrap();
        assert!(
            !doc.field_values().is_empty(),
            "Doc should have field values"
        );

        let doc = index.doc(0).await.unwrap().unwrap();
        assert!(
            !doc.field_values().is_empty(),
            "Doc should have field values"
        );
    }

    #[tokio::test]
    async fn test_slice_cache_warmup_and_load() {
        use crate::directories::SliceCachingDirectory;

        let mut schema_builder = SchemaBuilder::default();
        let title = schema_builder.add_text_field("title", true, true);
        let body = schema_builder.add_text_field("body", true, true);
        let schema = schema_builder.build();

        let dir = RamDirectory::new();
        let config = IndexConfig::default();

        let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
            .await
            .unwrap();

        for i in 0..10 {
            let mut doc = Document::new();
            doc.add_text(title, format!("Document {} about rust", i));
            doc.add_text(body, format!("This is body text number {}", i));
            writer.add_document(doc).unwrap();
        }
        writer.commit().await.unwrap();

        let caching_dir = SliceCachingDirectory::new(dir.clone(), 1024 * 1024);
        let index = Index::open(caching_dir, config.clone()).await.unwrap();

        let results = index.query("rust", 10).await.unwrap();
        assert!(!results.hits.is_empty());

        let stats = index.directory.stats();
        assert!(stats.total_bytes > 0, "Cache should have data after search");
    }

    #[tokio::test]
    async fn test_multivalue_field_indexing_and_search() {
        let mut schema_builder = SchemaBuilder::default();
        let uris = schema_builder.add_text_field("uris", true, true);
        let title = schema_builder.add_text_field("title", true, true);
        let schema = schema_builder.build();

        let dir = RamDirectory::new();
        let config = IndexConfig::default();

        let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
            .await
            .unwrap();

        let mut doc = Document::new();
        doc.add_text(uris, "one");
        doc.add_text(uris, "two");
        doc.add_text(title, "Test Document");
        writer.add_document(doc).unwrap();

        let mut doc2 = Document::new();
        doc2.add_text(uris, "three");
        doc2.add_text(title, "Another Document");
        writer.add_document(doc2).unwrap();

        writer.commit().await.unwrap();

        let index = Index::open(dir, config).await.unwrap();
        assert_eq!(index.num_docs().await.unwrap(), 2);

        let doc = index.doc(0).await.unwrap().unwrap();
        let all_uris: Vec<_> = doc.get_all(uris).collect();
        assert_eq!(all_uris.len(), 2, "Should have 2 uris values");
        assert_eq!(all_uris[0].as_text(), Some("one"));
        assert_eq!(all_uris[1].as_text(), Some("two"));

        let json = doc.to_json(index.schema());
        let uris_json = json.get("uris").unwrap();
        assert!(uris_json.is_array(), "Multi-value field should be an array");
        let uris_arr = uris_json.as_array().unwrap();
        assert_eq!(uris_arr.len(), 2);
        assert_eq!(uris_arr[0].as_str(), Some("one"));
        assert_eq!(uris_arr[1].as_str(), Some("two"));

        let results = index.query("uris:one", 10).await.unwrap();
        assert_eq!(results.hits.len(), 1, "Should find doc with 'one'");
        assert_eq!(results.hits[0].address.doc_id, 0);

        let results = index.query("uris:two", 10).await.unwrap();
        assert_eq!(results.hits.len(), 1, "Should find doc with 'two'");
        assert_eq!(results.hits[0].address.doc_id, 0);

        let results = index.query("uris:three", 10).await.unwrap();
        assert_eq!(results.hits.len(), 1, "Should find doc with 'three'");
        assert_eq!(results.hits[0].address.doc_id, 1);

        let results = index.query("uris:nonexistent", 10).await.unwrap();
        assert_eq!(results.hits.len(), 0, "Should not find non-existent value");
    }

    #[tokio::test]
    async fn test_wand_optimization_for_or_queries() {
        use crate::query::{BooleanQuery, TermQuery};

        let mut schema_builder = SchemaBuilder::default();
        let content = schema_builder.add_text_field("content", true, true);
        let schema = schema_builder.build();

        let dir = RamDirectory::new();
        let config = IndexConfig::default();

        let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
            .await
            .unwrap();

        let mut doc = Document::new();
        doc.add_text(content, "rust programming language is fast");
        writer.add_document(doc).unwrap();

        let mut doc = Document::new();
        doc.add_text(content, "rust is a systems language");
        writer.add_document(doc).unwrap();

        let mut doc = Document::new();
        doc.add_text(content, "programming is fun");
        writer.add_document(doc).unwrap();

        let mut doc = Document::new();
        doc.add_text(content, "python is easy to learn");
        writer.add_document(doc).unwrap();

        let mut doc = Document::new();
        doc.add_text(content, "rust rust programming programming systems");
        writer.add_document(doc).unwrap();

        writer.commit().await.unwrap();

        let index = Index::open(dir.clone(), config.clone()).await.unwrap();

        let or_query = BooleanQuery::new()
            .should(TermQuery::text(content, "rust"))
            .should(TermQuery::text(content, "programming"));

        let results = index.search(&or_query, 10).await.unwrap();

        assert_eq!(results.hits.len(), 4, "Should find exactly 4 documents");

        let doc_ids: Vec<u32> = results.hits.iter().map(|h| h.address.doc_id).collect();
        assert!(doc_ids.contains(&0), "Should find doc 0");
        assert!(doc_ids.contains(&1), "Should find doc 1");
        assert!(doc_ids.contains(&2), "Should find doc 2");
        assert!(doc_ids.contains(&4), "Should find doc 4");
        assert!(
            !doc_ids.contains(&3),
            "Should NOT find doc 3 (only has 'python')"
        );

        let single_query = BooleanQuery::new().should(TermQuery::text(content, "rust"));

        let results = index.search(&single_query, 10).await.unwrap();
        assert_eq!(results.hits.len(), 3, "Should find 3 documents with 'rust'");

        let must_query = BooleanQuery::new()
            .must(TermQuery::text(content, "rust"))
            .should(TermQuery::text(content, "programming"));

        let results = index.search(&must_query, 10).await.unwrap();
        assert_eq!(results.hits.len(), 3, "Should find 3 documents with 'rust'");

        let must_not_query = BooleanQuery::new()
            .should(TermQuery::text(content, "rust"))
            .should(TermQuery::text(content, "programming"))
            .must_not(TermQuery::text(content, "systems"));

        let results = index.search(&must_not_query, 10).await.unwrap();
        let doc_ids: Vec<u32> = results.hits.iter().map(|h| h.address.doc_id).collect();
        assert!(
            !doc_ids.contains(&1),
            "Should NOT find doc 1 (has 'systems')"
        );
        assert!(
            !doc_ids.contains(&4),
            "Should NOT find doc 4 (has 'systems')"
        );

        let or_query = BooleanQuery::new()
            .should(TermQuery::text(content, "rust"))
            .should(TermQuery::text(content, "programming"));

        let results = index.search(&or_query, 2).await.unwrap();
        assert_eq!(results.hits.len(), 2, "Should return only top 2 results");
    }

    #[tokio::test]
    async fn test_wand_results_match_standard_boolean() {
        use crate::query::{BooleanQuery, TermQuery, WandOrQuery};

        let mut schema_builder = SchemaBuilder::default();
        let content = schema_builder.add_text_field("content", true, true);
        let schema = schema_builder.build();

        let dir = RamDirectory::new();
        let config = IndexConfig::default();

        let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
            .await
            .unwrap();

        for i in 0..10 {
            let mut doc = Document::new();
            let text = match i % 4 {
                0 => "apple banana cherry",
                1 => "apple orange",
                2 => "banana grape",
                _ => "cherry date",
            };
            doc.add_text(content, text);
            writer.add_document(doc).unwrap();
        }

        writer.commit().await.unwrap();
        let index = Index::open(dir.clone(), config.clone()).await.unwrap();

        let wand_query = WandOrQuery::new(content).term("apple").term("banana");

        let bool_query = BooleanQuery::new()
            .should(TermQuery::text(content, "apple"))
            .should(TermQuery::text(content, "banana"));

        let wand_results = index.search(&wand_query, 10).await.unwrap();
        let bool_results = index.search(&bool_query, 10).await.unwrap();

        assert_eq!(
            wand_results.hits.len(),
            bool_results.hits.len(),
            "WAND and Boolean should find same number of docs"
        );

        let wand_docs: std::collections::HashSet<u32> =
            wand_results.hits.iter().map(|h| h.address.doc_id).collect();
        let bool_docs: std::collections::HashSet<u32> =
            bool_results.hits.iter().map(|h| h.address.doc_id).collect();

        assert_eq!(
            wand_docs, bool_docs,
            "WAND and Boolean should find same documents"
        );
    }

    #[tokio::test]
    async fn test_vector_index_threshold_switch() {
        use crate::dsl::{DenseVectorConfig, VectorIndexType};

        let mut schema_builder = SchemaBuilder::default();
        let title = schema_builder.add_text_field("title", true, true);
        let embedding = schema_builder.add_dense_vector_field_with_config(
            "embedding",
            true,
            true,
            DenseVectorConfig {
                dim: 8,
                index_type: VectorIndexType::IvfRaBitQ,
                store_raw: true,
                num_clusters: Some(4),
                nprobe: 2,
                mrl_dim: None,
                build_threshold: Some(50),
            },
        );
        let schema = schema_builder.build();

        let dir = RamDirectory::new();
        let config = IndexConfig::default();

        let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
            .await
            .unwrap();

        for i in 0..30 {
            let mut doc = Document::new();
            doc.add_text(title, format!("Document {}", i));
            let vec: Vec<f32> = (0..8).map(|_| (i as f32) / 30.0).collect();
            doc.add_dense_vector(embedding, vec);
            writer.add_document(doc).unwrap();
        }
        writer.commit().await.unwrap();

        let index = Index::open(dir.clone(), config.clone()).await.unwrap();
        assert!(
            index.trained_centroids.is_empty(),
            "Should not have trained centroids below threshold"
        );

        let query_vec: Vec<f32> = vec![0.5; 8];
        let segments = index.segment_readers().await.unwrap();
        assert!(!segments.is_empty());

        let results = segments[0]
            .search_dense_vector(
                embedding,
                &query_vec,
                5,
                1,
                crate::query::MultiValueCombiner::Max,
            )
            .unwrap();
        assert!(!results.is_empty(), "Flat search should return results");

        let writer = IndexWriter::open(dir.clone(), config.clone())
            .await
            .unwrap();

        for i in 30..60 {
            let mut doc = Document::new();
            doc.add_text(title, format!("Document {}", i));
            let vec: Vec<f32> = (0..8).map(|_| (i as f32) / 60.0).collect();
            doc.add_dense_vector(embedding, vec);
            writer.add_document(doc).unwrap();
        }
        writer.commit().await.unwrap();

        assert!(
            writer.is_vector_index_built(embedding).await,
            "Vector index should be built after crossing threshold"
        );

        let index = Index::open(dir.clone(), config.clone()).await.unwrap();
        assert!(
            index.trained_centroids.contains_key(&embedding.0),
            "Should have loaded trained centroids for embedding field"
        );

        let segments = index.segment_readers().await.unwrap();
        let results = segments[0]
            .search_dense_vector(
                embedding,
                &query_vec,
                5,
                1,
                crate::query::MultiValueCombiner::Max,
            )
            .unwrap();
        assert!(
            !results.is_empty(),
            "Search should return results after build"
        );

        let writer = IndexWriter::open(dir.clone(), config.clone())
            .await
            .unwrap();
        writer.build_vector_index().await.unwrap();
        assert!(writer.is_vector_index_built(embedding).await);
    }
}