1#[cfg(feature = "native")]
11use crate::dsl::Schema;
12#[cfg(feature = "native")]
13use crate::error::Result;
14#[cfg(feature = "native")]
15use crate::structures::{CoarseCentroids, PQCodebook};
16#[cfg(feature = "native")]
17use rustc_hash::FxHashMap;
18#[cfg(feature = "native")]
19use std::sync::Arc;
20
21mod searcher;
22pub use searcher::Searcher;
23
24#[cfg(feature = "native")]
25mod reader;
26#[cfg(feature = "native")]
27mod vector_builder;
28#[cfg(feature = "native")]
29mod writer;
30#[cfg(feature = "native")]
31pub use reader::IndexReader;
32#[cfg(feature = "native")]
33pub use writer::IndexWriter;
34
35mod metadata;
36pub use metadata::{FieldVectorMeta, INDEX_META_FILENAME, IndexMetadata, VectorIndexState};
37
38#[cfg(feature = "native")]
39mod helpers;
40#[cfg(feature = "native")]
41pub use helpers::{
42 IndexingStats, SchemaConfig, SchemaFieldConfig, create_index_at_path, create_index_from_sdl,
43 index_documents_from_reader, index_json_document, parse_schema,
44};
45
/// File name reserved for the on-disk slice cache.
///
/// NOTE(review): nothing in this module reads or writes this file; presumably it
/// is consumed by the slice-caching directory layer — confirm against callers.
pub const SLICE_CACHE_FILENAME: &str = "index.slicecache";
48
/// Tunable settings shared by the index reader and writer.
///
/// Use [`IndexConfig::default`] and override individual fields with struct-update
/// syntax (`IndexConfig { max_indexing_memory_bytes: 1024, ..Default::default() }`).
#[derive(Debug, Clone)]
pub struct IndexConfig {
    /// General thread count; defaults to the detected CPU count (1 without the
    /// `native` feature).
    pub num_threads: usize,
    /// Number of indexing threads; defaults to 1.
    pub num_indexing_threads: usize,
    /// Number of compression threads; defaults to the detected CPU count.
    pub num_compression_threads: usize,
    /// Term-cache size in blocks (default 256). Passed to the segment manager and
    /// to the index reader when they are constructed.
    pub term_cache_blocks: usize,
    /// Document-store cache size in blocks (default 32).
    pub store_cache_blocks: usize,
    /// Indexing memory budget in bytes (default 2 GiB).
    /// NOTE(review): presumably the threshold at which the writer flushes a
    /// segment — confirm in the writer implementation.
    pub max_indexing_memory_bytes: usize,
    /// Merge policy; a boxed clone (`clone_box`) is handed to the segment manager.
    /// Defaults to `TieredMergePolicy`.
    pub merge_policy: Box<dyn crate::merge::MergePolicy>,
    /// Index optimization mode; defaults to `IndexOptimization::default()`.
    pub optimization: crate::structures::IndexOptimization,
}
69
70impl Default for IndexConfig {
71 fn default() -> Self {
72 #[cfg(feature = "native")]
73 let cpus = num_cpus::get().max(1);
74 #[cfg(not(feature = "native"))]
75 let cpus = 1;
76
77 Self {
78 num_threads: cpus,
79 num_indexing_threads: 1,
80 num_compression_threads: cpus,
81 term_cache_blocks: 256,
82 store_cache_blocks: 32,
83 max_indexing_memory_bytes: 2 * 1024 * 1024 * 1024, merge_policy: Box::new(crate::merge::TieredMergePolicy::default()),
85 optimization: crate::structures::IndexOptimization::default(),
86 }
87 }
88}
89
/// Handle to an index stored in a directory of type `D`.
///
/// Bundles the shared directory, schema, configuration and segment manager, plus
/// any trained vector structures loaded when the index was opened. The reader is
/// created lazily on first use and then reused.
#[cfg(feature = "native")]
pub struct Index<D: crate::directories::DirectoryWriter + 'static> {
    /// Storage backend, shared with the segment manager.
    directory: Arc<D>,
    /// Index schema, shared with the segment manager, reader and query parser.
    schema: Arc<Schema>,
    /// Configuration supplied at `create`/`open` time.
    config: IndexConfig,
    /// Segment manager built from the directory, schema, metadata and merge policy.
    segment_manager: Arc<crate::merge::SegmentManager<D>>,
    /// Trained coarse centroids, keyed by field id (`Field.0`).
    trained_centroids: FxHashMap<u32, Arc<CoarseCentroids>>,
    /// Trained PQ codebooks.
    /// NOTE(review): presumably keyed by field id like `trained_centroids` — confirm.
    trained_codebooks: FxHashMap<u32, Arc<PQCodebook>>,
    /// Reader initialized at most once, by the first successful `reader()` call.
    cached_reader: tokio::sync::OnceCell<IndexReader<D>>,
}
112
113#[cfg(feature = "native")]
114impl<D: crate::directories::DirectoryWriter + 'static> Index<D> {
115 pub async fn create(directory: D, schema: Schema, config: IndexConfig) -> Result<Self> {
117 let directory = Arc::new(directory);
118 let schema = Arc::new(schema);
119 let metadata = IndexMetadata::new((*schema).clone());
120
121 let segment_manager = Arc::new(crate::merge::SegmentManager::new(
122 Arc::clone(&directory),
123 Arc::clone(&schema),
124 metadata,
125 config.merge_policy.clone_box(),
126 config.term_cache_blocks,
127 ));
128
129 segment_manager.update_metadata(|_| {}).await?;
131
132 Ok(Self {
133 directory,
134 schema,
135 config,
136 segment_manager,
137 trained_centroids: FxHashMap::default(),
138 trained_codebooks: FxHashMap::default(),
139 cached_reader: tokio::sync::OnceCell::new(),
140 })
141 }
142
143 pub async fn open(directory: D, config: IndexConfig) -> Result<Self> {
145 let directory = Arc::new(directory);
146
147 let metadata = IndexMetadata::load(directory.as_ref()).await?;
149 let schema = Arc::new(metadata.schema.clone());
150
151 let (trained_centroids, trained_codebooks) =
153 metadata.load_trained_structures(directory.as_ref()).await;
154
155 let segment_manager = Arc::new(crate::merge::SegmentManager::new(
156 Arc::clone(&directory),
157 Arc::clone(&schema),
158 metadata,
159 config.merge_policy.clone_box(),
160 config.term_cache_blocks,
161 ));
162
163 Ok(Self {
164 directory,
165 schema,
166 config,
167 segment_manager,
168 trained_centroids,
169 trained_codebooks,
170 cached_reader: tokio::sync::OnceCell::new(),
171 })
172 }
173
174 pub fn schema(&self) -> &Schema {
176 &self.schema
177 }
178
179 pub fn directory(&self) -> &D {
181 &self.directory
182 }
183
184 pub fn segment_manager(&self) -> &Arc<crate::merge::SegmentManager<D>> {
186 &self.segment_manager
187 }
188
189 pub async fn reader(&self) -> Result<&IndexReader<D>> {
194 self.cached_reader
195 .get_or_try_init(|| async {
196 IndexReader::from_segment_manager(
197 Arc::clone(&self.schema),
198 Arc::clone(&self.segment_manager),
199 self.trained_centroids.clone(),
200 self.trained_codebooks.clone(),
201 self.config.term_cache_blocks,
202 )
203 .await
204 })
205 .await
206 }
207
208 pub fn config(&self) -> &IndexConfig {
210 &self.config
211 }
212
213 pub fn trained_centroids(&self) -> &FxHashMap<u32, Arc<CoarseCentroids>> {
215 &self.trained_centroids
216 }
217
218 pub fn trained_codebooks(&self) -> &FxHashMap<u32, Arc<PQCodebook>> {
220 &self.trained_codebooks
221 }
222
223 pub async fn segment_readers(&self) -> Result<Vec<Arc<crate::segment::SegmentReader>>> {
225 let reader = self.reader().await?;
226 let searcher = reader.searcher().await?;
227 Ok(searcher.segment_readers().to_vec())
228 }
229
230 pub async fn num_docs(&self) -> Result<u32> {
232 let reader = self.reader().await?;
233 let searcher = reader.searcher().await?;
234 Ok(searcher.num_docs())
235 }
236
237 pub async fn doc(&self, doc_id: crate::DocId) -> Result<Option<crate::dsl::Document>> {
239 let reader = self.reader().await?;
240 let searcher = reader.searcher().await?;
241 searcher.doc(doc_id).await
242 }
243
244 pub fn default_fields(&self) -> Vec<crate::Field> {
246 if !self.schema.default_fields().is_empty() {
247 self.schema.default_fields().to_vec()
248 } else {
249 self.schema
250 .fields()
251 .filter(|(_, entry)| {
252 entry.indexed && entry.field_type == crate::dsl::FieldType::Text
253 })
254 .map(|(field, _)| field)
255 .collect()
256 }
257 }
258
259 pub fn tokenizers(&self) -> Arc<crate::tokenizer::TokenizerRegistry> {
261 Arc::new(crate::tokenizer::TokenizerRegistry::default())
262 }
263
264 pub fn query_parser(&self) -> crate::dsl::QueryLanguageParser {
266 let default_fields = self.default_fields();
267 let tokenizers = self.tokenizers();
268
269 let query_routers = self.schema.query_routers();
270 if !query_routers.is_empty()
271 && let Ok(router) = crate::dsl::QueryFieldRouter::from_rules(query_routers)
272 {
273 return crate::dsl::QueryLanguageParser::with_router(
274 Arc::clone(&self.schema),
275 default_fields,
276 tokenizers,
277 router,
278 );
279 }
280
281 crate::dsl::QueryLanguageParser::new(Arc::clone(&self.schema), default_fields, tokenizers)
282 }
283
284 pub async fn query(
286 &self,
287 query_str: &str,
288 limit: usize,
289 ) -> Result<crate::query::SearchResponse> {
290 self.query_offset(query_str, limit, 0).await
291 }
292
293 pub async fn query_offset(
295 &self,
296 query_str: &str,
297 limit: usize,
298 offset: usize,
299 ) -> Result<crate::query::SearchResponse> {
300 let parser = self.query_parser();
301 let query = parser
302 .parse(query_str)
303 .map_err(crate::error::Error::Query)?;
304 self.search_offset(query.as_ref(), limit, offset).await
305 }
306
307 pub async fn search(
309 &self,
310 query: &dyn crate::query::Query,
311 limit: usize,
312 ) -> Result<crate::query::SearchResponse> {
313 self.search_offset(query, limit, 0).await
314 }
315
316 pub async fn search_offset(
318 &self,
319 query: &dyn crate::query::Query,
320 limit: usize,
321 offset: usize,
322 ) -> Result<crate::query::SearchResponse> {
323 let reader = self.reader().await?;
324 let searcher = reader.searcher().await?;
325 let segments = searcher.segment_readers();
326
327 let mut all_results: Vec<(u128, crate::query::SearchResult)> = Vec::new();
328 let fetch_limit = offset + limit;
329
330 for segment in segments {
331 let segment_id = segment.meta().id;
332 let results =
333 crate::query::search_segment(segment.as_ref(), query, fetch_limit).await?;
334 for result in results {
335 all_results.push((segment_id, result));
336 }
337 }
338
339 all_results.sort_by(|a, b| {
340 b.1.score
341 .partial_cmp(&a.1.score)
342 .unwrap_or(std::cmp::Ordering::Equal)
343 });
344
345 let total_hits = all_results.len() as u32;
346
347 let hits: Vec<crate::query::SearchHit> = all_results
348 .into_iter()
349 .skip(offset)
350 .take(limit)
351 .map(|(segment_id, result)| crate::query::SearchHit {
352 address: crate::query::DocAddress::new(segment_id, result.doc_id),
353 score: result.score,
354 matched_fields: result.extract_ordinals(),
355 })
356 .collect();
357
358 Ok(crate::query::SearchResponse { hits, total_hits })
359 }
360
361 pub async fn get_document(
363 &self,
364 address: &crate::query::DocAddress,
365 ) -> Result<Option<crate::dsl::Document>> {
366 let segment_id = address.segment_id_u128().ok_or_else(|| {
367 crate::error::Error::Query(format!("Invalid segment ID: {}", address.segment_id))
368 })?;
369
370 let reader = self.reader().await?;
371 let searcher = reader.searcher().await?;
372
373 for segment in searcher.segment_readers() {
374 if segment.meta().id == segment_id {
375 return segment.doc(address.doc_id).await;
376 }
377 }
378
379 Ok(None)
380 }
381
    /// Currently a no-op that always returns `Ok(())`.
    ///
    /// The cached reader is not rebuilt or invalidated here.
    /// NOTE(review): presumably the reader refreshes its own view of the segments
    /// when `searcher()` is called — confirm in `IndexReader`.
    pub async fn reload(&self) -> Result<()> {
        Ok(())
    }
387
388 pub async fn get_postings(
390 &self,
391 field: crate::Field,
392 term: &[u8],
393 ) -> Result<
394 Vec<(
395 Arc<crate::segment::SegmentReader>,
396 crate::structures::BlockPostingList,
397 )>,
398 > {
399 let segments = self.segment_readers().await?;
400 let mut results = Vec::new();
401
402 for segment in segments {
403 if let Some(postings) = segment.get_postings(field, term).await? {
404 results.push((segment, postings));
405 }
406 }
407
408 Ok(results)
409 }
410}
411
#[cfg(feature = "native")]
impl<D: crate::directories::DirectoryWriter + 'static> Index<D> {
    /// Creates an [`IndexWriter`] from this index via `IndexWriter::from_index`.
    pub fn writer(&self) -> writer::IndexWriter<D> {
        IndexWriter::from_index(self)
    }
}
420
421#[cfg(test)]
424mod tests {
425 use super::*;
426 use crate::directories::RamDirectory;
427 use crate::dsl::{Document, SchemaBuilder};
428
429 #[tokio::test]
430 async fn test_index_create_and_search() {
431 let mut schema_builder = SchemaBuilder::default();
432 let title = schema_builder.add_text_field("title", true, true);
433 let body = schema_builder.add_text_field("body", true, true);
434 let schema = schema_builder.build();
435
436 let dir = RamDirectory::new();
437 let config = IndexConfig::default();
438
439 let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
441 .await
442 .unwrap();
443
444 let mut doc1 = Document::new();
445 doc1.add_text(title, "Hello World");
446 doc1.add_text(body, "This is the first document");
447 writer.add_document(doc1).unwrap();
448
449 let mut doc2 = Document::new();
450 doc2.add_text(title, "Goodbye World");
451 doc2.add_text(body, "This is the second document");
452 writer.add_document(doc2).unwrap();
453
454 writer.commit().await.unwrap();
455
456 let index = Index::open(dir, config).await.unwrap();
458 assert_eq!(index.num_docs().await.unwrap(), 2);
459
460 let postings = index.get_postings(title, b"world").await.unwrap();
462 assert_eq!(postings.len(), 1); assert_eq!(postings[0].1.doc_count(), 2); let doc = index.doc(0).await.unwrap().unwrap();
467 assert_eq!(doc.get_first(title).unwrap().as_text(), Some("Hello World"));
468 }
469
470 #[tokio::test]
471 async fn test_multiple_segments() {
472 let mut schema_builder = SchemaBuilder::default();
473 let title = schema_builder.add_text_field("title", true, true);
474 let schema = schema_builder.build();
475
476 let dir = RamDirectory::new();
477 let config = IndexConfig {
478 max_indexing_memory_bytes: 1024, ..Default::default()
480 };
481
482 let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
483 .await
484 .unwrap();
485
486 for batch in 0..3 {
488 for i in 0..5 {
489 let mut doc = Document::new();
490 doc.add_text(title, format!("Document {} batch {}", i, batch));
491 writer.add_document(doc).unwrap();
492 }
493 writer.commit().await.unwrap();
494 }
495
496 let index = Index::open(dir, config).await.unwrap();
498 assert_eq!(index.num_docs().await.unwrap(), 15);
499 assert!(
501 index.segment_readers().await.unwrap().len() >= 2,
502 "Expected multiple segments"
503 );
504 }
505
506 #[tokio::test]
507 async fn test_segment_merge() {
508 let mut schema_builder = SchemaBuilder::default();
509 let title = schema_builder.add_text_field("title", true, true);
510 let schema = schema_builder.build();
511
512 let dir = RamDirectory::new();
513 let config = IndexConfig {
514 max_indexing_memory_bytes: 512, ..Default::default()
516 };
517
518 let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
519 .await
520 .unwrap();
521
522 for batch in 0..3 {
524 for i in 0..3 {
525 let mut doc = Document::new();
526 doc.add_text(title, format!("Document {} batch {}", i, batch));
527 writer.add_document(doc).unwrap();
528 }
529 writer.flush().await.unwrap();
530 }
531 writer.commit().await.unwrap();
532
533 let index = Index::open(dir.clone(), config.clone()).await.unwrap();
535 assert!(
536 index.segment_readers().await.unwrap().len() >= 2,
537 "Expected multiple segments"
538 );
539
540 let writer = IndexWriter::open(dir.clone(), config.clone())
542 .await
543 .unwrap();
544 writer.force_merge().await.unwrap();
545
546 let index = Index::open(dir, config).await.unwrap();
548 assert_eq!(index.segment_readers().await.unwrap().len(), 1);
549 assert_eq!(index.num_docs().await.unwrap(), 9);
550
551 let mut found_docs = 0;
553 for i in 0..9 {
554 if index.doc(i).await.unwrap().is_some() {
555 found_docs += 1;
556 }
557 }
558 assert_eq!(found_docs, 9);
559 }
560
561 #[tokio::test]
562 async fn test_match_query() {
563 let mut schema_builder = SchemaBuilder::default();
564 let title = schema_builder.add_text_field("title", true, true);
565 let body = schema_builder.add_text_field("body", true, true);
566 let schema = schema_builder.build();
567
568 let dir = RamDirectory::new();
569 let config = IndexConfig::default();
570
571 let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
572 .await
573 .unwrap();
574
575 let mut doc1 = Document::new();
576 doc1.add_text(title, "rust programming");
577 doc1.add_text(body, "Learn rust language");
578 writer.add_document(doc1).unwrap();
579
580 let mut doc2 = Document::new();
581 doc2.add_text(title, "python programming");
582 doc2.add_text(body, "Learn python language");
583 writer.add_document(doc2).unwrap();
584
585 writer.commit().await.unwrap();
586
587 let index = Index::open(dir, config).await.unwrap();
588
589 let results = index.query("rust", 10).await.unwrap();
591 assert_eq!(results.hits.len(), 1);
592
593 let results = index.query("rust programming", 10).await.unwrap();
595 assert!(!results.hits.is_empty());
596
597 let hit = &results.hits[0];
599 assert!(!hit.address.segment_id.is_empty(), "Should have segment_id");
600
601 let doc = index.get_document(&hit.address).await.unwrap().unwrap();
603 assert!(
604 !doc.field_values().is_empty(),
605 "Doc should have field values"
606 );
607
608 let doc = index.doc(0).await.unwrap().unwrap();
610 assert!(
611 !doc.field_values().is_empty(),
612 "Doc should have field values"
613 );
614 }
615
616 #[tokio::test]
617 async fn test_slice_cache_warmup_and_load() {
618 use crate::directories::SliceCachingDirectory;
619
620 let mut schema_builder = SchemaBuilder::default();
621 let title = schema_builder.add_text_field("title", true, true);
622 let body = schema_builder.add_text_field("body", true, true);
623 let schema = schema_builder.build();
624
625 let dir = RamDirectory::new();
626 let config = IndexConfig::default();
627
628 let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
630 .await
631 .unwrap();
632
633 for i in 0..10 {
634 let mut doc = Document::new();
635 doc.add_text(title, format!("Document {} about rust", i));
636 doc.add_text(body, format!("This is body text number {}", i));
637 writer.add_document(doc).unwrap();
638 }
639 writer.commit().await.unwrap();
640
641 let caching_dir = SliceCachingDirectory::new(dir.clone(), 1024 * 1024);
643 let index = Index::open(caching_dir, config.clone()).await.unwrap();
644
645 let results = index.query("rust", 10).await.unwrap();
647 assert!(!results.hits.is_empty());
648
649 let stats = index.directory.stats();
651 assert!(stats.total_bytes > 0, "Cache should have data after search");
652 }
653
654 #[tokio::test]
655 async fn test_multivalue_field_indexing_and_search() {
656 let mut schema_builder = SchemaBuilder::default();
657 let uris = schema_builder.add_text_field("uris", true, true);
658 let title = schema_builder.add_text_field("title", true, true);
659 let schema = schema_builder.build();
660
661 let dir = RamDirectory::new();
662 let config = IndexConfig::default();
663
664 let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
666 .await
667 .unwrap();
668
669 let mut doc = Document::new();
670 doc.add_text(uris, "one");
671 doc.add_text(uris, "two");
672 doc.add_text(title, "Test Document");
673 writer.add_document(doc).unwrap();
674
675 let mut doc2 = Document::new();
677 doc2.add_text(uris, "three");
678 doc2.add_text(title, "Another Document");
679 writer.add_document(doc2).unwrap();
680
681 writer.commit().await.unwrap();
682
683 let index = Index::open(dir, config).await.unwrap();
685 assert_eq!(index.num_docs().await.unwrap(), 2);
686
687 let doc = index.doc(0).await.unwrap().unwrap();
689 let all_uris: Vec<_> = doc.get_all(uris).collect();
690 assert_eq!(all_uris.len(), 2, "Should have 2 uris values");
691 assert_eq!(all_uris[0].as_text(), Some("one"));
692 assert_eq!(all_uris[1].as_text(), Some("two"));
693
694 let json = doc.to_json(index.schema());
696 let uris_json = json.get("uris").unwrap();
697 assert!(uris_json.is_array(), "Multi-value field should be an array");
698 let uris_arr = uris_json.as_array().unwrap();
699 assert_eq!(uris_arr.len(), 2);
700 assert_eq!(uris_arr[0].as_str(), Some("one"));
701 assert_eq!(uris_arr[1].as_str(), Some("two"));
702
703 let results = index.query("uris:one", 10).await.unwrap();
705 assert_eq!(results.hits.len(), 1, "Should find doc with 'one'");
706 assert_eq!(results.hits[0].address.doc_id, 0);
707
708 let results = index.query("uris:two", 10).await.unwrap();
709 assert_eq!(results.hits.len(), 1, "Should find doc with 'two'");
710 assert_eq!(results.hits[0].address.doc_id, 0);
711
712 let results = index.query("uris:three", 10).await.unwrap();
713 assert_eq!(results.hits.len(), 1, "Should find doc with 'three'");
714 assert_eq!(results.hits[0].address.doc_id, 1);
715
716 let results = index.query("uris:nonexistent", 10).await.unwrap();
718 assert_eq!(results.hits.len(), 0, "Should not find non-existent value");
719 }
720
721 #[tokio::test]
728 async fn test_wand_optimization_for_or_queries() {
729 use crate::query::{BooleanQuery, TermQuery};
730
731 let mut schema_builder = SchemaBuilder::default();
732 let content = schema_builder.add_text_field("content", true, true);
733 let schema = schema_builder.build();
734
735 let dir = RamDirectory::new();
736 let config = IndexConfig::default();
737
738 let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
740 .await
741 .unwrap();
742
743 let mut doc = Document::new();
745 doc.add_text(content, "rust programming language is fast");
746 writer.add_document(doc).unwrap();
747
748 let mut doc = Document::new();
750 doc.add_text(content, "rust is a systems language");
751 writer.add_document(doc).unwrap();
752
753 let mut doc = Document::new();
755 doc.add_text(content, "programming is fun");
756 writer.add_document(doc).unwrap();
757
758 let mut doc = Document::new();
760 doc.add_text(content, "python is easy to learn");
761 writer.add_document(doc).unwrap();
762
763 let mut doc = Document::new();
765 doc.add_text(content, "rust rust programming programming systems");
766 writer.add_document(doc).unwrap();
767
768 writer.commit().await.unwrap();
769
770 let index = Index::open(dir.clone(), config.clone()).await.unwrap();
772
773 let or_query = BooleanQuery::new()
775 .should(TermQuery::text(content, "rust"))
776 .should(TermQuery::text(content, "programming"));
777
778 let results = index.search(&or_query, 10).await.unwrap();
779
780 assert_eq!(results.hits.len(), 4, "Should find exactly 4 documents");
782
783 let doc_ids: Vec<u32> = results.hits.iter().map(|h| h.address.doc_id).collect();
784 assert!(doc_ids.contains(&0), "Should find doc 0");
785 assert!(doc_ids.contains(&1), "Should find doc 1");
786 assert!(doc_ids.contains(&2), "Should find doc 2");
787 assert!(doc_ids.contains(&4), "Should find doc 4");
788 assert!(
789 !doc_ids.contains(&3),
790 "Should NOT find doc 3 (only has 'python')"
791 );
792
793 let single_query = BooleanQuery::new().should(TermQuery::text(content, "rust"));
795
796 let results = index.search(&single_query, 10).await.unwrap();
797 assert_eq!(results.hits.len(), 3, "Should find 3 documents with 'rust'");
798
799 let must_query = BooleanQuery::new()
801 .must(TermQuery::text(content, "rust"))
802 .should(TermQuery::text(content, "programming"));
803
804 let results = index.search(&must_query, 10).await.unwrap();
805 assert_eq!(results.hits.len(), 3, "Should find 3 documents with 'rust'");
807
808 let must_not_query = BooleanQuery::new()
810 .should(TermQuery::text(content, "rust"))
811 .should(TermQuery::text(content, "programming"))
812 .must_not(TermQuery::text(content, "systems"));
813
814 let results = index.search(&must_not_query, 10).await.unwrap();
815 let doc_ids: Vec<u32> = results.hits.iter().map(|h| h.address.doc_id).collect();
817 assert!(
818 !doc_ids.contains(&1),
819 "Should NOT find doc 1 (has 'systems')"
820 );
821 assert!(
822 !doc_ids.contains(&4),
823 "Should NOT find doc 4 (has 'systems')"
824 );
825
826 let or_query = BooleanQuery::new()
828 .should(TermQuery::text(content, "rust"))
829 .should(TermQuery::text(content, "programming"));
830
831 let results = index.search(&or_query, 2).await.unwrap();
832 assert_eq!(results.hits.len(), 2, "Should return only top 2 results");
833
834 }
837
838 #[tokio::test]
840 async fn test_wand_results_match_standard_boolean() {
841 use crate::query::{BooleanQuery, TermQuery, WandOrQuery};
842
843 let mut schema_builder = SchemaBuilder::default();
844 let content = schema_builder.add_text_field("content", true, true);
845 let schema = schema_builder.build();
846
847 let dir = RamDirectory::new();
848 let config = IndexConfig::default();
849
850 let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
851 .await
852 .unwrap();
853
854 for i in 0..10 {
856 let mut doc = Document::new();
857 let text = match i % 4 {
858 0 => "apple banana cherry",
859 1 => "apple orange",
860 2 => "banana grape",
861 _ => "cherry date",
862 };
863 doc.add_text(content, text);
864 writer.add_document(doc).unwrap();
865 }
866
867 writer.commit().await.unwrap();
868 let index = Index::open(dir.clone(), config.clone()).await.unwrap();
869
870 let wand_query = WandOrQuery::new(content).term("apple").term("banana");
872
873 let bool_query = BooleanQuery::new()
874 .should(TermQuery::text(content, "apple"))
875 .should(TermQuery::text(content, "banana"));
876
877 let wand_results = index.search(&wand_query, 10).await.unwrap();
878 let bool_results = index.search(&bool_query, 10).await.unwrap();
879
880 assert_eq!(
882 wand_results.hits.len(),
883 bool_results.hits.len(),
884 "WAND and Boolean should find same number of docs"
885 );
886
887 let wand_docs: std::collections::HashSet<u32> =
888 wand_results.hits.iter().map(|h| h.address.doc_id).collect();
889 let bool_docs: std::collections::HashSet<u32> =
890 bool_results.hits.iter().map(|h| h.address.doc_id).collect();
891
892 assert_eq!(
893 wand_docs, bool_docs,
894 "WAND and Boolean should find same documents"
895 );
896 }
897
898 #[tokio::test]
899 async fn test_vector_index_threshold_switch() {
900 use crate::dsl::{DenseVectorConfig, VectorIndexType};
901
902 let mut schema_builder = SchemaBuilder::default();
904 let title = schema_builder.add_text_field("title", true, true);
905 let embedding = schema_builder.add_dense_vector_field_with_config(
906 "embedding",
907 true, true, DenseVectorConfig {
910 dim: 8,
911 index_type: VectorIndexType::IvfRaBitQ,
912 store_raw: true,
913 num_clusters: Some(4), nprobe: 2,
915 mrl_dim: None,
916 build_threshold: Some(50), },
918 );
919 let schema = schema_builder.build();
920
921 let dir = RamDirectory::new();
922 let config = IndexConfig::default();
923
924 let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
926 .await
927 .unwrap();
928
929 for i in 0..30 {
931 let mut doc = Document::new();
932 doc.add_text(title, format!("Document {}", i));
933 let vec: Vec<f32> = (0..8).map(|_| (i as f32) / 30.0).collect();
935 doc.add_dense_vector(embedding, vec);
936 writer.add_document(doc).unwrap();
937 }
938 writer.commit().await.unwrap();
939
940 let index = Index::open(dir.clone(), config.clone()).await.unwrap();
942 assert!(
943 index.trained_centroids.is_empty(),
944 "Should not have trained centroids below threshold"
945 );
946
947 let query_vec: Vec<f32> = vec![0.5; 8];
949 let segments = index.segment_readers().await.unwrap();
950 assert!(!segments.is_empty());
951
952 let results = segments[0]
953 .search_dense_vector(
954 embedding,
955 &query_vec,
956 5,
957 1,
958 crate::query::MultiValueCombiner::Max,
959 )
960 .unwrap();
961 assert!(!results.is_empty(), "Flat search should return results");
962
963 let writer = IndexWriter::open(dir.clone(), config.clone())
965 .await
966 .unwrap();
967
968 for i in 30..60 {
970 let mut doc = Document::new();
971 doc.add_text(title, format!("Document {}", i));
972 let vec: Vec<f32> = (0..8).map(|_| (i as f32) / 60.0).collect();
973 doc.add_dense_vector(embedding, vec);
974 writer.add_document(doc).unwrap();
975 }
976 writer.commit().await.unwrap();
978
979 assert!(
981 writer.is_vector_index_built(embedding).await,
982 "Vector index should be built after crossing threshold"
983 );
984
985 let index = Index::open(dir.clone(), config.clone()).await.unwrap();
987 assert!(
988 index.trained_centroids.contains_key(&embedding.0),
989 "Should have loaded trained centroids for embedding field"
990 );
991
992 let segments = index.segment_readers().await.unwrap();
994 let results = segments[0]
995 .search_dense_vector(
996 embedding,
997 &query_vec,
998 5,
999 1,
1000 crate::query::MultiValueCombiner::Max,
1001 )
1002 .unwrap();
1003 assert!(
1004 !results.is_empty(),
1005 "Search should return results after build"
1006 );
1007
1008 let writer = IndexWriter::open(dir.clone(), config.clone())
1010 .await
1011 .unwrap();
1012 writer.build_vector_index().await.unwrap(); assert!(writer.is_vector_index_built(embedding).await);
1016 }
1017}