1#[cfg(feature = "native")]
11use crate::dsl::Schema;
12#[cfg(feature = "native")]
13use crate::error::Result;
14#[cfg(feature = "native")]
15use crate::structures::{CoarseCentroids, PQCodebook};
16#[cfg(feature = "native")]
17use rustc_hash::FxHashMap;
18#[cfg(feature = "native")]
19use std::sync::Arc;
20
21mod searcher;
22pub use searcher::Searcher;
23
24#[cfg(feature = "native")]
25mod reader;
26#[cfg(feature = "native")]
27mod vector_builder;
28#[cfg(feature = "native")]
29mod writer;
30#[cfg(feature = "native")]
31pub use reader::IndexReader;
32#[cfg(feature = "native")]
33pub use writer::IndexWriter;
34
35mod metadata;
36pub use metadata::{FieldVectorMeta, INDEX_META_FILENAME, IndexMetadata, VectorIndexState};
37
38#[cfg(feature = "native")]
39mod helpers;
40#[cfg(feature = "native")]
41pub use helpers::{
42 IndexingStats, SchemaConfig, SchemaFieldConfig, create_index_at_path, create_index_from_sdl,
43 index_documents_from_reader, index_json_document, parse_schema,
44};
45
/// File name `index.slicecache`.
///
/// NOTE(review): presumably the name under which a slice cache is stored
/// alongside the index files; no user is visible in this file — confirm
/// against the code that reads or writes it.
pub const SLICE_CACHE_FILENAME: &str = "index.slicecache";
48
/// Tunable settings for an index: thread counts, cache sizes, the indexing
/// memory budget, the merge policy, the optimization mode and the reader
/// reload interval.
///
/// A [`Default`] implementation exists, so individual fields can be
/// overridden with struct-update syntax
/// (`IndexConfig { field: value, ..Default::default() }`).
#[derive(Debug, Clone)]
pub struct IndexConfig {
    /// Thread count; defaults to the detected CPU count.
    /// NOTE(review): the consumer is outside this file — confirm what it bounds.
    pub num_threads: usize,
    /// Thread count for indexing; defaults to 1.
    /// NOTE(review): consumer not visible here.
    pub num_indexing_threads: usize,
    /// Thread count for compression; defaults to the detected CPU count.
    /// NOTE(review): consumer not visible here.
    pub num_compression_threads: usize,
    /// Term cache size in blocks; handed to the segment manager and to the
    /// index reader when they are constructed. Defaults to 256.
    pub term_cache_blocks: usize,
    /// Store cache size in blocks; defaults to 32.
    /// NOTE(review): consumer not visible here.
    pub store_cache_blocks: usize,
    /// Indexing memory budget in bytes; defaults to 256 MiB.
    /// NOTE(review): presumably the threshold at which the writer starts a new
    /// segment (the tests shrink it to obtain several segments) — confirm.
    pub max_indexing_memory_bytes: usize,
    /// Merge policy; a boxed clone (`clone_box`) is given to the segment manager.
    pub merge_policy: Box<dyn crate::merge::MergePolicy>,
    /// Index optimization mode.
    /// NOTE(review): semantics are defined in `crate::structures` — not visible here.
    pub optimization: crate::structures::IndexOptimization,
    /// Reload interval forwarded to the index reader; defaults to 1000.
    /// The unit is milliseconds, going by the field name.
    pub reload_interval_ms: u64,
}
71
72impl Default for IndexConfig {
73 fn default() -> Self {
74 #[cfg(feature = "native")]
75 let cpus = num_cpus::get().max(1);
76 #[cfg(not(feature = "native"))]
77 let cpus = 1;
78
79 Self {
80 num_threads: cpus,
81 num_indexing_threads: 1,
82 num_compression_threads: cpus,
83 term_cache_blocks: 256,
84 store_cache_blocks: 32,
85 max_indexing_memory_bytes: 256 * 1024 * 1024, merge_policy: Box::new(crate::merge::TieredMergePolicy::default()),
87 optimization: crate::structures::IndexOptimization::default(),
88 reload_interval_ms: 1000, }
90 }
91}
92
/// Handle to an index stored in a directory of type `D`.
///
/// Bundles the shared directory and schema, the configuration, the segment
/// manager, any trained vector structures loaded when the index was opened,
/// and a lazily created reader.
#[cfg(feature = "native")]
pub struct Index<D: crate::directories::DirectoryWriter + 'static> {
    /// Storage backend; also shared with the segment manager.
    directory: Arc<D>,
    /// Schema; also shared with the segment manager and the reader.
    schema: Arc<Schema>,
    /// Configuration supplied to `create` / `open`.
    config: IndexConfig,
    /// Segment manager built in `create` / `open`; also handed to the reader.
    segment_manager: Arc<crate::merge::SegmentManager<D>>,
    /// Trained coarse centroids keyed by `u32` (the tests look entries up by
    /// a field's numeric id). Empty for a freshly created index.
    trained_centroids: FxHashMap<u32, Arc<CoarseCentroids>>,
    /// Trained PQ codebooks keyed by `u32`. Empty for a freshly created index.
    /// NOTE(review): presumably keyed by field id like the centroids — confirm.
    trained_codebooks: FxHashMap<u32, Arc<PQCodebook>>,
    /// Reader built on the first successful `reader()` call and reused afterwards.
    cached_reader: tokio::sync::OnceCell<IndexReader<D>>,
}
115
116#[cfg(feature = "native")]
117impl<D: crate::directories::DirectoryWriter + 'static> Index<D> {
118 pub async fn create(directory: D, schema: Schema, config: IndexConfig) -> Result<Self> {
120 let directory = Arc::new(directory);
121 let schema = Arc::new(schema);
122 let metadata = IndexMetadata::new((*schema).clone());
123
124 let segment_manager = Arc::new(crate::merge::SegmentManager::new(
125 Arc::clone(&directory),
126 Arc::clone(&schema),
127 metadata,
128 config.merge_policy.clone_box(),
129 config.term_cache_blocks,
130 ));
131
132 segment_manager.update_metadata(|_| {}).await?;
134
135 Ok(Self {
136 directory,
137 schema,
138 config,
139 segment_manager,
140 trained_centroids: FxHashMap::default(),
141 trained_codebooks: FxHashMap::default(),
142 cached_reader: tokio::sync::OnceCell::new(),
143 })
144 }
145
146 pub async fn open(directory: D, config: IndexConfig) -> Result<Self> {
148 let directory = Arc::new(directory);
149
150 let metadata = IndexMetadata::load(directory.as_ref()).await?;
152 let schema = Arc::new(metadata.schema.clone());
153
154 let trained = metadata.load_trained_structures(directory.as_ref()).await;
156 let trained_centroids = trained
157 .as_ref()
158 .map(|t| t.centroids.clone())
159 .unwrap_or_default();
160 let trained_codebooks = trained
161 .as_ref()
162 .map(|t| t.codebooks.clone())
163 .unwrap_or_default();
164
165 let segment_manager = Arc::new(crate::merge::SegmentManager::new(
166 Arc::clone(&directory),
167 Arc::clone(&schema),
168 metadata,
169 config.merge_policy.clone_box(),
170 config.term_cache_blocks,
171 ));
172
173 Ok(Self {
174 directory,
175 schema,
176 config,
177 segment_manager,
178 trained_centroids,
179 trained_codebooks,
180 cached_reader: tokio::sync::OnceCell::new(),
181 })
182 }
183
184 pub fn schema(&self) -> &Schema {
186 &self.schema
187 }
188
189 pub fn directory(&self) -> &D {
191 &self.directory
192 }
193
    /// Returns the shared segment manager; callers can `Arc::clone` it to
    /// keep their own handle.
    pub fn segment_manager(&self) -> &Arc<crate::merge::SegmentManager<D>> {
        &self.segment_manager
    }
198
199 pub async fn reader(&self) -> Result<&IndexReader<D>> {
204 self.cached_reader
205 .get_or_try_init(|| async {
206 IndexReader::from_segment_manager_with_reload_interval(
207 Arc::clone(&self.schema),
208 Arc::clone(&self.segment_manager),
209 self.trained_centroids.clone(),
210 self.config.term_cache_blocks,
211 self.config.reload_interval_ms,
212 )
213 .await
214 })
215 .await
216 }
217
    /// Returns the configuration this index was created or opened with.
    pub fn config(&self) -> &IndexConfig {
        &self.config
    }
222
    /// Returns the trained coarse centroids held by this index, keyed by `u32`.
    ///
    /// The table is empty for an index made with `create`, and when `open`
    /// found no trained structures to load.
    pub fn trained_centroids(&self) -> &FxHashMap<u32, Arc<CoarseCentroids>> {
        &self.trained_centroids
    }
227
    /// Returns the trained PQ codebooks held by this index, keyed by `u32`.
    ///
    /// The table is empty for an index made with `create`, and when `open`
    /// found no trained structures to load.
    pub fn trained_codebooks(&self) -> &FxHashMap<u32, Arc<PQCodebook>> {
        &self.trained_codebooks
    }
232
233 pub async fn segment_readers(&self) -> Result<Vec<Arc<crate::segment::SegmentReader>>> {
235 let reader = self.reader().await?;
236 let searcher = reader.searcher().await?;
237 Ok(searcher.segment_readers().to_vec())
238 }
239
240 pub async fn num_docs(&self) -> Result<u32> {
242 let reader = self.reader().await?;
243 let searcher = reader.searcher().await?;
244 Ok(searcher.num_docs())
245 }
246
247 pub async fn doc(&self, doc_id: crate::DocId) -> Result<Option<crate::dsl::Document>> {
249 let reader = self.reader().await?;
250 let searcher = reader.searcher().await?;
251 searcher.doc(doc_id).await
252 }
253
254 pub fn default_fields(&self) -> Vec<crate::Field> {
256 if !self.schema.default_fields().is_empty() {
257 self.schema.default_fields().to_vec()
258 } else {
259 self.schema
260 .fields()
261 .filter(|(_, entry)| {
262 entry.indexed && entry.field_type == crate::dsl::FieldType::Text
263 })
264 .map(|(field, _)| field)
265 .collect()
266 }
267 }
268
269 pub fn tokenizers(&self) -> Arc<crate::tokenizer::TokenizerRegistry> {
271 Arc::new(crate::tokenizer::TokenizerRegistry::default())
272 }
273
274 pub fn query_parser(&self) -> crate::dsl::QueryLanguageParser {
276 let default_fields = self.default_fields();
277 let tokenizers = self.tokenizers();
278
279 let query_routers = self.schema.query_routers();
280 if !query_routers.is_empty()
281 && let Ok(router) = crate::dsl::QueryFieldRouter::from_rules(query_routers)
282 {
283 return crate::dsl::QueryLanguageParser::with_router(
284 Arc::clone(&self.schema),
285 default_fields,
286 tokenizers,
287 router,
288 );
289 }
290
291 crate::dsl::QueryLanguageParser::new(Arc::clone(&self.schema), default_fields, tokenizers)
292 }
293
294 pub async fn query(
296 &self,
297 query_str: &str,
298 limit: usize,
299 ) -> Result<crate::query::SearchResponse> {
300 self.query_offset(query_str, limit, 0).await
301 }
302
303 pub async fn query_offset(
305 &self,
306 query_str: &str,
307 limit: usize,
308 offset: usize,
309 ) -> Result<crate::query::SearchResponse> {
310 let parser = self.query_parser();
311 let query = parser
312 .parse(query_str)
313 .map_err(crate::error::Error::Query)?;
314 self.search_offset(query.as_ref(), limit, offset).await
315 }
316
317 pub async fn search(
319 &self,
320 query: &dyn crate::query::Query,
321 limit: usize,
322 ) -> Result<crate::query::SearchResponse> {
323 self.search_offset(query, limit, 0).await
324 }
325
326 pub async fn search_offset(
328 &self,
329 query: &dyn crate::query::Query,
330 limit: usize,
331 offset: usize,
332 ) -> Result<crate::query::SearchResponse> {
333 let reader = self.reader().await?;
334 let searcher = reader.searcher().await?;
335 let segments = searcher.segment_readers();
336
337 let fetch_limit = offset + limit;
338
339 let futures: Vec<_> = segments
340 .iter()
341 .map(|segment| {
342 let sid = segment.meta().id;
343 async move {
344 let results =
345 crate::query::search_segment(segment.as_ref(), query, fetch_limit).await?;
346 Ok::<_, crate::error::Error>(
347 results
348 .into_iter()
349 .map(move |r| (sid, r))
350 .collect::<Vec<_>>(),
351 )
352 }
353 })
354 .collect();
355
356 let batches = futures::future::try_join_all(futures).await?;
357 let mut all_results: Vec<(u128, crate::query::SearchResult)> =
358 Vec::with_capacity(batches.iter().map(|b| b.len()).sum());
359 for batch in batches {
360 all_results.extend(batch);
361 }
362
363 all_results.sort_by(|a, b| {
364 b.1.score
365 .partial_cmp(&a.1.score)
366 .unwrap_or(std::cmp::Ordering::Equal)
367 });
368
369 let total_hits = all_results.len() as u32;
370
371 let hits: Vec<crate::query::SearchHit> = all_results
372 .into_iter()
373 .skip(offset)
374 .take(limit)
375 .map(|(segment_id, result)| crate::query::SearchHit {
376 address: crate::query::DocAddress::new(segment_id, result.doc_id),
377 score: result.score,
378 matched_fields: result.extract_ordinals(),
379 })
380 .collect();
381
382 Ok(crate::query::SearchResponse { hits, total_hits })
383 }
384
385 pub async fn get_document(
387 &self,
388 address: &crate::query::DocAddress,
389 ) -> Result<Option<crate::dsl::Document>> {
390 let segment_id = address.segment_id_u128().ok_or_else(|| {
391 crate::error::Error::Query(format!("Invalid segment ID: {}", address.segment_id))
392 })?;
393
394 let reader = self.reader().await?;
395 let searcher = reader.searcher().await?;
396
397 for segment in searcher.segment_readers() {
398 if segment.meta().id == segment_id {
399 let local_doc_id = address.doc_id.wrapping_sub(segment.doc_id_offset());
401 return segment.doc(local_doc_id).await;
402 }
403 }
404
405 Ok(None)
406 }
407
    /// Currently a no-op: does nothing and always returns `Ok(())`.
    ///
    /// NOTE(review): refreshing is presumably handled by the reader itself
    /// (it is built with a reload interval) — confirm before relying on this
    /// method to pick up new segments.
    pub async fn reload(&self) -> Result<()> {
        Ok(())
    }
413
414 pub async fn get_postings(
416 &self,
417 field: crate::Field,
418 term: &[u8],
419 ) -> Result<
420 Vec<(
421 Arc<crate::segment::SegmentReader>,
422 crate::structures::BlockPostingList,
423 )>,
424 > {
425 let segments = self.segment_readers().await?;
426 let mut results = Vec::new();
427
428 for segment in segments {
429 if let Some(postings) = segment.get_postings(field, term).await? {
430 results.push((segment, postings));
431 }
432 }
433
434 Ok(results)
435 }
436}
437
#[cfg(feature = "native")]
impl<D: crate::directories::DirectoryWriter + 'static> Index<D> {
    /// Creates an [`IndexWriter`] from this index via `IndexWriter::from_index`.
    pub fn writer(&self) -> writer::IndexWriter<D> {
        let index_writer = writer::IndexWriter::from_index(self);
        index_writer
    }
}
446
447#[cfg(test)]
450mod tests {
451 use super::*;
452 use crate::directories::RamDirectory;
453 use crate::dsl::{Document, SchemaBuilder};
454
455 #[tokio::test]
456 async fn test_index_create_and_search() {
457 let mut schema_builder = SchemaBuilder::default();
458 let title = schema_builder.add_text_field("title", true, true);
459 let body = schema_builder.add_text_field("body", true, true);
460 let schema = schema_builder.build();
461
462 let dir = RamDirectory::new();
463 let config = IndexConfig::default();
464
465 let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
467 .await
468 .unwrap();
469
470 let mut doc1 = Document::new();
471 doc1.add_text(title, "Hello World");
472 doc1.add_text(body, "This is the first document");
473 writer.add_document(doc1).unwrap();
474
475 let mut doc2 = Document::new();
476 doc2.add_text(title, "Goodbye World");
477 doc2.add_text(body, "This is the second document");
478 writer.add_document(doc2).unwrap();
479
480 writer.commit().await.unwrap();
481
482 let index = Index::open(dir, config).await.unwrap();
484 assert_eq!(index.num_docs().await.unwrap(), 2);
485
486 let postings = index.get_postings(title, b"world").await.unwrap();
488 assert_eq!(postings.len(), 1); assert_eq!(postings[0].1.doc_count(), 2); let doc = index.doc(0).await.unwrap().unwrap();
493 assert_eq!(doc.get_first(title).unwrap().as_text(), Some("Hello World"));
494 }
495
496 #[tokio::test]
497 async fn test_multiple_segments() {
498 let mut schema_builder = SchemaBuilder::default();
499 let title = schema_builder.add_text_field("title", true, true);
500 let schema = schema_builder.build();
501
502 let dir = RamDirectory::new();
503 let config = IndexConfig {
504 max_indexing_memory_bytes: 1024, ..Default::default()
506 };
507
508 let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
509 .await
510 .unwrap();
511
512 for batch in 0..3 {
514 for i in 0..5 {
515 let mut doc = Document::new();
516 doc.add_text(title, format!("Document {} batch {}", i, batch));
517 writer.add_document(doc).unwrap();
518 }
519 writer.commit().await.unwrap();
520 }
521
522 let index = Index::open(dir, config).await.unwrap();
524 assert_eq!(index.num_docs().await.unwrap(), 15);
525 assert!(
527 index.segment_readers().await.unwrap().len() >= 2,
528 "Expected multiple segments"
529 );
530 }
531
532 #[tokio::test]
533 async fn test_segment_merge() {
534 let mut schema_builder = SchemaBuilder::default();
535 let title = schema_builder.add_text_field("title", true, true);
536 let schema = schema_builder.build();
537
538 let dir = RamDirectory::new();
539 let config = IndexConfig {
540 max_indexing_memory_bytes: 512, ..Default::default()
542 };
543
544 let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
545 .await
546 .unwrap();
547
548 for batch in 0..3 {
550 for i in 0..3 {
551 let mut doc = Document::new();
552 doc.add_text(title, format!("Document {} batch {}", i, batch));
553 writer.add_document(doc).unwrap();
554 }
555 writer.flush().await.unwrap();
556 }
557 writer.commit().await.unwrap();
558
559 let index = Index::open(dir.clone(), config.clone()).await.unwrap();
561 assert!(
562 index.segment_readers().await.unwrap().len() >= 2,
563 "Expected multiple segments"
564 );
565
566 let writer = IndexWriter::open(dir.clone(), config.clone())
568 .await
569 .unwrap();
570 writer.force_merge().await.unwrap();
571
572 let index = Index::open(dir, config).await.unwrap();
574 assert_eq!(index.segment_readers().await.unwrap().len(), 1);
575 assert_eq!(index.num_docs().await.unwrap(), 9);
576
577 let mut found_docs = 0;
579 for i in 0..9 {
580 if index.doc(i).await.unwrap().is_some() {
581 found_docs += 1;
582 }
583 }
584 assert_eq!(found_docs, 9);
585 }
586
587 #[tokio::test]
588 async fn test_match_query() {
589 let mut schema_builder = SchemaBuilder::default();
590 let title = schema_builder.add_text_field("title", true, true);
591 let body = schema_builder.add_text_field("body", true, true);
592 let schema = schema_builder.build();
593
594 let dir = RamDirectory::new();
595 let config = IndexConfig::default();
596
597 let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
598 .await
599 .unwrap();
600
601 let mut doc1 = Document::new();
602 doc1.add_text(title, "rust programming");
603 doc1.add_text(body, "Learn rust language");
604 writer.add_document(doc1).unwrap();
605
606 let mut doc2 = Document::new();
607 doc2.add_text(title, "python programming");
608 doc2.add_text(body, "Learn python language");
609 writer.add_document(doc2).unwrap();
610
611 writer.commit().await.unwrap();
612
613 let index = Index::open(dir, config).await.unwrap();
614
615 let results = index.query("rust", 10).await.unwrap();
617 assert_eq!(results.hits.len(), 1);
618
619 let results = index.query("rust programming", 10).await.unwrap();
621 assert!(!results.hits.is_empty());
622
623 let hit = &results.hits[0];
625 assert!(!hit.address.segment_id.is_empty(), "Should have segment_id");
626
627 let doc = index.get_document(&hit.address).await.unwrap().unwrap();
629 assert!(
630 !doc.field_values().is_empty(),
631 "Doc should have field values"
632 );
633
634 let doc = index.doc(0).await.unwrap().unwrap();
636 assert!(
637 !doc.field_values().is_empty(),
638 "Doc should have field values"
639 );
640 }
641
642 #[tokio::test]
643 async fn test_slice_cache_warmup_and_load() {
644 use crate::directories::SliceCachingDirectory;
645
646 let mut schema_builder = SchemaBuilder::default();
647 let title = schema_builder.add_text_field("title", true, true);
648 let body = schema_builder.add_text_field("body", true, true);
649 let schema = schema_builder.build();
650
651 let dir = RamDirectory::new();
652 let config = IndexConfig::default();
653
654 let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
656 .await
657 .unwrap();
658
659 for i in 0..10 {
660 let mut doc = Document::new();
661 doc.add_text(title, format!("Document {} about rust", i));
662 doc.add_text(body, format!("This is body text number {}", i));
663 writer.add_document(doc).unwrap();
664 }
665 writer.commit().await.unwrap();
666
667 let caching_dir = SliceCachingDirectory::new(dir.clone(), 1024 * 1024);
669 let index = Index::open(caching_dir, config.clone()).await.unwrap();
670
671 let results = index.query("rust", 10).await.unwrap();
673 assert!(!results.hits.is_empty());
674
675 let stats = index.directory.stats();
677 assert!(stats.total_bytes > 0, "Cache should have data after search");
678 }
679
680 #[tokio::test]
681 async fn test_multivalue_field_indexing_and_search() {
682 let mut schema_builder = SchemaBuilder::default();
683 let uris = schema_builder.add_text_field("uris", true, true);
684 let title = schema_builder.add_text_field("title", true, true);
685 let schema = schema_builder.build();
686
687 let dir = RamDirectory::new();
688 let config = IndexConfig::default();
689
690 let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
692 .await
693 .unwrap();
694
695 let mut doc = Document::new();
696 doc.add_text(uris, "one");
697 doc.add_text(uris, "two");
698 doc.add_text(title, "Test Document");
699 writer.add_document(doc).unwrap();
700
701 let mut doc2 = Document::new();
703 doc2.add_text(uris, "three");
704 doc2.add_text(title, "Another Document");
705 writer.add_document(doc2).unwrap();
706
707 writer.commit().await.unwrap();
708
709 let index = Index::open(dir, config).await.unwrap();
711 assert_eq!(index.num_docs().await.unwrap(), 2);
712
713 let doc = index.doc(0).await.unwrap().unwrap();
715 let all_uris: Vec<_> = doc.get_all(uris).collect();
716 assert_eq!(all_uris.len(), 2, "Should have 2 uris values");
717 assert_eq!(all_uris[0].as_text(), Some("one"));
718 assert_eq!(all_uris[1].as_text(), Some("two"));
719
720 let json = doc.to_json(index.schema());
722 let uris_json = json.get("uris").unwrap();
723 assert!(uris_json.is_array(), "Multi-value field should be an array");
724 let uris_arr = uris_json.as_array().unwrap();
725 assert_eq!(uris_arr.len(), 2);
726 assert_eq!(uris_arr[0].as_str(), Some("one"));
727 assert_eq!(uris_arr[1].as_str(), Some("two"));
728
729 let results = index.query("uris:one", 10).await.unwrap();
731 assert_eq!(results.hits.len(), 1, "Should find doc with 'one'");
732 assert_eq!(results.hits[0].address.doc_id, 0);
733
734 let results = index.query("uris:two", 10).await.unwrap();
735 assert_eq!(results.hits.len(), 1, "Should find doc with 'two'");
736 assert_eq!(results.hits[0].address.doc_id, 0);
737
738 let results = index.query("uris:three", 10).await.unwrap();
739 assert_eq!(results.hits.len(), 1, "Should find doc with 'three'");
740 assert_eq!(results.hits[0].address.doc_id, 1);
741
742 let results = index.query("uris:nonexistent", 10).await.unwrap();
744 assert_eq!(results.hits.len(), 0, "Should not find non-existent value");
745 }
746
747 #[tokio::test]
754 async fn test_wand_optimization_for_or_queries() {
755 use crate::query::{BooleanQuery, TermQuery};
756
757 let mut schema_builder = SchemaBuilder::default();
758 let content = schema_builder.add_text_field("content", true, true);
759 let schema = schema_builder.build();
760
761 let dir = RamDirectory::new();
762 let config = IndexConfig::default();
763
764 let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
766 .await
767 .unwrap();
768
769 let mut doc = Document::new();
771 doc.add_text(content, "rust programming language is fast");
772 writer.add_document(doc).unwrap();
773
774 let mut doc = Document::new();
776 doc.add_text(content, "rust is a systems language");
777 writer.add_document(doc).unwrap();
778
779 let mut doc = Document::new();
781 doc.add_text(content, "programming is fun");
782 writer.add_document(doc).unwrap();
783
784 let mut doc = Document::new();
786 doc.add_text(content, "python is easy to learn");
787 writer.add_document(doc).unwrap();
788
789 let mut doc = Document::new();
791 doc.add_text(content, "rust rust programming programming systems");
792 writer.add_document(doc).unwrap();
793
794 writer.commit().await.unwrap();
795
796 let index = Index::open(dir.clone(), config.clone()).await.unwrap();
798
799 let or_query = BooleanQuery::new()
801 .should(TermQuery::text(content, "rust"))
802 .should(TermQuery::text(content, "programming"));
803
804 let results = index.search(&or_query, 10).await.unwrap();
805
806 assert_eq!(results.hits.len(), 4, "Should find exactly 4 documents");
808
809 let doc_ids: Vec<u32> = results.hits.iter().map(|h| h.address.doc_id).collect();
810 assert!(doc_ids.contains(&0), "Should find doc 0");
811 assert!(doc_ids.contains(&1), "Should find doc 1");
812 assert!(doc_ids.contains(&2), "Should find doc 2");
813 assert!(doc_ids.contains(&4), "Should find doc 4");
814 assert!(
815 !doc_ids.contains(&3),
816 "Should NOT find doc 3 (only has 'python')"
817 );
818
819 let single_query = BooleanQuery::new().should(TermQuery::text(content, "rust"));
821
822 let results = index.search(&single_query, 10).await.unwrap();
823 assert_eq!(results.hits.len(), 3, "Should find 3 documents with 'rust'");
824
825 let must_query = BooleanQuery::new()
827 .must(TermQuery::text(content, "rust"))
828 .should(TermQuery::text(content, "programming"));
829
830 let results = index.search(&must_query, 10).await.unwrap();
831 assert_eq!(results.hits.len(), 3, "Should find 3 documents with 'rust'");
833
834 let must_not_query = BooleanQuery::new()
836 .should(TermQuery::text(content, "rust"))
837 .should(TermQuery::text(content, "programming"))
838 .must_not(TermQuery::text(content, "systems"));
839
840 let results = index.search(&must_not_query, 10).await.unwrap();
841 let doc_ids: Vec<u32> = results.hits.iter().map(|h| h.address.doc_id).collect();
843 assert!(
844 !doc_ids.contains(&1),
845 "Should NOT find doc 1 (has 'systems')"
846 );
847 assert!(
848 !doc_ids.contains(&4),
849 "Should NOT find doc 4 (has 'systems')"
850 );
851
852 let or_query = BooleanQuery::new()
854 .should(TermQuery::text(content, "rust"))
855 .should(TermQuery::text(content, "programming"));
856
857 let results = index.search(&or_query, 2).await.unwrap();
858 assert_eq!(results.hits.len(), 2, "Should return only top 2 results");
859
860 }
863
864 #[tokio::test]
866 async fn test_wand_results_match_standard_boolean() {
867 use crate::query::{BooleanQuery, TermQuery, WandOrQuery};
868
869 let mut schema_builder = SchemaBuilder::default();
870 let content = schema_builder.add_text_field("content", true, true);
871 let schema = schema_builder.build();
872
873 let dir = RamDirectory::new();
874 let config = IndexConfig::default();
875
876 let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
877 .await
878 .unwrap();
879
880 for i in 0..10 {
882 let mut doc = Document::new();
883 let text = match i % 4 {
884 0 => "apple banana cherry",
885 1 => "apple orange",
886 2 => "banana grape",
887 _ => "cherry date",
888 };
889 doc.add_text(content, text);
890 writer.add_document(doc).unwrap();
891 }
892
893 writer.commit().await.unwrap();
894 let index = Index::open(dir.clone(), config.clone()).await.unwrap();
895
896 let wand_query = WandOrQuery::new(content).term("apple").term("banana");
898
899 let bool_query = BooleanQuery::new()
900 .should(TermQuery::text(content, "apple"))
901 .should(TermQuery::text(content, "banana"));
902
903 let wand_results = index.search(&wand_query, 10).await.unwrap();
904 let bool_results = index.search(&bool_query, 10).await.unwrap();
905
906 assert_eq!(
908 wand_results.hits.len(),
909 bool_results.hits.len(),
910 "WAND and Boolean should find same number of docs"
911 );
912
913 let wand_docs: std::collections::HashSet<u32> =
914 wand_results.hits.iter().map(|h| h.address.doc_id).collect();
915 let bool_docs: std::collections::HashSet<u32> =
916 bool_results.hits.iter().map(|h| h.address.doc_id).collect();
917
918 assert_eq!(
919 wand_docs, bool_docs,
920 "WAND and Boolean should find same documents"
921 );
922 }
923
924 #[tokio::test]
925 async fn test_vector_index_threshold_switch() {
926 use crate::dsl::{DenseVectorConfig, DenseVectorQuantization, VectorIndexType};
927
928 let mut schema_builder = SchemaBuilder::default();
930 let title = schema_builder.add_text_field("title", true, true);
931 let embedding = schema_builder.add_dense_vector_field_with_config(
932 "embedding",
933 true, true, DenseVectorConfig {
936 dim: 8,
937 index_type: VectorIndexType::IvfRaBitQ,
938 quantization: DenseVectorQuantization::F32,
939 num_clusters: Some(4), nprobe: 2,
941 build_threshold: Some(50), },
943 );
944 let schema = schema_builder.build();
945
946 let dir = RamDirectory::new();
947 let config = IndexConfig::default();
948
949 let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
951 .await
952 .unwrap();
953
954 for i in 0..30 {
956 let mut doc = Document::new();
957 doc.add_text(title, format!("Document {}", i));
958 let vec: Vec<f32> = (0..8).map(|_| (i as f32) / 30.0).collect();
960 doc.add_dense_vector(embedding, vec);
961 writer.add_document(doc).unwrap();
962 }
963 writer.commit().await.unwrap();
964
965 let index = Index::open(dir.clone(), config.clone()).await.unwrap();
967 assert!(
968 index.trained_centroids.is_empty(),
969 "Should not have trained centroids below threshold"
970 );
971
972 let query_vec: Vec<f32> = vec![0.5; 8];
974 let segments = index.segment_readers().await.unwrap();
975 assert!(!segments.is_empty());
976
977 let results = segments[0]
978 .search_dense_vector(
979 embedding,
980 &query_vec,
981 5,
982 0,
983 1,
984 crate::query::MultiValueCombiner::Max,
985 )
986 .await
987 .unwrap();
988 assert!(!results.is_empty(), "Flat search should return results");
989
990 let writer = IndexWriter::open(dir.clone(), config.clone())
992 .await
993 .unwrap();
994
995 for i in 30..60 {
997 let mut doc = Document::new();
998 doc.add_text(title, format!("Document {}", i));
999 let vec: Vec<f32> = (0..8).map(|_| (i as f32) / 60.0).collect();
1000 doc.add_dense_vector(embedding, vec);
1001 writer.add_document(doc).unwrap();
1002 }
1003 writer.commit().await.unwrap();
1005
1006 assert!(
1008 writer.is_vector_index_built(embedding).await,
1009 "Vector index should be built after crossing threshold"
1010 );
1011
1012 let index = Index::open(dir.clone(), config.clone()).await.unwrap();
1014 assert!(
1015 index.trained_centroids.contains_key(&embedding.0),
1016 "Should have loaded trained centroids for embedding field"
1017 );
1018
1019 let segments = index.segment_readers().await.unwrap();
1021 let results = segments[0]
1022 .search_dense_vector(
1023 embedding,
1024 &query_vec,
1025 5,
1026 0,
1027 1,
1028 crate::query::MultiValueCombiner::Max,
1029 )
1030 .await
1031 .unwrap();
1032 assert!(
1033 !results.is_empty(),
1034 "Search should return results after build"
1035 );
1036
1037 let writer = IndexWriter::open(dir.clone(), config.clone())
1039 .await
1040 .unwrap();
1041 writer.build_vector_index().await.unwrap(); assert!(writer.is_vector_index_built(embedding).await);
1045 }
1046}