1#[cfg(feature = "native")]
11use crate::dsl::Schema;
12#[cfg(feature = "native")]
13use crate::error::Result;
14#[cfg(feature = "native")]
15use crate::structures::{CoarseCentroids, PQCodebook};
16#[cfg(feature = "native")]
17use rustc_hash::FxHashMap;
18#[cfg(feature = "native")]
19use std::sync::Arc;
20
21mod searcher;
22pub use searcher::Searcher;
23
24#[cfg(feature = "native")]
25mod reader;
26#[cfg(feature = "native")]
27mod vector_builder;
28#[cfg(feature = "native")]
29mod writer;
30#[cfg(feature = "native")]
31pub use reader::IndexReader;
32#[cfg(feature = "native")]
33pub use writer::IndexWriter;
34
35mod metadata;
36pub use metadata::{FieldVectorMeta, INDEX_META_FILENAME, IndexMetadata, VectorIndexState};
37
38#[cfg(feature = "native")]
39mod helpers;
40#[cfg(feature = "native")]
41pub use helpers::{
42 IndexingStats, SchemaConfig, SchemaFieldConfig, create_index_at_path, create_index_from_sdl,
43 index_documents_from_reader, index_json_document, parse_schema,
44};
45
/// File name under which the slice cache is persisted in the index directory.
pub const SLICE_CACHE_FILENAME: &str = "index.slicecache";
48
/// Tunable knobs controlling threading, caching, memory budget, merging,
/// and reader reload behavior for an [`Index`].
#[derive(Debug, Clone)]
pub struct IndexConfig {
    /// General worker-thread count (defaults to the number of available CPUs).
    pub num_threads: usize,
    /// Threads dedicated to indexing (defaults to 1).
    pub num_indexing_threads: usize,
    /// Threads dedicated to compression work (defaults to the CPU count).
    pub num_compression_threads: usize,
    /// Capacity, in blocks, of the term-dictionary cache.
    pub term_cache_blocks: usize,
    /// Capacity, in blocks, of the document-store cache.
    pub store_cache_blocks: usize,
    /// Indexing memory budget in bytes; tests use small values here to force
    /// multiple segments, so this presumably bounds the in-memory segment
    /// before a flush — TODO confirm against the writer implementation.
    pub max_indexing_memory_bytes: usize,
    /// Policy deciding which segments are merged together.
    pub merge_policy: Box<dyn crate::merge::MergePolicy>,
    /// Index-level optimization settings.
    pub optimization: crate::structures::IndexOptimization,
    /// Reload interval, in milliseconds, passed to readers created by
    /// [`Index::reader`].
    pub reload_interval_ms: u64,
}
71
72impl Default for IndexConfig {
73 fn default() -> Self {
74 #[cfg(feature = "native")]
75 let cpus = num_cpus::get().max(1);
76 #[cfg(not(feature = "native"))]
77 let cpus = 1;
78
79 Self {
80 num_threads: cpus,
81 num_indexing_threads: 1,
82 num_compression_threads: cpus,
83 term_cache_blocks: 256,
84 store_cache_blocks: 32,
85 max_indexing_memory_bytes: 256 * 1024 * 1024, merge_policy: Box::new(crate::merge::TieredMergePolicy::default()),
87 optimization: crate::structures::IndexOptimization::default(),
88 reload_interval_ms: 1000, }
90 }
91}
92
/// A search index backed by a directory `D`, tying together the schema,
/// segment management, optional trained vector structures, and a shared
/// cached reader.
#[cfg(feature = "native")]
pub struct Index<D: crate::directories::DirectoryWriter + 'static> {
    /// Backing storage the index reads from and writes to.
    directory: Arc<D>,
    /// Field layout of documents in this index.
    schema: Arc<Schema>,
    /// Configuration this index was created/opened with.
    config: IndexConfig,
    /// Tracks live segments and applies the configured merge policy.
    segment_manager: Arc<crate::merge::SegmentManager<D>>,
    /// Trained IVF coarse centroids per vector field, keyed by field id;
    /// empty until a vector index has been built.
    trained_centroids: FxHashMap<u32, Arc<CoarseCentroids>>,
    /// Trained product-quantization codebooks per vector field, keyed by
    /// field id; empty until a vector index has been built.
    trained_codebooks: FxHashMap<u32, Arc<PQCodebook>>,
    /// Lazily-initialized shared reader; created on first `reader()` call.
    cached_reader: tokio::sync::OnceCell<IndexReader<D>>,
}
115
#[cfg(feature = "native")]
impl<D: crate::directories::DirectoryWriter + 'static> Index<D> {
    /// Creates a brand-new index in `directory` with the given schema and
    /// configuration, persisting the initial metadata before returning.
    pub async fn create(directory: D, schema: Schema, config: IndexConfig) -> Result<Self> {
        let directory = Arc::new(directory);
        let schema = Arc::new(schema);
        let metadata = IndexMetadata::new((*schema).clone());

        let segment_manager = Arc::new(crate::merge::SegmentManager::new(
            Arc::clone(&directory),
            Arc::clone(&schema),
            metadata,
            config.merge_policy.clone_box(),
            config.term_cache_blocks,
        ));

        // No-op mutation forces the freshly created metadata to be persisted.
        segment_manager.update_metadata(|_| {}).await?;

        Ok(Self {
            directory,
            schema,
            config,
            segment_manager,
            // A new index has no trained vector structures yet.
            trained_centroids: FxHashMap::default(),
            trained_codebooks: FxHashMap::default(),
            cached_reader: tokio::sync::OnceCell::new(),
        })
    }

    /// Opens an existing index from `directory`, restoring the schema and any
    /// previously persisted trained vector structures (coarse centroids and
    /// PQ codebooks).
    pub async fn open(directory: D, config: IndexConfig) -> Result<Self> {
        let directory = Arc::new(directory);

        let metadata = IndexMetadata::load(directory.as_ref()).await?;
        let schema = Arc::new(metadata.schema.clone());

        // Trained structures are optional: they only exist once a vector
        // index has been built and its trained data persisted.
        let trained = metadata.load_trained_structures(directory.as_ref()).await;
        let trained_centroids = trained
            .as_ref()
            .map(|t| t.centroids.clone())
            .unwrap_or_default();
        let trained_codebooks = trained
            .as_ref()
            .map(|t| t.codebooks.clone())
            .unwrap_or_default();

        log::info!(
            "[Index::open] trained_centroids fields={:?}, trained_codebooks fields={:?}",
            trained_centroids.keys().collect::<Vec<_>>(),
            trained_codebooks.keys().collect::<Vec<_>>(),
        );

        let segment_manager = Arc::new(crate::merge::SegmentManager::new(
            Arc::clone(&directory),
            Arc::clone(&schema),
            metadata,
            config.merge_policy.clone_box(),
            config.term_cache_blocks,
        ));

        Ok(Self {
            directory,
            schema,
            config,
            segment_manager,
            trained_centroids,
            trained_codebooks,
            cached_reader: tokio::sync::OnceCell::new(),
        })
    }

    /// Returns the index schema.
    pub fn schema(&self) -> &Schema {
        &self.schema
    }

    /// Returns the backing directory.
    pub fn directory(&self) -> &D {
        &self.directory
    }

    /// Returns the segment manager shared by writers and readers.
    pub fn segment_manager(&self) -> &Arc<crate::merge::SegmentManager<D>> {
        &self.segment_manager
    }

    /// Returns the lazily-created, cached [`IndexReader`].
    ///
    /// The first caller initializes the reader (propagating any error);
    /// concurrent and subsequent callers share the same instance, which is
    /// configured with this index's `reload_interval_ms`.
    pub async fn reader(&self) -> Result<&IndexReader<D>> {
        self.cached_reader
            .get_or_try_init(|| async {
                IndexReader::from_segment_manager_with_reload_interval(
                    Arc::clone(&self.schema),
                    Arc::clone(&self.segment_manager),
                    self.trained_centroids.clone(),
                    self.config.term_cache_blocks,
                    self.config.reload_interval_ms,
                )
                .await
            })
            .await
    }

    /// Returns the configuration this index was created/opened with.
    pub fn config(&self) -> &IndexConfig {
        &self.config
    }

    /// Returns the trained IVF coarse centroids, keyed by field id.
    pub fn trained_centroids(&self) -> &FxHashMap<u32, Arc<CoarseCentroids>> {
        &self.trained_centroids
    }

    /// Returns the trained PQ codebooks, keyed by field id.
    pub fn trained_codebooks(&self) -> &FxHashMap<u32, Arc<PQCodebook>> {
        &self.trained_codebooks
    }

    /// Returns the current set of segment readers via the cached reader's
    /// searcher.
    pub async fn segment_readers(&self) -> Result<Vec<Arc<crate::segment::SegmentReader>>> {
        let reader = self.reader().await?;
        let searcher = reader.searcher().await?;
        Ok(searcher.segment_readers().to_vec())
    }

    /// Returns the total number of documents visible to a fresh searcher.
    pub async fn num_docs(&self) -> Result<u32> {
        let reader = self.reader().await?;
        let searcher = reader.searcher().await?;
        Ok(searcher.num_docs())
    }

    /// Fetches a stored document by global doc id, or `None` if absent.
    pub async fn doc(&self, doc_id: crate::DocId) -> Result<Option<crate::dsl::Document>> {
        let reader = self.reader().await?;
        let searcher = reader.searcher().await?;
        searcher.doc(doc_id).await
    }

    /// Fields searched when a query names no field: the schema's configured
    /// default fields, or — if none are configured — every indexed text field.
    pub fn default_fields(&self) -> Vec<crate::Field> {
        if !self.schema.default_fields().is_empty() {
            self.schema.default_fields().to_vec()
        } else {
            self.schema
                .fields()
                .filter(|(_, entry)| {
                    entry.indexed && entry.field_type == crate::dsl::FieldType::Text
                })
                .map(|(field, _)| field)
                .collect()
        }
    }

    /// Returns a tokenizer registry.
    ///
    /// NOTE(review): builds a fresh default registry on every call rather
    /// than caching one; callers that need custom tokenizers cannot register
    /// them through this accessor.
    pub fn tokenizers(&self) -> Arc<crate::tokenizer::TokenizerRegistry> {
        Arc::new(crate::tokenizer::TokenizerRegistry::default())
    }

    /// Builds a query-language parser over this index's schema and default
    /// fields.
    ///
    /// If the schema defines query-router rules and they compile into a
    /// [`crate::dsl::QueryFieldRouter`], the parser is constructed with that
    /// router; otherwise a plain parser is returned.
    pub fn query_parser(&self) -> crate::dsl::QueryLanguageParser {
        let default_fields = self.default_fields();
        let tokenizers = self.tokenizers();

        let query_routers = self.schema.query_routers();
        if !query_routers.is_empty()
            && let Ok(router) = crate::dsl::QueryFieldRouter::from_rules(query_routers)
        {
            return crate::dsl::QueryLanguageParser::with_router(
                Arc::clone(&self.schema),
                default_fields,
                tokenizers,
                router,
            );
        }

        crate::dsl::QueryLanguageParser::new(Arc::clone(&self.schema), default_fields, tokenizers)
    }

    /// Parses `query_str` and returns the top `limit` hits.
    pub async fn query(
        &self,
        query_str: &str,
        limit: usize,
    ) -> Result<crate::query::SearchResponse> {
        self.query_offset(query_str, limit, 0).await
    }

    /// Parses `query_str` and returns hits `offset..offset + limit`.
    pub async fn query_offset(
        &self,
        query_str: &str,
        limit: usize,
        offset: usize,
    ) -> Result<crate::query::SearchResponse> {
        let parser = self.query_parser();
        let query = parser
            .parse(query_str)
            .map_err(crate::error::Error::Query)?;
        self.search_offset(query.as_ref(), limit, offset).await
    }

    /// Runs an already-built query and returns the top `limit` hits.
    pub async fn search(
        &self,
        query: &dyn crate::query::Query,
        limit: usize,
    ) -> Result<crate::query::SearchResponse> {
        self.search_offset(query, limit, 0).await
    }

    /// Runs `query` against every segment concurrently and returns hits
    /// `offset..offset + limit` of the merged, score-sorted result list.
    pub async fn search_offset(
        &self,
        query: &dyn crate::query::Query,
        limit: usize,
        offset: usize,
    ) -> Result<crate::query::SearchResponse> {
        let reader = self.reader().await?;
        let searcher = reader.searcher().await?;
        let segments = searcher.segment_readers();

        // Each segment must contribute enough candidates to fill the
        // requested page after the global merge.
        let fetch_limit = offset + limit;

        // One future per segment; results are tagged with the segment id so
        // hits can be globally addressed after merging.
        let futures: Vec<_> = segments
            .iter()
            .map(|segment| {
                let sid = segment.meta().id;
                async move {
                    let results =
                        crate::query::search_segment(segment.as_ref(), query, fetch_limit).await?;
                    Ok::<_, crate::error::Error>(
                        results
                            .into_iter()
                            .map(move |r| (sid, r))
                            .collect::<Vec<_>>(),
                    )
                }
            })
            .collect();

        let batches = futures::future::try_join_all(futures).await?;
        let mut all_results: Vec<(u128, crate::query::SearchResult)> =
            Vec::with_capacity(batches.iter().map(|b| b.len()).sum());
        for batch in batches {
            all_results.extend(batch);
        }

        // Sort by descending score; incomparable (NaN) scores tie.
        all_results.sort_by(|a, b| {
            b.1.score
                .partial_cmp(&a.1.score)
                .unwrap_or(std::cmp::Ordering::Equal)
        });

        // NOTE(review): each segment returns at most `fetch_limit` results,
        // so this is a lower bound on matching docs, not an exact
        // corpus-wide hit count.
        let total_hits = all_results.len() as u32;

        let hits: Vec<crate::query::SearchHit> = all_results
            .into_iter()
            .skip(offset)
            .take(limit)
            .map(|(segment_id, result)| crate::query::SearchHit {
                address: crate::query::DocAddress::new(segment_id, result.doc_id),
                score: result.score,
                matched_fields: result.extract_ordinals(),
            })
            .collect();

        Ok(crate::query::SearchResponse { hits, total_hits })
    }

    /// Fetches the document behind a global [`crate::query::DocAddress`], or
    /// `None` when no live segment matches the address.
    ///
    /// # Errors
    /// Returns a query error if the address's segment id cannot be parsed.
    pub async fn get_document(
        &self,
        address: &crate::query::DocAddress,
    ) -> Result<Option<crate::dsl::Document>> {
        let segment_id = address.segment_id_u128().ok_or_else(|| {
            crate::error::Error::Query(format!("Invalid segment ID: {}", address.segment_id))
        })?;

        let reader = self.reader().await?;
        let searcher = reader.searcher().await?;

        for segment in searcher.segment_readers() {
            if segment.meta().id == segment_id {
                // Translate global doc id to segment-local; wrapping_sub
                // avoids a debug-build underflow panic for stale addresses —
                // presumably such out-of-range ids just miss in `doc` (TODO
                // confirm against SegmentReader::doc).
                let local_doc_id = address.doc_id.wrapping_sub(segment.doc_id_offset());
                return segment.doc(local_doc_id).await;
            }
        }

        Ok(None)
    }

    /// Currently a no-op: readers are created with a reload interval (see
    /// [`Index::reader`]) and are expected to refresh themselves.
    pub async fn reload(&self) -> Result<()> {
        Ok(())
    }

    /// Looks up the posting list for `term` in `field` across all segments,
    /// returning a `(segment, postings)` pair for each segment containing
    /// the term.
    pub async fn get_postings(
        &self,
        field: crate::Field,
        term: &[u8],
    ) -> Result<
        Vec<(
            Arc<crate::segment::SegmentReader>,
            crate::structures::BlockPostingList,
        )>,
    > {
        let segments = self.segment_readers().await?;
        let mut results = Vec::new();

        for segment in segments {
            if let Some(postings) = segment.get_postings(field, term).await? {
                results.push((segment, postings));
            }
        }

        Ok(results)
    }
}
443
#[cfg(feature = "native")]
impl<D: crate::directories::DirectoryWriter + 'static> Index<D> {
    /// Creates an [`IndexWriter`] bound to this index's directory, schema,
    /// and configuration.
    pub fn writer(&self) -> writer::IndexWriter<D> {
        writer::IndexWriter::from_index(self)
    }
}
452
// Integration tests exercising index creation, segment lifecycle, queries,
// caching, multi-value fields, WAND scoring, and vector-index threshold
// behavior — all against an in-memory RamDirectory.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::directories::RamDirectory;
    use crate::dsl::{Document, SchemaBuilder};

    // Basic round-trip: write two docs, reopen, check counts, postings,
    // and stored-field retrieval.
    #[tokio::test]
    async fn test_index_create_and_search() {
        let mut schema_builder = SchemaBuilder::default();
        let title = schema_builder.add_text_field("title", true, true);
        let body = schema_builder.add_text_field("body", true, true);
        let schema = schema_builder.build();

        let dir = RamDirectory::new();
        let config = IndexConfig::default();

        let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
            .await
            .unwrap();

        let mut doc1 = Document::new();
        doc1.add_text(title, "Hello World");
        doc1.add_text(body, "This is the first document");
        writer.add_document(doc1).unwrap();

        let mut doc2 = Document::new();
        doc2.add_text(title, "Goodbye World");
        doc2.add_text(body, "This is the second document");
        writer.add_document(doc2).unwrap();

        writer.commit().await.unwrap();

        let index = Index::open(dir, config).await.unwrap();
        assert_eq!(index.num_docs().await.unwrap(), 2);

        // "world" appears in both titles, so one segment with 2 docs.
        let postings = index.get_postings(title, b"world").await.unwrap();
        assert_eq!(postings.len(), 1); assert_eq!(postings[0].1.doc_count(), 2); let doc = index.doc(0).await.unwrap().unwrap();
        assert_eq!(doc.get_first(title).unwrap().as_text(), Some("Hello World"));
    }

    // A tiny memory budget should force commits into multiple segments.
    #[tokio::test]
    async fn test_multiple_segments() {
        let mut schema_builder = SchemaBuilder::default();
        let title = schema_builder.add_text_field("title", true, true);
        let schema = schema_builder.build();

        let dir = RamDirectory::new();
        let config = IndexConfig {
            max_indexing_memory_bytes: 1024, ..Default::default()
        };

        let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
            .await
            .unwrap();

        for batch in 0..3 {
            for i in 0..5 {
                let mut doc = Document::new();
                doc.add_text(title, format!("Document {} batch {}", i, batch));
                writer.add_document(doc).unwrap();
            }
            writer.commit().await.unwrap();
        }

        let index = Index::open(dir, config).await.unwrap();
        assert_eq!(index.num_docs().await.unwrap(), 15);
        assert!(
            index.segment_readers().await.unwrap().len() >= 2,
            "Expected multiple segments"
        );
    }

    // force_merge should collapse multiple flushed segments into one without
    // losing any documents.
    #[tokio::test]
    async fn test_segment_merge() {
        let mut schema_builder = SchemaBuilder::default();
        let title = schema_builder.add_text_field("title", true, true);
        let schema = schema_builder.build();

        let dir = RamDirectory::new();
        let config = IndexConfig {
            max_indexing_memory_bytes: 512, ..Default::default()
        };

        let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
            .await
            .unwrap();

        for batch in 0..3 {
            for i in 0..3 {
                let mut doc = Document::new();
                doc.add_text(title, format!("Document {} batch {}", i, batch));
                writer.add_document(doc).unwrap();
            }
            writer.flush().await.unwrap();
        }
        writer.commit().await.unwrap();

        let index = Index::open(dir.clone(), config.clone()).await.unwrap();
        assert!(
            index.segment_readers().await.unwrap().len() >= 2,
            "Expected multiple segments"
        );

        let writer = IndexWriter::open(dir.clone(), config.clone())
            .await
            .unwrap();
        writer.force_merge().await.unwrap();

        let index = Index::open(dir, config).await.unwrap();
        assert_eq!(index.segment_readers().await.unwrap().len(), 1);
        assert_eq!(index.num_docs().await.unwrap(), 9);

        // Every doc id must still resolve after the merge.
        let mut found_docs = 0;
        for i in 0..9 {
            if index.doc(i).await.unwrap().is_some() {
                found_docs += 1;
            }
        }
        assert_eq!(found_docs, 9);
    }

    // Query-string search end-to-end: hit counts, addresses, and document
    // retrieval via both DocAddress and global doc id.
    #[tokio::test]
    async fn test_match_query() {
        let mut schema_builder = SchemaBuilder::default();
        let title = schema_builder.add_text_field("title", true, true);
        let body = schema_builder.add_text_field("body", true, true);
        let schema = schema_builder.build();

        let dir = RamDirectory::new();
        let config = IndexConfig::default();

        let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
            .await
            .unwrap();

        let mut doc1 = Document::new();
        doc1.add_text(title, "rust programming");
        doc1.add_text(body, "Learn rust language");
        writer.add_document(doc1).unwrap();

        let mut doc2 = Document::new();
        doc2.add_text(title, "python programming");
        doc2.add_text(body, "Learn python language");
        writer.add_document(doc2).unwrap();

        writer.commit().await.unwrap();

        let index = Index::open(dir, config).await.unwrap();

        let results = index.query("rust", 10).await.unwrap();
        assert_eq!(results.hits.len(), 1);

        let results = index.query("rust programming", 10).await.unwrap();
        assert!(!results.hits.is_empty());

        let hit = &results.hits[0];
        assert!(!hit.address.segment_id.is_empty(), "Should have segment_id");

        let doc = index.get_document(&hit.address).await.unwrap().unwrap();
        assert!(
            !doc.field_values().is_empty(),
            "Doc should have field values"
        );

        let doc = index.doc(0).await.unwrap().unwrap();
        assert!(
            !doc.field_values().is_empty(),
            "Doc should have field values"
        );
    }

    // Searching through a SliceCachingDirectory should populate the cache.
    #[tokio::test]
    async fn test_slice_cache_warmup_and_load() {
        use crate::directories::SliceCachingDirectory;

        let mut schema_builder = SchemaBuilder::default();
        let title = schema_builder.add_text_field("title", true, true);
        let body = schema_builder.add_text_field("body", true, true);
        let schema = schema_builder.build();

        let dir = RamDirectory::new();
        let config = IndexConfig::default();

        let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
            .await
            .unwrap();

        for i in 0..10 {
            let mut doc = Document::new();
            doc.add_text(title, format!("Document {} about rust", i));
            doc.add_text(body, format!("This is body text number {}", i));
            writer.add_document(doc).unwrap();
        }
        writer.commit().await.unwrap();

        let caching_dir = SliceCachingDirectory::new(dir.clone(), 1024 * 1024);
        let index = Index::open(caching_dir, config.clone()).await.unwrap();

        let results = index.query("rust", 10).await.unwrap();
        assert!(!results.hits.is_empty());

        let stats = index.directory.stats();
        assert!(stats.total_bytes > 0, "Cache should have data after search");
    }

    // Multi-value text fields: storage order, JSON projection as an array,
    // and per-value searchability.
    #[tokio::test]
    async fn test_multivalue_field_indexing_and_search() {
        let mut schema_builder = SchemaBuilder::default();
        let uris = schema_builder.add_text_field("uris", true, true);
        let title = schema_builder.add_text_field("title", true, true);
        let schema = schema_builder.build();

        let dir = RamDirectory::new();
        let config = IndexConfig::default();

        let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
            .await
            .unwrap();

        let mut doc = Document::new();
        doc.add_text(uris, "one");
        doc.add_text(uris, "two");
        doc.add_text(title, "Test Document");
        writer.add_document(doc).unwrap();

        let mut doc2 = Document::new();
        doc2.add_text(uris, "three");
        doc2.add_text(title, "Another Document");
        writer.add_document(doc2).unwrap();

        writer.commit().await.unwrap();

        let index = Index::open(dir, config).await.unwrap();
        assert_eq!(index.num_docs().await.unwrap(), 2);

        // Both values round-trip in insertion order.
        let doc = index.doc(0).await.unwrap().unwrap();
        let all_uris: Vec<_> = doc.get_all(uris).collect();
        assert_eq!(all_uris.len(), 2, "Should have 2 uris values");
        assert_eq!(all_uris[0].as_text(), Some("one"));
        assert_eq!(all_uris[1].as_text(), Some("two"));

        let json = doc.to_json(index.schema());
        let uris_json = json.get("uris").unwrap();
        assert!(uris_json.is_array(), "Multi-value field should be an array");
        let uris_arr = uris_json.as_array().unwrap();
        assert_eq!(uris_arr.len(), 2);
        assert_eq!(uris_arr[0].as_str(), Some("one"));
        assert_eq!(uris_arr[1].as_str(), Some("two"));

        let results = index.query("uris:one", 10).await.unwrap();
        assert_eq!(results.hits.len(), 1, "Should find doc with 'one'");
        assert_eq!(results.hits[0].address.doc_id, 0);

        let results = index.query("uris:two", 10).await.unwrap();
        assert_eq!(results.hits.len(), 1, "Should find doc with 'two'");
        assert_eq!(results.hits[0].address.doc_id, 0);

        let results = index.query("uris:three", 10).await.unwrap();
        assert_eq!(results.hits.len(), 1, "Should find doc with 'three'");
        assert_eq!(results.hits[0].address.doc_id, 1);

        let results = index.query("uris:nonexistent", 10).await.unwrap();
        assert_eq!(results.hits.len(), 0, "Should not find non-existent value");
    }

    // Boolean OR / MUST / MUST_NOT combinations and top-k truncation over a
    // small hand-built corpus.
    #[tokio::test]
    async fn test_wand_optimization_for_or_queries() {
        use crate::query::{BooleanQuery, TermQuery};

        let mut schema_builder = SchemaBuilder::default();
        let content = schema_builder.add_text_field("content", true, true);
        let schema = schema_builder.build();

        let dir = RamDirectory::new();
        let config = IndexConfig::default();

        let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
            .await
            .unwrap();

        let mut doc = Document::new();
        doc.add_text(content, "rust programming language is fast");
        writer.add_document(doc).unwrap();

        let mut doc = Document::new();
        doc.add_text(content, "rust is a systems language");
        writer.add_document(doc).unwrap();

        let mut doc = Document::new();
        doc.add_text(content, "programming is fun");
        writer.add_document(doc).unwrap();

        let mut doc = Document::new();
        doc.add_text(content, "python is easy to learn");
        writer.add_document(doc).unwrap();

        let mut doc = Document::new();
        doc.add_text(content, "rust rust programming programming systems");
        writer.add_document(doc).unwrap();

        writer.commit().await.unwrap();

        let index = Index::open(dir.clone(), config.clone()).await.unwrap();

        let or_query = BooleanQuery::new()
            .should(TermQuery::text(content, "rust"))
            .should(TermQuery::text(content, "programming"));

        let results = index.search(&or_query, 10).await.unwrap();

        assert_eq!(results.hits.len(), 4, "Should find exactly 4 documents");

        let doc_ids: Vec<u32> = results.hits.iter().map(|h| h.address.doc_id).collect();
        assert!(doc_ids.contains(&0), "Should find doc 0");
        assert!(doc_ids.contains(&1), "Should find doc 1");
        assert!(doc_ids.contains(&2), "Should find doc 2");
        assert!(doc_ids.contains(&4), "Should find doc 4");
        assert!(
            !doc_ids.contains(&3),
            "Should NOT find doc 3 (only has 'python')"
        );

        let single_query = BooleanQuery::new().should(TermQuery::text(content, "rust"));

        let results = index.search(&single_query, 10).await.unwrap();
        assert_eq!(results.hits.len(), 3, "Should find 3 documents with 'rust'");

        let must_query = BooleanQuery::new()
            .must(TermQuery::text(content, "rust"))
            .should(TermQuery::text(content, "programming"));

        let results = index.search(&must_query, 10).await.unwrap();
        assert_eq!(results.hits.len(), 3, "Should find 3 documents with 'rust'");

        let must_not_query = BooleanQuery::new()
            .should(TermQuery::text(content, "rust"))
            .should(TermQuery::text(content, "programming"))
            .must_not(TermQuery::text(content, "systems"));

        let results = index.search(&must_not_query, 10).await.unwrap();
        let doc_ids: Vec<u32> = results.hits.iter().map(|h| h.address.doc_id).collect();
        assert!(
            !doc_ids.contains(&1),
            "Should NOT find doc 1 (has 'systems')"
        );
        assert!(
            !doc_ids.contains(&4),
            "Should NOT find doc 4 (has 'systems')"
        );

        // limit=2 should truncate to the top 2 hits.
        let or_query = BooleanQuery::new()
            .should(TermQuery::text(content, "rust"))
            .should(TermQuery::text(content, "programming"));

        let results = index.search(&or_query, 2).await.unwrap();
        assert_eq!(results.hits.len(), 2, "Should return only top 2 results");

    }

    // WandOrQuery must return the same document set as an equivalent
    // BooleanQuery of SHOULD term clauses.
    #[tokio::test]
    async fn test_wand_results_match_standard_boolean() {
        use crate::query::{BooleanQuery, TermQuery, WandOrQuery};

        let mut schema_builder = SchemaBuilder::default();
        let content = schema_builder.add_text_field("content", true, true);
        let schema = schema_builder.build();

        let dir = RamDirectory::new();
        let config = IndexConfig::default();

        let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
            .await
            .unwrap();

        for i in 0..10 {
            let mut doc = Document::new();
            let text = match i % 4 {
                0 => "apple banana cherry",
                1 => "apple orange",
                2 => "banana grape",
                _ => "cherry date",
            };
            doc.add_text(content, text);
            writer.add_document(doc).unwrap();
        }

        writer.commit().await.unwrap();
        let index = Index::open(dir.clone(), config.clone()).await.unwrap();

        let wand_query = WandOrQuery::new(content).term("apple").term("banana");

        let bool_query = BooleanQuery::new()
            .should(TermQuery::text(content, "apple"))
            .should(TermQuery::text(content, "banana"));

        let wand_results = index.search(&wand_query, 10).await.unwrap();
        let bool_results = index.search(&bool_query, 10).await.unwrap();

        assert_eq!(
            wand_results.hits.len(),
            bool_results.hits.len(),
            "WAND and Boolean should find same number of docs"
        );

        let wand_docs: std::collections::HashSet<u32> =
            wand_results.hits.iter().map(|h| h.address.doc_id).collect();
        let bool_docs: std::collections::HashSet<u32> =
            bool_results.hits.iter().map(|h| h.address.doc_id).collect();

        assert_eq!(
            wand_docs, bool_docs,
            "WAND and Boolean should find same documents"
        );
    }

    // Below build_threshold (50 docs) the field uses flat search and no
    // trained centroids exist; crossing the threshold triggers a build, after
    // which reopening loads trained centroids and search still works.
    #[tokio::test]
    async fn test_vector_index_threshold_switch() {
        use crate::dsl::{DenseVectorConfig, DenseVectorQuantization, VectorIndexType};

        let mut schema_builder = SchemaBuilder::default();
        let title = schema_builder.add_text_field("title", true, true);
        let embedding = schema_builder.add_dense_vector_field_with_config(
            "embedding",
            true, true, DenseVectorConfig {
                dim: 8,
                index_type: VectorIndexType::IvfRaBitQ,
                quantization: DenseVectorQuantization::F32,
                num_clusters: Some(4), nprobe: 2,
                build_threshold: Some(50), },
        );
        let schema = schema_builder.build();

        let dir = RamDirectory::new();
        let config = IndexConfig::default();

        let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
            .await
            .unwrap();

        // 30 docs: still below the build threshold of 50.
        for i in 0..30 {
            let mut doc = Document::new();
            doc.add_text(title, format!("Document {}", i));
            let vec: Vec<f32> = (0..8).map(|_| (i as f32) / 30.0).collect();
            doc.add_dense_vector(embedding, vec);
            writer.add_document(doc).unwrap();
        }
        writer.commit().await.unwrap();

        let index = Index::open(dir.clone(), config.clone()).await.unwrap();
        assert!(
            index.trained_centroids.is_empty(),
            "Should not have trained centroids below threshold"
        );

        let query_vec: Vec<f32> = vec![0.5; 8];
        let segments = index.segment_readers().await.unwrap();
        assert!(!segments.is_empty());

        let results = segments[0]
            .search_dense_vector(
                embedding,
                &query_vec,
                5,
                0,
                1,
                crate::query::MultiValueCombiner::Max,
            )
            .await
            .unwrap();
        assert!(!results.is_empty(), "Flat search should return results");

        let writer = IndexWriter::open(dir.clone(), config.clone())
            .await
            .unwrap();

        // 30 more docs: total 60 crosses the threshold of 50.
        for i in 30..60 {
            let mut doc = Document::new();
            doc.add_text(title, format!("Document {}", i));
            let vec: Vec<f32> = (0..8).map(|_| (i as f32) / 60.0).collect();
            doc.add_dense_vector(embedding, vec);
            writer.add_document(doc).unwrap();
        }
        writer.commit().await.unwrap();

        assert!(
            writer.is_vector_index_built(embedding).await,
            "Vector index should be built after crossing threshold"
        );

        let index = Index::open(dir.clone(), config.clone()).await.unwrap();
        assert!(
            index.trained_centroids.contains_key(&embedding.0),
            "Should have loaded trained centroids for embedding field"
        );

        let segments = index.segment_readers().await.unwrap();
        let results = segments[0]
            .search_dense_vector(
                embedding,
                &query_vec,
                5,
                0,
                1,
                crate::query::MultiValueCombiner::Max,
            )
            .await
            .unwrap();
        assert!(
            !results.is_empty(),
            "Search should return results after build"
        );

        // An explicit rebuild on an already-built index must stay built.
        let writer = IndexWriter::open(dir.clone(), config.clone())
            .await
            .unwrap();
        writer.build_vector_index().await.unwrap(); assert!(writer.is_vector_index_built(embedding).await);
    }
}