1#[cfg(feature = "native")]
11use crate::dsl::Schema;
12#[cfg(feature = "native")]
13use crate::error::Result;
14#[cfg(feature = "native")]
15use crate::structures::{CoarseCentroids, PQCodebook};
16#[cfg(feature = "native")]
17use rustc_hash::FxHashMap;
18#[cfg(feature = "native")]
19use std::sync::Arc;
20
21mod searcher;
22pub use searcher::Searcher;
23
24#[cfg(feature = "native")]
25mod reader;
26#[cfg(feature = "native")]
27mod vector_builder;
28#[cfg(feature = "native")]
29mod writer;
30#[cfg(feature = "native")]
31pub use reader::IndexReader;
32#[cfg(feature = "native")]
33pub use writer::IndexWriter;
34
35mod metadata;
36pub use metadata::{FieldVectorMeta, INDEX_META_FILENAME, IndexMetadata, VectorIndexState};
37
38#[cfg(feature = "native")]
39mod helpers;
40#[cfg(feature = "native")]
41pub use helpers::{
42 IndexingStats, SchemaConfig, SchemaFieldConfig, create_index_at_path, create_index_from_sdl,
43 index_documents_from_reader, index_json_document, parse_schema,
44};
45
/// File name constant with the value `index.slicecache`.
///
/// NOTE(review): presumably the file a slice-caching directory persists its
/// cache to; nothing in this module reads it, so confirm against the callers.
pub const SLICE_CACHE_FILENAME: &str = "index.slicecache";
48
/// Tunable settings shared by the index reader and writer entry points.
///
/// `IndexConfig::default()` supplies a value for every field, so callers can
/// override single fields with struct-update syntax
/// (`IndexConfig { field: value, ..Default::default() }`).
#[derive(Debug, Clone)]
pub struct IndexConfig {
    /// General thread count. Defaults to the detected CPU count (1 without
    /// the `native` feature). NOTE(review): the consumer is not visible in
    /// this module.
    pub num_threads: usize,
    /// Thread count for indexing. Defaults to 1.
    pub num_indexing_threads: usize,
    /// Thread count for compression. Defaults to the detected CPU count.
    pub num_compression_threads: usize,
    /// Term cache size in blocks. Forwarded to `SegmentManager::new` and to
    /// the `IndexReader` built by `Index::reader`. Defaults to 256.
    pub term_cache_blocks: usize,
    /// Store cache size in blocks. Defaults to 32.
    pub store_cache_blocks: usize,
    /// Indexing memory budget in bytes. Defaults to 256 MiB.
    pub max_indexing_memory_bytes: usize,
    /// Merge policy; cloned via `clone_box()` into each `SegmentManager`.
    /// Defaults to `TieredMergePolicy::default()`.
    pub merge_policy: Box<dyn crate::merge::MergePolicy>,
    /// Index optimization settings. Defaults to `IndexOptimization::default()`.
    pub optimization: crate::structures::IndexOptimization,
    /// Reload interval in milliseconds handed to the `IndexReader`.
    /// Defaults to 1000.
    pub reload_interval_ms: u64,
}
71
72impl Default for IndexConfig {
73 fn default() -> Self {
74 #[cfg(feature = "native")]
75 let cpus = num_cpus::get().max(1);
76 #[cfg(not(feature = "native"))]
77 let cpus = 1;
78
79 Self {
80 num_threads: cpus,
81 num_indexing_threads: 1,
82 num_compression_threads: cpus,
83 term_cache_blocks: 256,
84 store_cache_blocks: 32,
85 max_indexing_memory_bytes: 256 * 1024 * 1024, merge_policy: Box::new(crate::merge::TieredMergePolicy::default()),
87 optimization: crate::structures::IndexOptimization::default(),
88 reload_interval_ms: 1000, }
90 }
91}
92
/// Handle to an index stored in a directory of type `D`.
///
/// Bundles the shared directory, schema, configuration and segment manager
/// together with any trained vector structures loaded at open time. The
/// reader is created lazily on the first call to `reader()` and reused
/// afterwards.
#[cfg(feature = "native")]
pub struct Index<D: crate::directories::DirectoryWriter + 'static> {
    /// Backing directory, shared with the segment manager.
    directory: Arc<D>,
    /// Schema of the index, shared with the segment manager and the reader.
    schema: Arc<Schema>,
    /// Configuration supplied to `create` / `open`.
    config: IndexConfig,
    /// Segment manager built from the directory, schema and metadata.
    segment_manager: Arc<crate::merge::SegmentManager<D>>,
    /// Trained coarse centroids keyed by `u32` (the tests look them up by
    /// field id); empty for a freshly created index.
    trained_centroids: FxHashMap<u32, Arc<CoarseCentroids>>,
    /// Trained PQ codebooks keyed by `u32`; empty for a freshly created index.
    trained_codebooks: FxHashMap<u32, Arc<PQCodebook>>,
    /// Reader initialised on first use by `reader()`.
    cached_reader: tokio::sync::OnceCell<IndexReader<D>>,
}
115
116#[cfg(feature = "native")]
117impl<D: crate::directories::DirectoryWriter + 'static> Index<D> {
118 pub async fn create(directory: D, schema: Schema, config: IndexConfig) -> Result<Self> {
120 let directory = Arc::new(directory);
121 let schema = Arc::new(schema);
122 let metadata = IndexMetadata::new((*schema).clone());
123
124 let segment_manager = Arc::new(crate::merge::SegmentManager::new(
125 Arc::clone(&directory),
126 Arc::clone(&schema),
127 metadata,
128 config.merge_policy.clone_box(),
129 config.term_cache_blocks,
130 ));
131
132 segment_manager.update_metadata(|_| {}).await?;
134
135 Ok(Self {
136 directory,
137 schema,
138 config,
139 segment_manager,
140 trained_centroids: FxHashMap::default(),
141 trained_codebooks: FxHashMap::default(),
142 cached_reader: tokio::sync::OnceCell::new(),
143 })
144 }
145
146 pub async fn open(directory: D, config: IndexConfig) -> Result<Self> {
148 let directory = Arc::new(directory);
149
150 let metadata = IndexMetadata::load(directory.as_ref()).await?;
152 let schema = Arc::new(metadata.schema.clone());
153
154 let (trained_centroids, trained_codebooks) =
156 metadata.load_trained_structures(directory.as_ref()).await;
157
158 let segment_manager = Arc::new(crate::merge::SegmentManager::new(
159 Arc::clone(&directory),
160 Arc::clone(&schema),
161 metadata,
162 config.merge_policy.clone_box(),
163 config.term_cache_blocks,
164 ));
165
166 Ok(Self {
167 directory,
168 schema,
169 config,
170 segment_manager,
171 trained_centroids,
172 trained_codebooks,
173 cached_reader: tokio::sync::OnceCell::new(),
174 })
175 }
176
177 pub fn schema(&self) -> &Schema {
179 &self.schema
180 }
181
182 pub fn directory(&self) -> &D {
184 &self.directory
185 }
186
    /// Returns the shared segment manager owned by this index.
    pub fn segment_manager(&self) -> &Arc<crate::merge::SegmentManager<D>> {
        &self.segment_manager
    }
191
192 pub async fn reader(&self) -> Result<&IndexReader<D>> {
197 self.cached_reader
198 .get_or_try_init(|| async {
199 IndexReader::from_segment_manager_with_reload_interval(
200 Arc::clone(&self.schema),
201 Arc::clone(&self.segment_manager),
202 self.trained_centroids.clone(),
203 self.trained_codebooks.clone(),
204 self.config.term_cache_blocks,
205 self.config.reload_interval_ms,
206 )
207 .await
208 })
209 .await
210 }
211
    /// Returns the configuration this index was created or opened with.
    pub fn config(&self) -> &IndexConfig {
        &self.config
    }
216
    /// Returns the trained coarse centroids held by this index, keyed by `u32`.
    ///
    /// The map is empty for an index built with `create`; `open` fills it from
    /// the stored metadata.
    pub fn trained_centroids(&self) -> &FxHashMap<u32, Arc<CoarseCentroids>> {
        &self.trained_centroids
    }
221
    /// Returns the trained PQ codebooks held by this index, keyed by `u32`.
    ///
    /// The map is empty for an index built with `create`; `open` fills it from
    /// the stored metadata.
    pub fn trained_codebooks(&self) -> &FxHashMap<u32, Arc<PQCodebook>> {
        &self.trained_codebooks
    }
226
227 pub async fn segment_readers(&self) -> Result<Vec<Arc<crate::segment::SegmentReader>>> {
229 let reader = self.reader().await?;
230 let searcher = reader.searcher().await?;
231 Ok(searcher.segment_readers().to_vec())
232 }
233
234 pub async fn num_docs(&self) -> Result<u32> {
236 let reader = self.reader().await?;
237 let searcher = reader.searcher().await?;
238 Ok(searcher.num_docs())
239 }
240
241 pub async fn doc(&self, doc_id: crate::DocId) -> Result<Option<crate::dsl::Document>> {
243 let reader = self.reader().await?;
244 let searcher = reader.searcher().await?;
245 searcher.doc(doc_id).await
246 }
247
248 pub fn default_fields(&self) -> Vec<crate::Field> {
250 if !self.schema.default_fields().is_empty() {
251 self.schema.default_fields().to_vec()
252 } else {
253 self.schema
254 .fields()
255 .filter(|(_, entry)| {
256 entry.indexed && entry.field_type == crate::dsl::FieldType::Text
257 })
258 .map(|(field, _)| field)
259 .collect()
260 }
261 }
262
263 pub fn tokenizers(&self) -> Arc<crate::tokenizer::TokenizerRegistry> {
265 Arc::new(crate::tokenizer::TokenizerRegistry::default())
266 }
267
268 pub fn query_parser(&self) -> crate::dsl::QueryLanguageParser {
270 let default_fields = self.default_fields();
271 let tokenizers = self.tokenizers();
272
273 let query_routers = self.schema.query_routers();
274 if !query_routers.is_empty()
275 && let Ok(router) = crate::dsl::QueryFieldRouter::from_rules(query_routers)
276 {
277 return crate::dsl::QueryLanguageParser::with_router(
278 Arc::clone(&self.schema),
279 default_fields,
280 tokenizers,
281 router,
282 );
283 }
284
285 crate::dsl::QueryLanguageParser::new(Arc::clone(&self.schema), default_fields, tokenizers)
286 }
287
    /// Parses `query_str` and returns up to `limit` hits starting from the
    /// first result.
    ///
    /// Shorthand for `query_offset(query_str, limit, 0)`.
    ///
    /// # Errors
    ///
    /// Propagates any error from `query_offset`.
    pub async fn query(
        &self,
        query_str: &str,
        limit: usize,
    ) -> Result<crate::query::SearchResponse> {
        self.query_offset(query_str, limit, 0).await
    }
296
297 pub async fn query_offset(
299 &self,
300 query_str: &str,
301 limit: usize,
302 offset: usize,
303 ) -> Result<crate::query::SearchResponse> {
304 let parser = self.query_parser();
305 let query = parser
306 .parse(query_str)
307 .map_err(crate::error::Error::Query)?;
308 self.search_offset(query.as_ref(), limit, offset).await
309 }
310
    /// Runs `query` and returns up to `limit` hits starting from the first
    /// result.
    ///
    /// Shorthand for `search_offset(query, limit, 0)`.
    ///
    /// # Errors
    ///
    /// Propagates any error from `search_offset`.
    pub async fn search(
        &self,
        query: &dyn crate::query::Query,
        limit: usize,
    ) -> Result<crate::query::SearchResponse> {
        self.search_offset(query, limit, 0).await
    }
319
320 pub async fn search_offset(
322 &self,
323 query: &dyn crate::query::Query,
324 limit: usize,
325 offset: usize,
326 ) -> Result<crate::query::SearchResponse> {
327 let reader = self.reader().await?;
328 let searcher = reader.searcher().await?;
329 let segments = searcher.segment_readers();
330
331 let fetch_limit = offset + limit;
332
333 let futures: Vec<_> = segments
334 .iter()
335 .map(|segment| {
336 let sid = segment.meta().id;
337 async move {
338 let results =
339 crate::query::search_segment(segment.as_ref(), query, fetch_limit).await?;
340 Ok::<_, crate::error::Error>(
341 results
342 .into_iter()
343 .map(move |r| (sid, r))
344 .collect::<Vec<_>>(),
345 )
346 }
347 })
348 .collect();
349
350 let batches = futures::future::try_join_all(futures).await?;
351 let mut all_results: Vec<(u128, crate::query::SearchResult)> =
352 Vec::with_capacity(batches.iter().map(|b| b.len()).sum());
353 for batch in batches {
354 all_results.extend(batch);
355 }
356
357 all_results.sort_by(|a, b| {
358 b.1.score
359 .partial_cmp(&a.1.score)
360 .unwrap_or(std::cmp::Ordering::Equal)
361 });
362
363 let total_hits = all_results.len() as u32;
364
365 let hits: Vec<crate::query::SearchHit> = all_results
366 .into_iter()
367 .skip(offset)
368 .take(limit)
369 .map(|(segment_id, result)| crate::query::SearchHit {
370 address: crate::query::DocAddress::new(segment_id, result.doc_id),
371 score: result.score,
372 matched_fields: result.extract_ordinals(),
373 })
374 .collect();
375
376 Ok(crate::query::SearchResponse { hits, total_hits })
377 }
378
379 pub async fn get_document(
381 &self,
382 address: &crate::query::DocAddress,
383 ) -> Result<Option<crate::dsl::Document>> {
384 let segment_id = address.segment_id_u128().ok_or_else(|| {
385 crate::error::Error::Query(format!("Invalid segment ID: {}", address.segment_id))
386 })?;
387
388 let reader = self.reader().await?;
389 let searcher = reader.searcher().await?;
390
391 for segment in searcher.segment_readers() {
392 if segment.meta().id == segment_id {
393 let local_doc_id = address.doc_id.wrapping_sub(segment.doc_id_offset());
395 return segment.doc(local_doc_id).await;
396 }
397 }
398
399 Ok(None)
400 }
401
    /// Currently a no-op: performs no work and always returns `Ok(())`.
    ///
    /// NOTE(review): presumably kept for API compatibility, with the reader
    /// refreshing itself via `reload_interval_ms`; confirm in `IndexReader`.
    pub async fn reload(&self) -> Result<()> {
        Ok(())
    }
407
408 pub async fn get_postings(
410 &self,
411 field: crate::Field,
412 term: &[u8],
413 ) -> Result<
414 Vec<(
415 Arc<crate::segment::SegmentReader>,
416 crate::structures::BlockPostingList,
417 )>,
418 > {
419 let segments = self.segment_readers().await?;
420 let mut results = Vec::new();
421
422 for segment in segments {
423 if let Some(postings) = segment.get_postings(field, term).await? {
424 results.push((segment, postings));
425 }
426 }
427
428 Ok(results)
429 }
430}
431
#[cfg(feature = "native")]
impl<D> Index<D>
where
    D: crate::directories::DirectoryWriter + 'static,
{
    /// Creates an `IndexWriter` from this index via `IndexWriter::from_index`.
    pub fn writer(&self) -> IndexWriter<D> {
        IndexWriter::from_index(self)
    }
}
440
441#[cfg(test)]
444mod tests {
445 use super::*;
446 use crate::directories::RamDirectory;
447 use crate::dsl::{Document, SchemaBuilder};
448
    /// Writes two documents, reopens the index and checks the document
    /// count, the posting list for a title term and retrieval by doc id.
    #[tokio::test]
    async fn test_index_create_and_search() {
        let mut schema_builder = SchemaBuilder::default();
        let title = schema_builder.add_text_field("title", true, true);
        let body = schema_builder.add_text_field("body", true, true);
        let schema = schema_builder.build();

        let dir = RamDirectory::new();
        let config = IndexConfig::default();

        // Create the index through the writer and add two documents.
        let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
            .await
            .unwrap();

        let mut doc1 = Document::new();
        doc1.add_text(title, "Hello World");
        doc1.add_text(body, "This is the first document");
        writer.add_document(doc1).unwrap();

        let mut doc2 = Document::new();
        doc2.add_text(title, "Goodbye World");
        doc2.add_text(body, "This is the second document");
        writer.add_document(doc2).unwrap();

        writer.commit().await.unwrap();

        // Reopen for reading; both documents must be visible.
        let index = Index::open(dir, config).await.unwrap();
        assert_eq!(index.num_docs().await.unwrap(), 2);

        // The term is looked up in lowercase although the titles contain
        // "World" -- presumably the default tokenizer lowercases; confirm.
        let postings = index.get_postings(title, b"world").await.unwrap();
        // One segment holds the term, and both documents contain it.
        assert_eq!(postings.len(), 1);
        assert_eq!(postings[0].1.doc_count(), 2);

        // Doc id 0 is the first document added.
        let doc = index.doc(0).await.unwrap().unwrap();
        assert_eq!(doc.get_first(title).unwrap().as_text(), Some("Hello World"));
    }
489
    /// Commits three batches of five documents and expects all 15 documents
    /// to be visible across at least two segments.
    #[tokio::test]
    async fn test_multiple_segments() {
        let mut schema_builder = SchemaBuilder::default();
        let title = schema_builder.add_text_field("title", true, true);
        let schema = schema_builder.build();

        let dir = RamDirectory::new();
        // Very small memory budget -- presumably to force frequent segment
        // flushes; confirm against the writer implementation.
        let config = IndexConfig {
            max_indexing_memory_bytes: 1024,
            ..Default::default()
        };

        let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
            .await
            .unwrap();

        // Three batches, each followed by its own commit.
        for batch in 0..3 {
            for i in 0..5 {
                let mut doc = Document::new();
                doc.add_text(title, format!("Document {} batch {}", i, batch));
                writer.add_document(doc).unwrap();
            }
            writer.commit().await.unwrap();
        }

        let index = Index::open(dir, config).await.unwrap();
        assert_eq!(index.num_docs().await.unwrap(), 15);
        assert!(
            index.segment_readers().await.unwrap().len() >= 2,
            "Expected multiple segments"
        );
    }
525
    /// Builds several segments, force-merges them and checks that a single
    /// segment remains with every document still retrievable.
    #[tokio::test]
    async fn test_segment_merge() {
        let mut schema_builder = SchemaBuilder::default();
        let title = schema_builder.add_text_field("title", true, true);
        let schema = schema_builder.build();

        let dir = RamDirectory::new();
        // Very small memory budget -- presumably to encourage separate
        // segments; confirm against the writer implementation.
        let config = IndexConfig {
            max_indexing_memory_bytes: 512,
            ..Default::default()
        };

        let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
            .await
            .unwrap();

        // Three batches of three documents, flushing after each batch and
        // committing once at the end.
        for batch in 0..3 {
            for i in 0..3 {
                let mut doc = Document::new();
                doc.add_text(title, format!("Document {} batch {}", i, batch));
                writer.add_document(doc).unwrap();
            }
            writer.flush().await.unwrap();
        }
        writer.commit().await.unwrap();

        // Before merging there should be more than one segment.
        let index = Index::open(dir.clone(), config.clone()).await.unwrap();
        assert!(
            index.segment_readers().await.unwrap().len() >= 2,
            "Expected multiple segments"
        );

        // Reopen a writer on the same directory and merge everything.
        let writer = IndexWriter::open(dir.clone(), config.clone())
            .await
            .unwrap();
        writer.force_merge().await.unwrap();

        // After the merge: exactly one segment and all nine documents.
        let index = Index::open(dir, config).await.unwrap();
        assert_eq!(index.segment_readers().await.unwrap().len(), 1);
        assert_eq!(index.num_docs().await.unwrap(), 9);

        // Every doc id in 0..9 must resolve to a document.
        let mut found_docs = 0;
        for i in 0..9 {
            if index.doc(i).await.unwrap().is_some() {
                found_docs += 1;
            }
        }
        assert_eq!(found_docs, 9);
    }
580
    /// Runs string queries through `Index::query` and loads documents both
    /// by hit address and by doc id.
    #[tokio::test]
    async fn test_match_query() {
        let mut schema_builder = SchemaBuilder::default();
        let title = schema_builder.add_text_field("title", true, true);
        let body = schema_builder.add_text_field("body", true, true);
        let schema = schema_builder.build();

        let dir = RamDirectory::new();
        let config = IndexConfig::default();

        let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
            .await
            .unwrap();

        let mut doc1 = Document::new();
        doc1.add_text(title, "rust programming");
        doc1.add_text(body, "Learn rust language");
        writer.add_document(doc1).unwrap();

        let mut doc2 = Document::new();
        doc2.add_text(title, "python programming");
        doc2.add_text(body, "Learn python language");
        writer.add_document(doc2).unwrap();

        writer.commit().await.unwrap();

        let index = Index::open(dir, config).await.unwrap();

        // Only the first document mentions "rust".
        let results = index.query("rust", 10).await.unwrap();
        assert_eq!(results.hits.len(), 1);

        // A multi-term query must return at least one hit.
        let results = index.query("rust programming", 10).await.unwrap();
        assert!(!results.hits.is_empty());

        // Hits carry a non-empty segment id in their address.
        let hit = &results.hits[0];
        assert!(!hit.address.segment_id.is_empty(), "Should have segment_id");

        // The address can be resolved back to a stored document.
        let doc = index.get_document(&hit.address).await.unwrap().unwrap();
        assert!(
            !doc.field_values().is_empty(),
            "Doc should have field values"
        );

        // Lookup by plain doc id works as well.
        let doc = index.doc(0).await.unwrap().unwrap();
        assert!(
            !doc.field_values().is_empty(),
            "Doc should have field values"
        );
    }
635
    /// Opens an index through a `SliceCachingDirectory` and checks that a
    /// search leaves data in the cache.
    ///
    /// NOTE(review): despite the name, only the post-search cache statistics
    /// are asserted here; no cache file is written or loaded.
    #[tokio::test]
    async fn test_slice_cache_warmup_and_load() {
        use crate::directories::SliceCachingDirectory;

        let mut schema_builder = SchemaBuilder::default();
        let title = schema_builder.add_text_field("title", true, true);
        let body = schema_builder.add_text_field("body", true, true);
        let schema = schema_builder.build();

        let dir = RamDirectory::new();
        let config = IndexConfig::default();

        let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
            .await
            .unwrap();

        for i in 0..10 {
            let mut doc = Document::new();
            doc.add_text(title, format!("Document {} about rust", i));
            doc.add_text(body, format!("This is body text number {}", i));
            writer.add_document(doc).unwrap();
        }
        writer.commit().await.unwrap();

        // Wrap the RAM directory in a caching layer; the second argument is
        // 1 MiB (presumably the cache capacity in bytes -- confirm).
        let caching_dir = SliceCachingDirectory::new(dir.clone(), 1024 * 1024);
        let index = Index::open(caching_dir, config.clone()).await.unwrap();

        let results = index.query("rust", 10).await.unwrap();
        assert!(!results.hits.is_empty());

        // Reads the private `directory` field directly to inspect the cache.
        let stats = index.directory.stats();
        assert!(stats.total_bytes > 0, "Cache should have data after search");
    }
673
    /// Checks that a field given several values on one document is stored,
    /// serialised to JSON as an array, and searchable by each value.
    #[tokio::test]
    async fn test_multivalue_field_indexing_and_search() {
        let mut schema_builder = SchemaBuilder::default();
        let uris = schema_builder.add_text_field("uris", true, true);
        let title = schema_builder.add_text_field("title", true, true);
        let schema = schema_builder.build();

        let dir = RamDirectory::new();
        let config = IndexConfig::default();

        let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
            .await
            .unwrap();

        // First document: two values for the same `uris` field.
        let mut doc = Document::new();
        doc.add_text(uris, "one");
        doc.add_text(uris, "two");
        doc.add_text(title, "Test Document");
        writer.add_document(doc).unwrap();

        // Second document: a single `uris` value.
        let mut doc2 = Document::new();
        doc2.add_text(uris, "three");
        doc2.add_text(title, "Another Document");
        writer.add_document(doc2).unwrap();

        writer.commit().await.unwrap();

        let index = Index::open(dir, config).await.unwrap();
        assert_eq!(index.num_docs().await.unwrap(), 2);

        // Both stored values come back, in insertion order.
        let doc = index.doc(0).await.unwrap().unwrap();
        let all_uris: Vec<_> = doc.get_all(uris).collect();
        assert_eq!(all_uris.len(), 2, "Should have 2 uris values");
        assert_eq!(all_uris[0].as_text(), Some("one"));
        assert_eq!(all_uris[1].as_text(), Some("two"));

        // JSON output represents the multi-valued field as an array.
        let json = doc.to_json(index.schema());
        let uris_json = json.get("uris").unwrap();
        assert!(uris_json.is_array(), "Multi-value field should be an array");
        let uris_arr = uris_json.as_array().unwrap();
        assert_eq!(uris_arr.len(), 2);
        assert_eq!(uris_arr[0].as_str(), Some("one"));
        assert_eq!(uris_arr[1].as_str(), Some("two"));

        // Each individual value finds the document that holds it.
        let results = index.query("uris:one", 10).await.unwrap();
        assert_eq!(results.hits.len(), 1, "Should find doc with 'one'");
        assert_eq!(results.hits[0].address.doc_id, 0);

        let results = index.query("uris:two", 10).await.unwrap();
        assert_eq!(results.hits.len(), 1, "Should find doc with 'two'");
        assert_eq!(results.hits[0].address.doc_id, 0);

        let results = index.query("uris:three", 10).await.unwrap();
        assert_eq!(results.hits.len(), 1, "Should find doc with 'three'");
        assert_eq!(results.hits[0].address.doc_id, 1);

        // An unknown value matches nothing.
        let results = index.query("uris:nonexistent", 10).await.unwrap();
        assert_eq!(results.hits.len(), 0, "Should not find non-existent value");
    }
740
    /// Exercises `BooleanQuery` with SHOULD, MUST and MUST_NOT clauses over
    /// five small documents and checks which documents match, including a
    /// limited top-k request.
    ///
    /// NOTE(review): the test name refers to a WAND optimisation; the
    /// assertions only check result sets, not which execution path ran.
    #[tokio::test]
    async fn test_wand_optimization_for_or_queries() {
        use crate::query::{BooleanQuery, TermQuery};

        let mut schema_builder = SchemaBuilder::default();
        let content = schema_builder.add_text_field("content", true, true);
        let schema = schema_builder.build();

        let dir = RamDirectory::new();
        let config = IndexConfig::default();

        let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
            .await
            .unwrap();

        // Doc 0: contains "rust" and "programming".
        let mut doc = Document::new();
        doc.add_text(content, "rust programming language is fast");
        writer.add_document(doc).unwrap();

        // Doc 1: contains "rust" and "systems".
        let mut doc = Document::new();
        doc.add_text(content, "rust is a systems language");
        writer.add_document(doc).unwrap();

        // Doc 2: contains "programming" only.
        let mut doc = Document::new();
        doc.add_text(content, "programming is fun");
        writer.add_document(doc).unwrap();

        // Doc 3: contains neither query term.
        let mut doc = Document::new();
        doc.add_text(content, "python is easy to learn");
        writer.add_document(doc).unwrap();

        // Doc 4: repeats "rust" and "programming" and contains "systems".
        let mut doc = Document::new();
        doc.add_text(content, "rust rust programming programming systems");
        writer.add_document(doc).unwrap();

        writer.commit().await.unwrap();

        let index = Index::open(dir.clone(), config.clone()).await.unwrap();

        // Pure OR query: "rust" OR "programming".
        let or_query = BooleanQuery::new()
            .should(TermQuery::text(content, "rust"))
            .should(TermQuery::text(content, "programming"));

        let results = index.search(&or_query, 10).await.unwrap();

        assert_eq!(results.hits.len(), 4, "Should find exactly 4 documents");

        let doc_ids: Vec<u32> = results.hits.iter().map(|h| h.address.doc_id).collect();
        assert!(doc_ids.contains(&0), "Should find doc 0");
        assert!(doc_ids.contains(&1), "Should find doc 1");
        assert!(doc_ids.contains(&2), "Should find doc 2");
        assert!(doc_ids.contains(&4), "Should find doc 4");
        assert!(
            !doc_ids.contains(&3),
            "Should NOT find doc 3 (only has 'python')"
        );

        // A single SHOULD clause behaves like a plain term query.
        let single_query = BooleanQuery::new().should(TermQuery::text(content, "rust"));

        let results = index.search(&single_query, 10).await.unwrap();
        assert_eq!(results.hits.len(), 3, "Should find 3 documents with 'rust'");

        // MUST restricts the match set; SHOULD does not widen it.
        let must_query = BooleanQuery::new()
            .must(TermQuery::text(content, "rust"))
            .should(TermQuery::text(content, "programming"));

        let results = index.search(&must_query, 10).await.unwrap();
        assert_eq!(results.hits.len(), 3, "Should find 3 documents with 'rust'");

        // MUST_NOT excludes documents containing "systems".
        let must_not_query = BooleanQuery::new()
            .should(TermQuery::text(content, "rust"))
            .should(TermQuery::text(content, "programming"))
            .must_not(TermQuery::text(content, "systems"));

        let results = index.search(&must_not_query, 10).await.unwrap();
        let doc_ids: Vec<u32> = results.hits.iter().map(|h| h.address.doc_id).collect();
        assert!(
            !doc_ids.contains(&1),
            "Should NOT find doc 1 (has 'systems')"
        );
        assert!(
            !doc_ids.contains(&4),
            "Should NOT find doc 4 (has 'systems')"
        );

        // The same OR query with a limit of 2 returns exactly two hits.
        let or_query = BooleanQuery::new()
            .should(TermQuery::text(content, "rust"))
            .should(TermQuery::text(content, "programming"));

        let results = index.search(&or_query, 2).await.unwrap();
        assert_eq!(results.hits.len(), 2, "Should return only top 2 results");

    }
857
    /// Verifies that `WandOrQuery` and an equivalent SHOULD-only
    /// `BooleanQuery` return the same set of documents.
    #[tokio::test]
    async fn test_wand_results_match_standard_boolean() {
        use crate::query::{BooleanQuery, TermQuery, WandOrQuery};

        let mut schema_builder = SchemaBuilder::default();
        let content = schema_builder.add_text_field("content", true, true);
        let schema = schema_builder.build();

        let dir = RamDirectory::new();
        let config = IndexConfig::default();

        let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
            .await
            .unwrap();

        // Ten documents cycling through four fixed texts.
        for i in 0..10 {
            let mut doc = Document::new();
            let text = match i % 4 {
                0 => "apple banana cherry",
                1 => "apple orange",
                2 => "banana grape",
                _ => "cherry date",
            };
            doc.add_text(content, text);
            writer.add_document(doc).unwrap();
        }

        writer.commit().await.unwrap();
        let index = Index::open(dir.clone(), config.clone()).await.unwrap();

        // Same two terms expressed through both query types.
        let wand_query = WandOrQuery::new(content).term("apple").term("banana");

        let bool_query = BooleanQuery::new()
            .should(TermQuery::text(content, "apple"))
            .should(TermQuery::text(content, "banana"));

        let wand_results = index.search(&wand_query, 10).await.unwrap();
        let bool_results = index.search(&bool_query, 10).await.unwrap();

        assert_eq!(
            wand_results.hits.len(),
            bool_results.hits.len(),
            "WAND and Boolean should find same number of docs"
        );

        // Compare as sets so that ordering differences do not matter.
        let wand_docs: std::collections::HashSet<u32> =
            wand_results.hits.iter().map(|h| h.address.doc_id).collect();
        let bool_docs: std::collections::HashSet<u32> =
            bool_results.hits.iter().map(|h| h.address.doc_id).collect();

        assert_eq!(
            wand_docs, bool_docs,
            "WAND and Boolean should find same documents"
        );
    }
917
    /// Checks the dense-vector index lifecycle around `build_threshold`:
    /// no trained centroids below the threshold, a built index with loaded
    /// centroids after crossing it, and a working search in both states.
    #[tokio::test]
    async fn test_vector_index_threshold_switch() {
        use crate::dsl::{DenseVectorConfig, VectorIndexType};

        let mut schema_builder = SchemaBuilder::default();
        let title = schema_builder.add_text_field("title", true, true);
        // 8-dimensional IVF-RaBitQ field with a build threshold of 50.
        let embedding = schema_builder.add_dense_vector_field_with_config(
            "embedding",
            true,
            true,
            DenseVectorConfig {
                dim: 8,
                index_type: VectorIndexType::IvfRaBitQ,
                store_raw: true,
                num_clusters: Some(4),
                nprobe: 2,
                mrl_dim: None,
                build_threshold: Some(50),
            },
        );
        let schema = schema_builder.build();

        let dir = RamDirectory::new();
        let config = IndexConfig::default();

        let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
            .await
            .unwrap();

        // Phase 1: 30 documents, fewer than the threshold of 50.
        for i in 0..30 {
            let mut doc = Document::new();
            doc.add_text(title, format!("Document {}", i));
            // Every component of the vector has the same value, i / 30.
            let vec: Vec<f32> = (0..8).map(|_| (i as f32) / 30.0).collect();
            doc.add_dense_vector(embedding, vec);
            writer.add_document(doc).unwrap();
        }
        writer.commit().await.unwrap();

        // Below the threshold no centroids have been trained.
        let index = Index::open(dir.clone(), config.clone()).await.unwrap();
        assert!(
            index.trained_centroids.is_empty(),
            "Should not have trained centroids below threshold"
        );

        // Vector search still has to return results at this stage.
        let query_vec: Vec<f32> = vec![0.5; 8];
        let segments = index.segment_readers().await.unwrap();
        assert!(!segments.is_empty());

        let results = segments[0]
            .search_dense_vector(
                embedding,
                &query_vec,
                5,
                1,
                crate::query::MultiValueCombiner::Max,
            )
            .unwrap();
        assert!(!results.is_empty(), "Flat search should return results");

        // Phase 2: 30 more documents, 60 in total, crossing the threshold.
        let writer = IndexWriter::open(dir.clone(), config.clone())
            .await
            .unwrap();

        for i in 30..60 {
            let mut doc = Document::new();
            doc.add_text(title, format!("Document {}", i));
            let vec: Vec<f32> = (0..8).map(|_| (i as f32) / 60.0).collect();
            doc.add_dense_vector(embedding, vec);
            writer.add_document(doc).unwrap();
        }
        writer.commit().await.unwrap();

        assert!(
            writer.is_vector_index_built(embedding).await,
            "Vector index should be built after crossing threshold"
        );

        // Reopening must load the trained centroids for the field.
        let index = Index::open(dir.clone(), config.clone()).await.unwrap();
        assert!(
            index.trained_centroids.contains_key(&embedding.0),
            "Should have loaded trained centroids for embedding field"
        );

        let segments = index.segment_readers().await.unwrap();
        let results = segments[0]
            .search_dense_vector(
                embedding,
                &query_vec,
                5,
                1,
                crate::query::MultiValueCombiner::Max,
            )
            .unwrap();
        assert!(
            !results.is_empty(),
            "Search should return results after build"
        );

        // An explicit build on an already built index must succeed and leave
        // the index marked as built.
        let writer = IndexWriter::open(dir.clone(), config.clone())
            .await
            .unwrap();
        writer.build_vector_index().await.unwrap();
        assert!(writer.is_vector_index_built(embedding).await);
    }
1037}