#[cfg(feature = "native")]
use crate::dsl::Schema;
#[cfg(feature = "native")]
use crate::error::Result;
#[cfg(feature = "native")]
use crate::structures::{CoarseCentroids, PQCodebook};
#[cfg(feature = "native")]
use rustc_hash::FxHashMap;
#[cfg(feature = "native")]
use std::sync::Arc;

mod searcher;
pub use searcher::Searcher;

#[cfg(feature = "native")]
mod reader;
#[cfg(feature = "native")]
mod vector_builder;
#[cfg(feature = "native")]
mod writer;
#[cfg(feature = "native")]
pub use reader::IndexReader;
#[cfg(feature = "native")]
pub use writer::IndexWriter;

mod metadata;
pub use metadata::{FieldVectorMeta, INDEX_META_FILENAME, IndexMetadata, VectorIndexState};

#[cfg(feature = "native")]
mod helpers;
#[cfg(feature = "native")]
pub use helpers::{
    IndexingStats, SchemaConfig, SchemaFieldConfig, create_index_at_path, create_index_from_sdl,
    index_documents_from_reader, index_json_document, parse_schema,
};

/// File name of the on-disk slice cache stored alongside an index.
pub const SLICE_CACHE_FILENAME: &str = "index.slicecache";

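/// Configuration for an index: thread counts, cache sizes, the indexing memory
/// budget, the merge policy, and the reader reload interval.
///
/// A typical construction overrides a few fields and keeps the rest at their
/// defaults, as the tests in this module do (values here are illustrative):
///
/// ```ignore
/// let config = IndexConfig {
///     max_indexing_memory_bytes: 64 * 1024 * 1024,
///     ..Default::default()
/// };
/// ```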
#[derive(Debug, Clone)]
pub struct IndexConfig {
    /// General worker thread count (defaults to the number of CPUs).
    pub num_threads: usize,
    /// Number of threads dedicated to indexing (defaults to 1).
    pub num_indexing_threads: usize,
    /// Number of threads used for compression (defaults to the number of CPUs).
    pub num_compression_threads: usize,
    /// Number of cache blocks for term dictionary data.
    pub term_cache_blocks: usize,
    /// Number of cache blocks for the document store.
    pub store_cache_blocks: usize,
    /// Indexing memory budget in bytes; smaller budgets flush segments more often.
    pub max_indexing_memory_bytes: usize,
    /// Policy deciding which segments get merged (defaults to [`crate::merge::TieredMergePolicy`]).
    pub merge_policy: Box<dyn crate::merge::MergePolicy>,
    /// Index optimization settings.
    pub optimization: crate::structures::IndexOptimization,
    /// Interval in milliseconds at which cached readers refresh their view of the index.
    pub reload_interval_ms: u64,
}

impl Default for IndexConfig {
    fn default() -> Self {
        #[cfg(feature = "native")]
        let cpus = num_cpus::get().max(1);
        #[cfg(not(feature = "native"))]
        let cpus = 1;

        Self {
            num_threads: cpus,
            num_indexing_threads: 1,
            num_compression_threads: cpus,
            term_cache_blocks: 256,
            store_cache_blocks: 32,
            max_indexing_memory_bytes: 256 * 1024 * 1024,
            merge_policy: Box::new(crate::merge::TieredMergePolicy::default()),
            optimization: crate::structures::IndexOptimization::default(),
            reload_interval_ms: 1000,
        }
    }
}

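/// A search index tying together a directory, a schema, the segment manager,
/// trained vector structures, and a lazily created, cached [`IndexReader`].
///
/// Rough end-to-end usage, mirroring the tests at the bottom of this file
/// (sketch only, not compiled as a doctest):
///
/// ```ignore
/// let mut builder = SchemaBuilder::default();
/// let title = builder.add_text_field("title", true, true);
/// let schema = builder.build();
///
/// let dir = RamDirectory::new();
/// let config = IndexConfig::default();
///
/// let writer = IndexWriter::create(dir.clone(), schema, config.clone()).await?;
/// let mut doc = Document::new();
/// doc.add_text(title, "Hello World");
/// writer.add_document(doc)?;
/// writer.commit().await?;
///
/// let index = Index::open(dir, config).await?;
/// let results = index.query("hello", 10).await?;
/// assert_eq!(results.hits.len(), 1);
/// ```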
#[cfg(feature = "native")]
pub struct Index<D: crate::directories::DirectoryWriter + 'static> {
    directory: Arc<D>,
    schema: Arc<Schema>,
    config: IndexConfig,
    segment_manager: Arc<crate::merge::SegmentManager<D>>,
    /// Trained coarse centroids per vector field, keyed by field id.
    trained_centroids: FxHashMap<u32, Arc<CoarseCentroids>>,
    /// Trained product-quantization codebooks per vector field, keyed by field id.
    trained_codebooks: FxHashMap<u32, Arc<PQCodebook>>,
    /// Reader created on first use and shared by subsequent calls.
    cached_reader: tokio::sync::OnceCell<IndexReader<D>>,
}

#[cfg(feature = "native")]
impl<D: crate::directories::DirectoryWriter + 'static> Index<D> {
    /// Creates a new, empty index in `directory` with the given schema and
    /// configuration, and persists the initial index metadata.
    pub async fn create(directory: D, schema: Schema, config: IndexConfig) -> Result<Self> {
        let directory = Arc::new(directory);
        let schema = Arc::new(schema);
        let metadata = IndexMetadata::new((*schema).clone());

        let segment_manager = Arc::new(crate::merge::SegmentManager::new(
            Arc::clone(&directory),
            Arc::clone(&schema),
            metadata,
            config.merge_policy.clone_box(),
            config.term_cache_blocks,
        ));

        // Persist the initial (empty) metadata so the index can be reopened.
        segment_manager.update_metadata(|_| {}).await?;

        Ok(Self {
            directory,
            schema,
            config,
            segment_manager,
            trained_centroids: FxHashMap::default(),
            trained_codebooks: FxHashMap::default(),
            cached_reader: tokio::sync::OnceCell::new(),
        })
    }

    /// Opens an existing index from `directory`, loading its metadata, schema,
    /// and any trained vector structures (coarse centroids and PQ codebooks).
    pub async fn open(directory: D, config: IndexConfig) -> Result<Self> {
        let directory = Arc::new(directory);

        let metadata = IndexMetadata::load(directory.as_ref()).await?;
        let schema = Arc::new(metadata.schema.clone());

        let trained = metadata.load_trained_structures(directory.as_ref()).await;
        let trained_centroids = trained
            .as_ref()
            .map(|t| t.centroids.clone())
            .unwrap_or_default();
        let trained_codebooks = trained
            .as_ref()
            .map(|t| t.codebooks.clone())
            .unwrap_or_default();

        log::info!(
            "[Index::open] trained_centroids fields={:?}, trained_codebooks fields={:?}",
            trained_centroids.keys().collect::<Vec<_>>(),
            trained_codebooks.keys().collect::<Vec<_>>(),
        );

        let segment_manager = Arc::new(crate::merge::SegmentManager::new(
            Arc::clone(&directory),
            Arc::clone(&schema),
            metadata,
            config.merge_policy.clone_box(),
            config.term_cache_blocks,
        ));

        Ok(Self {
            directory,
            schema,
            config,
            segment_manager,
            trained_centroids,
            trained_codebooks,
            cached_reader: tokio::sync::OnceCell::new(),
        })
    }

    /// Returns the index schema.
    pub fn schema(&self) -> &Schema {
        &self.schema
    }

    /// Returns the underlying directory.
    pub fn directory(&self) -> &D {
        &self.directory
    }

    /// Returns the segment manager shared with writers and readers.
    pub fn segment_manager(&self) -> &Arc<crate::merge::SegmentManager<D>> {
        &self.segment_manager
    }

    /// Returns the cached [`IndexReader`], creating it on first call.
    ///
    /// The reader refreshes its view of the index every
    /// `config.reload_interval_ms` milliseconds.
    pub async fn reader(&self) -> Result<&IndexReader<D>> {
        self.cached_reader
            .get_or_try_init(|| async {
                IndexReader::from_segment_manager_with_reload_interval(
                    Arc::clone(&self.schema),
                    Arc::clone(&self.segment_manager),
                    self.config.term_cache_blocks,
                    self.config.reload_interval_ms,
                )
                .await
            })
            .await
    }

    /// Returns the index configuration.
    pub fn config(&self) -> &IndexConfig {
        &self.config
    }

    /// Trained coarse centroids per vector field id, if any have been built.
    pub fn trained_centroids(&self) -> &FxHashMap<u32, Arc<CoarseCentroids>> {
        &self.trained_centroids
    }

    /// Trained PQ codebooks per vector field id, if any have been built.
    pub fn trained_codebooks(&self) -> &FxHashMap<u32, Arc<PQCodebook>> {
        &self.trained_codebooks
    }

    /// Returns the segment readers of the current searcher snapshot.
    pub async fn segment_readers(&self) -> Result<Vec<Arc<crate::segment::SegmentReader>>> {
        let reader = self.reader().await?;
        let searcher = reader.searcher().await?;
        Ok(searcher.segment_readers().to_vec())
    }

    /// Total number of documents across all segments.
    pub async fn num_docs(&self) -> Result<u32> {
        let reader = self.reader().await?;
        let searcher = reader.searcher().await?;
        Ok(searcher.num_docs())
    }

    /// Fetches a stored document by its global doc id, if present.
    pub async fn doc(&self, doc_id: crate::DocId) -> Result<Option<crate::dsl::Document>> {
        let reader = self.reader().await?;
        let searcher = reader.searcher().await?;
        searcher.doc(doc_id).await
    }

    /// Fields searched when a query does not name a field explicitly: the
    /// schema's default fields, or every indexed text field if none are set.
    pub fn default_fields(&self) -> Vec<crate::Field> {
        if !self.schema.default_fields().is_empty() {
            self.schema.default_fields().to_vec()
        } else {
            self.schema
                .fields()
                .filter(|(_, entry)| {
                    entry.indexed && entry.field_type == crate::dsl::FieldType::Text
                })
                .map(|(field, _)| field)
                .collect()
        }
    }

    /// Returns the tokenizer registry used for query parsing.
    pub fn tokenizers(&self) -> Arc<crate::tokenizer::TokenizerRegistry> {
        Arc::new(crate::tokenizer::TokenizerRegistry::default())
    }

    /// Builds a query parser over the default fields, attaching a field router
    /// when the schema defines query routing rules.
    pub fn query_parser(&self) -> crate::dsl::QueryLanguageParser {
        let default_fields = self.default_fields();
        let tokenizers = self.tokenizers();

        let query_routers = self.schema.query_routers();
        if !query_routers.is_empty()
            && let Ok(router) = crate::dsl::QueryFieldRouter::from_rules(query_routers)
        {
            return crate::dsl::QueryLanguageParser::with_router(
                Arc::clone(&self.schema),
                default_fields,
                tokenizers,
                router,
            );
        }

        crate::dsl::QueryLanguageParser::new(Arc::clone(&self.schema), default_fields, tokenizers)
    }

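    /// Parses `query_str` with [`Self::query_parser`] and returns the top
    /// `limit` hits.
    ///
    /// The query language accepts bare terms (searched across the default
    /// fields) as well as `field:term` clauses, as exercised in the tests
    /// below. A rough sketch (not compiled):
    ///
    /// ```ignore
    /// let top_docs = index.query("rust programming", 10).await?;
    /// let scoped = index.query("uris:one", 10).await?;
    /// ```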
    pub async fn query(
        &self,
        query_str: &str,
        limit: usize,
    ) -> Result<crate::query::SearchResponse> {
        self.query_offset(query_str, limit, 0).await
    }

    /// Like [`Self::query`], but skips the first `offset` hits.
    pub async fn query_offset(
        &self,
        query_str: &str,
        limit: usize,
        offset: usize,
    ) -> Result<crate::query::SearchResponse> {
        let parser = self.query_parser();
        let query = parser
            .parse(query_str)
            .map_err(crate::error::Error::Query)?;
        self.search_offset(query.as_ref(), limit, offset).await
    }

    /// Executes `query` and returns the top `limit` hits.
    pub async fn search(
        &self,
        query: &dyn crate::query::Query,
        limit: usize,
    ) -> Result<crate::query::SearchResponse> {
        self.search_offset(query, limit, 0).await
    }

    /// Executes `query` against every segment in parallel, merges the
    /// per-segment results by descending score, and returns the hits in
    /// `offset..offset + limit`.
    pub async fn search_offset(
        &self,
        query: &dyn crate::query::Query,
        limit: usize,
        offset: usize,
    ) -> Result<crate::query::SearchResponse> {
        let reader = self.reader().await?;
        let searcher = reader.searcher().await?;
        let segments = searcher.segment_readers();

        // Each segment must return enough hits to cover the requested page.
        let fetch_limit = offset + limit;

        let futures: Vec<_> = segments
            .iter()
            .map(|segment| {
                let sid = segment.meta().id;
                async move {
                    let results =
                        crate::query::search_segment(segment.as_ref(), query, fetch_limit).await?;
                    Ok::<_, crate::error::Error>(
                        results
                            .into_iter()
                            .map(move |r| (sid, r))
                            .collect::<Vec<_>>(),
                    )
                }
            })
            .collect();

        let batches = futures::future::try_join_all(futures).await?;
        let mut all_results: Vec<(u128, crate::query::SearchResult)> =
            Vec::with_capacity(batches.iter().map(|b| b.len()).sum());
        for batch in batches {
            all_results.extend(batch);
        }

        all_results.sort_by(|a, b| {
            b.1.score
                .partial_cmp(&a.1.score)
                .unwrap_or(std::cmp::Ordering::Equal)
        });

        let total_hits = all_results.len() as u32;

        let hits: Vec<crate::query::SearchHit> = all_results
            .into_iter()
            .skip(offset)
            .take(limit)
            .map(|(segment_id, result)| crate::query::SearchHit {
                address: crate::query::DocAddress::new(segment_id, result.doc_id),
                score: result.score,
                matched_fields: result.extract_ordinals(),
            })
            .collect();

        Ok(crate::query::SearchResponse { hits, total_hits })
    }

    /// Resolves a [`DocAddress`](crate::query::DocAddress) from a search hit
    /// back to its stored document, if the segment still exists.
    pub async fn get_document(
        &self,
        address: &crate::query::DocAddress,
    ) -> Result<Option<crate::dsl::Document>> {
        let segment_id = address.segment_id_u128().ok_or_else(|| {
            crate::error::Error::Query(format!("Invalid segment ID: {}", address.segment_id))
        })?;

        let reader = self.reader().await?;
        let searcher = reader.searcher().await?;

        for segment in searcher.segment_readers() {
            if segment.meta().id == segment_id {
                // The address carries a global doc id; convert it to the
                // segment-local id before fetching the stored document.
                let local_doc_id = address.doc_id.wrapping_sub(segment.doc_id_offset());
                return segment.doc(local_doc_id).await;
            }
        }

        Ok(None)
    }

    /// Explicit reload is currently a no-op: the cached reader refreshes
    /// itself on `reload_interval_ms`.
    pub async fn reload(&self) -> Result<()> {
        Ok(())
    }

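    /// Looks up the posting list for `term` in `field` across all segments,
    /// returning one `(segment, postings)` pair per segment that contains it.
    ///
    /// Sketch based on the unit test below (not compiled):
    ///
    /// ```ignore
    /// let postings = index.get_postings(title, b"world").await?;
    /// for (segment, list) in &postings {
    ///     println!("segment {:?} has {} docs for 'world'", segment.meta().id, list.doc_count());
    /// }
    /// ```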
    pub async fn get_postings(
        &self,
        field: crate::Field,
        term: &[u8],
    ) -> Result<
        Vec<(
            Arc<crate::segment::SegmentReader>,
            crate::structures::BlockPostingList,
        )>,
    > {
        let segments = self.segment_readers().await?;
        let mut results = Vec::new();

        for segment in segments {
            if let Some(postings) = segment.get_postings(field, term).await? {
                results.push((segment, postings));
            }
        }

        Ok(results)
    }
}

#[cfg(feature = "native")]
impl<D: crate::directories::DirectoryWriter + 'static> Index<D> {
    /// Creates an [`IndexWriter`] bound to this index.
    pub fn writer(&self) -> writer::IndexWriter<D> {
        writer::IndexWriter::from_index(self)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::directories::RamDirectory;
    use crate::dsl::{Document, SchemaBuilder};

    #[tokio::test]
    async fn test_index_create_and_search() {
        let mut schema_builder = SchemaBuilder::default();
        let title = schema_builder.add_text_field("title", true, true);
        let body = schema_builder.add_text_field("body", true, true);
        let schema = schema_builder.build();

        let dir = RamDirectory::new();
        let config = IndexConfig::default();

        let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
            .await
            .unwrap();

        let mut doc1 = Document::new();
        doc1.add_text(title, "Hello World");
        doc1.add_text(body, "This is the first document");
        writer.add_document(doc1).unwrap();

        let mut doc2 = Document::new();
        doc2.add_text(title, "Goodbye World");
        doc2.add_text(body, "This is the second document");
        writer.add_document(doc2).unwrap();

        writer.commit().await.unwrap();

        let index = Index::open(dir, config).await.unwrap();
        assert_eq!(index.num_docs().await.unwrap(), 2);

        // Both titles contain "world", so the single segment holds two postings.
        let postings = index.get_postings(title, b"world").await.unwrap();
        assert_eq!(postings.len(), 1);
        assert_eq!(postings[0].1.doc_count(), 2);

        let doc = index.doc(0).await.unwrap().unwrap();
        assert_eq!(doc.get_first(title).unwrap().as_text(), Some("Hello World"));
    }

    #[tokio::test]
    async fn test_multiple_segments() {
        let mut schema_builder = SchemaBuilder::default();
        let title = schema_builder.add_text_field("title", true, true);
        let schema = schema_builder.build();

        let dir = RamDirectory::new();
        // A tiny memory budget keeps in-memory segments small so that
        // several segments end up on disk.
        let config = IndexConfig {
            max_indexing_memory_bytes: 1024,
            ..Default::default()
        };

        let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
            .await
            .unwrap();

        for batch in 0..3 {
            for i in 0..5 {
                let mut doc = Document::new();
                doc.add_text(title, format!("Document {} batch {}", i, batch));
                writer.add_document(doc).unwrap();
            }
            writer.commit().await.unwrap();
        }

        let index = Index::open(dir, config).await.unwrap();
        assert_eq!(index.num_docs().await.unwrap(), 15);
        assert!(
            index.segment_readers().await.unwrap().len() >= 2,
            "Expected multiple segments"
        );
    }

    #[tokio::test]
    async fn test_segment_merge() {
        let mut schema_builder = SchemaBuilder::default();
        let title = schema_builder.add_text_field("title", true, true);
        let schema = schema_builder.build();

        let dir = RamDirectory::new();
        let config = IndexConfig {
            max_indexing_memory_bytes: 512,
            ..Default::default()
        };

        let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
            .await
            .unwrap();

        for batch in 0..3 {
            for i in 0..3 {
                let mut doc = Document::new();
                doc.add_text(title, format!("Document {} batch {}", i, batch));
                writer.add_document(doc).unwrap();
            }
            writer.flush().await.unwrap();
        }
        writer.commit().await.unwrap();

        let index = Index::open(dir.clone(), config.clone()).await.unwrap();
        assert!(
            index.segment_readers().await.unwrap().len() >= 2,
            "Expected multiple segments"
        );

        let writer = IndexWriter::open(dir.clone(), config.clone())
            .await
            .unwrap();
        writer.force_merge().await.unwrap();

        let index = Index::open(dir, config).await.unwrap();
        assert_eq!(index.segment_readers().await.unwrap().len(), 1);
        assert_eq!(index.num_docs().await.unwrap(), 9);

        // All documents should survive the merge.
        let mut found_docs = 0;
        for i in 0..9 {
            if index.doc(i).await.unwrap().is_some() {
                found_docs += 1;
            }
        }
        assert_eq!(found_docs, 9);
    }

    #[tokio::test]
    async fn test_match_query() {
        let mut schema_builder = SchemaBuilder::default();
        let title = schema_builder.add_text_field("title", true, true);
        let body = schema_builder.add_text_field("body", true, true);
        let schema = schema_builder.build();

        let dir = RamDirectory::new();
        let config = IndexConfig::default();

        let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
            .await
            .unwrap();

        let mut doc1 = Document::new();
        doc1.add_text(title, "rust programming");
        doc1.add_text(body, "Learn rust language");
        writer.add_document(doc1).unwrap();

        let mut doc2 = Document::new();
        doc2.add_text(title, "python programming");
        doc2.add_text(body, "Learn python language");
        writer.add_document(doc2).unwrap();

        writer.commit().await.unwrap();

        let index = Index::open(dir, config).await.unwrap();

        let results = index.query("rust", 10).await.unwrap();
        assert_eq!(results.hits.len(), 1);

        let results = index.query("rust programming", 10).await.unwrap();
        assert!(!results.hits.is_empty());

        let hit = &results.hits[0];
        assert!(!hit.address.segment_id.is_empty(), "Should have segment_id");

        let doc = index.get_document(&hit.address).await.unwrap().unwrap();
        assert!(
            !doc.field_values().is_empty(),
            "Doc should have field values"
        );

        let doc = index.doc(0).await.unwrap().unwrap();
        assert!(
            !doc.field_values().is_empty(),
            "Doc should have field values"
        );
    }

    #[tokio::test]
    async fn test_slice_cache_warmup_and_load() {
        use crate::directories::SliceCachingDirectory;

        let mut schema_builder = SchemaBuilder::default();
        let title = schema_builder.add_text_field("title", true, true);
        let body = schema_builder.add_text_field("body", true, true);
        let schema = schema_builder.build();

        let dir = RamDirectory::new();
        let config = IndexConfig::default();

        let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
            .await
            .unwrap();

        for i in 0..10 {
            let mut doc = Document::new();
            doc.add_text(title, format!("Document {} about rust", i));
            doc.add_text(body, format!("This is body text number {}", i));
            writer.add_document(doc).unwrap();
        }
        writer.commit().await.unwrap();

        // Wrap the directory in a slice cache and verify that reads populate it.
        let caching_dir = SliceCachingDirectory::new(dir.clone(), 1024 * 1024);
        let index = Index::open(caching_dir, config.clone()).await.unwrap();

        let results = index.query("rust", 10).await.unwrap();
        assert!(!results.hits.is_empty());

        let stats = index.directory.stats();
        assert!(stats.total_bytes > 0, "Cache should have data after search");
    }

    #[tokio::test]
    async fn test_multivalue_field_indexing_and_search() {
        let mut schema_builder = SchemaBuilder::default();
        let uris = schema_builder.add_text_field("uris", true, true);
        let title = schema_builder.add_text_field("title", true, true);
        let schema = schema_builder.build();

        let dir = RamDirectory::new();
        let config = IndexConfig::default();

        let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
            .await
            .unwrap();

        let mut doc = Document::new();
        doc.add_text(uris, "one");
        doc.add_text(uris, "two");
        doc.add_text(title, "Test Document");
        writer.add_document(doc).unwrap();

        let mut doc2 = Document::new();
        doc2.add_text(uris, "three");
        doc2.add_text(title, "Another Document");
        writer.add_document(doc2).unwrap();

        writer.commit().await.unwrap();

        let index = Index::open(dir, config).await.unwrap();
        assert_eq!(index.num_docs().await.unwrap(), 2);

        let doc = index.doc(0).await.unwrap().unwrap();
        let all_uris: Vec<_> = doc.get_all(uris).collect();
        assert_eq!(all_uris.len(), 2, "Should have 2 uris values");
        assert_eq!(all_uris[0].as_text(), Some("one"));
        assert_eq!(all_uris[1].as_text(), Some("two"));

        let json = doc.to_json(index.schema());
        let uris_json = json.get("uris").unwrap();
        assert!(uris_json.is_array(), "Multi-value field should be an array");
        let uris_arr = uris_json.as_array().unwrap();
        assert_eq!(uris_arr.len(), 2);
        assert_eq!(uris_arr[0].as_str(), Some("one"));
        assert_eq!(uris_arr[1].as_str(), Some("two"));

        let results = index.query("uris:one", 10).await.unwrap();
        assert_eq!(results.hits.len(), 1, "Should find doc with 'one'");
        assert_eq!(results.hits[0].address.doc_id, 0);

        let results = index.query("uris:two", 10).await.unwrap();
        assert_eq!(results.hits.len(), 1, "Should find doc with 'two'");
        assert_eq!(results.hits[0].address.doc_id, 0);

        let results = index.query("uris:three", 10).await.unwrap();
        assert_eq!(results.hits.len(), 1, "Should find doc with 'three'");
        assert_eq!(results.hits[0].address.doc_id, 1);

        let results = index.query("uris:nonexistent", 10).await.unwrap();
        assert_eq!(results.hits.len(), 0, "Should not find non-existent value");
    }

    #[tokio::test]
    async fn test_wand_optimization_for_or_queries() {
        use crate::query::{BooleanQuery, TermQuery};

        let mut schema_builder = SchemaBuilder::default();
        let content = schema_builder.add_text_field("content", true, true);
        let schema = schema_builder.build();

        let dir = RamDirectory::new();
        let config = IndexConfig::default();

        let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
            .await
            .unwrap();

        let mut doc = Document::new();
        doc.add_text(content, "rust programming language is fast");
        writer.add_document(doc).unwrap();

        let mut doc = Document::new();
        doc.add_text(content, "rust is a systems language");
        writer.add_document(doc).unwrap();

        let mut doc = Document::new();
        doc.add_text(content, "programming is fun");
        writer.add_document(doc).unwrap();

        let mut doc = Document::new();
        doc.add_text(content, "python is easy to learn");
        writer.add_document(doc).unwrap();

        let mut doc = Document::new();
        doc.add_text(content, "rust rust programming programming systems");
        writer.add_document(doc).unwrap();

        writer.commit().await.unwrap();

        let index = Index::open(dir.clone(), config.clone()).await.unwrap();

        // OR query: documents containing "rust" or "programming".
        let or_query = BooleanQuery::new()
            .should(TermQuery::text(content, "rust"))
            .should(TermQuery::text(content, "programming"));

        let results = index.search(&or_query, 10).await.unwrap();

        assert_eq!(results.hits.len(), 4, "Should find exactly 4 documents");

        let doc_ids: Vec<u32> = results.hits.iter().map(|h| h.address.doc_id).collect();
        assert!(doc_ids.contains(&0), "Should find doc 0");
        assert!(doc_ids.contains(&1), "Should find doc 1");
        assert!(doc_ids.contains(&2), "Should find doc 2");
        assert!(doc_ids.contains(&4), "Should find doc 4");
        assert!(
            !doc_ids.contains(&3),
            "Should NOT find doc 3 (only has 'python')"
        );

        let single_query = BooleanQuery::new().should(TermQuery::text(content, "rust"));

        let results = index.search(&single_query, 10).await.unwrap();
        assert_eq!(results.hits.len(), 3, "Should find 3 documents with 'rust'");

        let must_query = BooleanQuery::new()
            .must(TermQuery::text(content, "rust"))
            .should(TermQuery::text(content, "programming"));

        let results = index.search(&must_query, 10).await.unwrap();
        assert_eq!(results.hits.len(), 3, "Should find 3 documents with 'rust'");

        let must_not_query = BooleanQuery::new()
            .should(TermQuery::text(content, "rust"))
            .should(TermQuery::text(content, "programming"))
            .must_not(TermQuery::text(content, "systems"));

        let results = index.search(&must_not_query, 10).await.unwrap();
        let doc_ids: Vec<u32> = results.hits.iter().map(|h| h.address.doc_id).collect();
        assert!(
            !doc_ids.contains(&1),
            "Should NOT find doc 1 (has 'systems')"
        );
        assert!(
            !doc_ids.contains(&4),
            "Should NOT find doc 4 (has 'systems')"
        );

        // The limit caps the number of returned hits.
        let or_query = BooleanQuery::new()
            .should(TermQuery::text(content, "rust"))
            .should(TermQuery::text(content, "programming"));

        let results = index.search(&or_query, 2).await.unwrap();
        assert_eq!(results.hits.len(), 2, "Should return only top 2 results");
    }

    #[tokio::test]
    async fn test_wand_results_match_standard_boolean() {
        use crate::query::{BooleanQuery, TermQuery, WandOrQuery};

        let mut schema_builder = SchemaBuilder::default();
        let content = schema_builder.add_text_field("content", true, true);
        let schema = schema_builder.build();

        let dir = RamDirectory::new();
        let config = IndexConfig::default();

        let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
            .await
            .unwrap();

        for i in 0..10 {
            let mut doc = Document::new();
            let text = match i % 4 {
                0 => "apple banana cherry",
                1 => "apple orange",
                2 => "banana grape",
                _ => "cherry date",
            };
            doc.add_text(content, text);
            writer.add_document(doc).unwrap();
        }

        writer.commit().await.unwrap();
        let index = Index::open(dir.clone(), config.clone()).await.unwrap();

        let wand_query = WandOrQuery::new(content).term("apple").term("banana");

        let bool_query = BooleanQuery::new()
            .should(TermQuery::text(content, "apple"))
            .should(TermQuery::text(content, "banana"));

        let wand_results = index.search(&wand_query, 10).await.unwrap();
        let bool_results = index.search(&bool_query, 10).await.unwrap();

        assert_eq!(
            wand_results.hits.len(),
            bool_results.hits.len(),
            "WAND and Boolean should find same number of docs"
        );

        let wand_docs: std::collections::HashSet<u32> =
            wand_results.hits.iter().map(|h| h.address.doc_id).collect();
        let bool_docs: std::collections::HashSet<u32> =
            bool_results.hits.iter().map(|h| h.address.doc_id).collect();

        assert_eq!(
            wand_docs, bool_docs,
            "WAND and Boolean should find same documents"
        );
    }

    #[tokio::test]
    async fn test_vector_index_threshold_switch() {
        use crate::dsl::{DenseVectorConfig, DenseVectorQuantization, VectorIndexType};

        let mut schema_builder = SchemaBuilder::default();
        let title = schema_builder.add_text_field("title", true, true);
        let embedding = schema_builder.add_dense_vector_field_with_config(
            "embedding",
            true,
            true,
            DenseVectorConfig {
                dim: 8,
                index_type: VectorIndexType::IvfRaBitQ,
                quantization: DenseVectorQuantization::F32,
                num_clusters: Some(4),
                nprobe: 2,
                build_threshold: Some(50),
            },
        );
        let schema = schema_builder.build();

        let dir = RamDirectory::new();
        let config = IndexConfig::default();

        let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
            .await
            .unwrap();

        // Index 30 documents: still below the build threshold of 50 vectors.
        for i in 0..30 {
            let mut doc = Document::new();
            doc.add_text(title, format!("Document {}", i));
            let vec: Vec<f32> = (0..8).map(|_| (i as f32) / 30.0).collect();
            doc.add_dense_vector(embedding, vec);
            writer.add_document(doc).unwrap();
        }
        writer.commit().await.unwrap();

        let index = Index::open(dir.clone(), config.clone()).await.unwrap();
        assert!(
            index.trained_centroids.is_empty(),
            "Should not have trained centroids below threshold"
        );

        let query_vec: Vec<f32> = vec![0.5; 8];
        let segments = index.segment_readers().await.unwrap();
        assert!(!segments.is_empty());

        let results = segments[0]
            .search_dense_vector(
                embedding,
                &query_vec,
                5,
                0,
                1,
                crate::query::MultiValueCombiner::Max,
            )
            .await
            .unwrap();
        assert!(!results.is_empty(), "Flat search should return results");

        let writer = IndexWriter::open(dir.clone(), config.clone())
            .await
            .unwrap();

        // Index 30 more documents so the total crosses the build threshold.
        for i in 30..60 {
            let mut doc = Document::new();
            doc.add_text(title, format!("Document {}", i));
            let vec: Vec<f32> = (0..8).map(|_| (i as f32) / 60.0).collect();
            doc.add_dense_vector(embedding, vec);
            writer.add_document(doc).unwrap();
        }
        writer.commit().await.unwrap();

        assert!(
            writer.is_vector_index_built(embedding).await,
            "Vector index should be built after crossing threshold"
        );

        let index = Index::open(dir.clone(), config.clone()).await.unwrap();
        assert!(
            index.trained_centroids.contains_key(&embedding.0),
            "Should have loaded trained centroids for embedding field"
        );

        let segments = index.segment_readers().await.unwrap();
        let results = segments[0]
            .search_dense_vector(
                embedding,
                &query_vec,
                5,
                0,
                1,
                crate::query::MultiValueCombiner::Max,
            )
            .await
            .unwrap();
        assert!(
            !results.is_empty(),
            "Search should return results after build"
        );

        let writer = IndexWriter::open(dir.clone(), config.clone())
            .await
            .unwrap();
        // Triggering the build explicitly should also succeed and leave the index built.
        writer.build_vector_index().await.unwrap();
        assert!(writer.is_vector_index_built(embedding).await);
    }
}