hermes_core/index/
mod.rs

//! Index - multi-segment async search index
//!
//! Components:
//! - Index: main entry point for searching
//! - IndexWriter: for adding documents and committing segments (native only)
//! - Supports multiple segments with merge
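//!
//! A minimal usage sketch (assuming a directory already populated via
//! `IndexWriter`, as in the tests at the bottom of this file):
//!
//! ```ignore
//! let index = Index::open(dir, IndexConfig::default()).await?;
//! let response = index.query("title:hello", 10).await?;
//! for hit in &response.hits {
//!     let doc = index.get_document(&hit.address).await?;
//! }
//! ```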

use std::path::Path;
use std::sync::Arc;

use parking_lot::RwLock;

use crate::DocId;
use crate::directories::{Directory, SliceCachingDirectory};
use crate::dsl::{Document, Field, Schema};
use crate::error::{Error, Result};
use crate::segment::{SegmentId, SegmentReader};
use crate::structures::BlockPostingList;

#[cfg(feature = "native")]
mod writer;
#[cfg(feature = "native")]
pub use writer::IndexWriter;

/// Default file name for the slice cache
pub const SLICE_CACHE_FILENAME: &str = "index.slicecache";

/// Index configuration
#[derive(Debug, Clone)]
pub struct IndexConfig {
    /// Number of threads for CPU-intensive tasks (search parallelism)
    pub num_threads: usize,
    /// Number of parallel segment builders (documents distributed round-robin)
    pub num_indexing_threads: usize,
    /// Number of threads for parallel block compression within each segment
    pub num_compression_threads: usize,
    /// Block cache size for term dictionary per segment
    pub term_cache_blocks: usize,
    /// Block cache size for document store per segment
    pub store_cache_blocks: usize,
    /// Max documents per segment before auto-commit
    pub max_docs_per_segment: u32,
    /// Merge policy for background segment merging
    pub merge_policy: Box<dyn crate::merge::MergePolicy>,
    /// Index optimization mode (adaptive, size-optimized, performance-optimized)
    pub optimization: crate::structures::IndexOptimization,
}

impl Default for IndexConfig {
    fn default() -> Self {
        #[cfg(feature = "native")]
        let cpus = num_cpus::get().max(1);
        #[cfg(not(feature = "native"))]
        let cpus = 1;

        Self {
            num_threads: cpus,
            num_indexing_threads: 1,
            num_compression_threads: cpus,
            term_cache_blocks: 256,
            store_cache_blocks: 32,
            max_docs_per_segment: 100_000,
            merge_policy: Box::new(crate::merge::TieredMergePolicy::default()),
            optimization: crate::structures::IndexOptimization::default(),
        }
    }
}
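
// A minimal sketch of overriding selected settings; struct-update syntax keeps
// the remaining defaults (both field names are defined on `IndexConfig` above):
//
//     let config = IndexConfig {
//         max_docs_per_segment: 500_000,
//         num_indexing_threads: 4,
//         ..Default::default()
//     };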

/// Multi-segment async Index
///
/// The main entry point for searching. Manages multiple segments
/// and provides unified search across all of them.
pub struct Index<D: Directory> {
    directory: Arc<D>,
    schema: Arc<Schema>,
    config: IndexConfig,
    segments: RwLock<Vec<Arc<SegmentReader>>>,
    default_fields: Vec<crate::Field>,
    tokenizers: Arc<crate::tokenizer::TokenizerRegistry>,
    #[cfg(feature = "native")]
    thread_pool: Arc<rayon::ThreadPool>,
}

impl<D: Directory> Index<D> {
    /// Open an existing index from a directory
    pub async fn open(directory: D, config: IndexConfig) -> Result<Self> {
        let directory = Arc::new(directory);

        // Read schema
        let schema_slice = directory.open_read(Path::new("schema.json")).await?;
        let schema_bytes = schema_slice.read_bytes().await?;
        let schema: Schema = serde_json::from_slice(schema_bytes.as_slice())
            .map_err(|e| Error::Serialization(e.to_string()))?;
        let schema = Arc::new(schema);

        // Read segment list
        let segments = Self::load_segments(&directory, &schema, &config).await?;

        #[cfg(feature = "native")]
        let thread_pool = {
            let pool = rayon::ThreadPoolBuilder::new()
                .num_threads(config.num_threads)
                .build()
                .map_err(|e| Error::Io(std::io::Error::other(e)))?;
            Arc::new(pool)
        };

        // Use schema's default_fields if specified, otherwise fall back to all indexed text fields
        let default_fields: Vec<crate::Field> = if !schema.default_fields().is_empty() {
            schema.default_fields().to_vec()
        } else {
            schema
                .fields()
                .filter(|(_, entry)| {
                    entry.indexed && entry.field_type == crate::dsl::FieldType::Text
                })
                .map(|(field, _)| field)
                .collect()
        };

        Ok(Self {
            directory,
            schema,
            config,
            segments: RwLock::new(segments),
            default_fields,
            tokenizers: Arc::new(crate::tokenizer::TokenizerRegistry::default()),
            #[cfg(feature = "native")]
            thread_pool,
        })
    }

    async fn load_segments(
        directory: &Arc<D>,
        schema: &Arc<Schema>,
        config: &IndexConfig,
    ) -> Result<Vec<Arc<SegmentReader>>> {
        // Read segments.json which lists all segment IDs
        let segments_path = Path::new("segments.json");
        if !directory.exists(segments_path).await? {
            return Ok(Vec::new());
        }

        let segments_slice = directory.open_read(segments_path).await?;
        let segments_bytes = segments_slice.read_bytes().await?;
        let segment_ids: Vec<String> = serde_json::from_slice(segments_bytes.as_slice())
            .map_err(|e| Error::Serialization(e.to_string()))?;

        let mut segments = Vec::new();
        let mut doc_id_offset = 0u32;

        for id_str in segment_ids {
            let segment_id = SegmentId::from_hex(&id_str)
                .ok_or_else(|| Error::Corruption(format!("Invalid segment ID: {}", id_str)))?;
            let reader = SegmentReader::open(
                directory.as_ref(),
                segment_id,
                Arc::clone(schema),
                doc_id_offset,
                config.term_cache_blocks,
            )
            .await?;

            doc_id_offset += reader.meta().num_docs;
            segments.push(Arc::new(reader));
        }

        Ok(segments)
    }
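
    // For reference, `segments.json` is a plain JSON array of segment IDs in
    // hex (illustrative placeholders, not real IDs):
    //
    //     ["<hex-segment-id-1>", "<hex-segment-id-2>"]
    //
    // Readers are opened in list order, so each segment's global doc-id range
    // starts at the running `doc_id_offset` accumulated above.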

    /// Get the schema
    pub fn schema(&self) -> &Schema {
        &self.schema
    }

    /// Get a reference to the underlying directory
    pub fn directory(&self) -> &D {
        &self.directory
    }

    /// Total number of documents across all segments
    pub fn num_docs(&self) -> u32 {
        self.segments.read().iter().map(|s| s.num_docs()).sum()
    }

    /// Get a document by global doc_id (async)
    pub async fn doc(&self, doc_id: DocId) -> Result<Option<Document>> {
        let segments = self.segments.read().clone();

        let mut offset = 0u32;
        for segment in segments.iter() {
            let segment_docs = segment.meta().num_docs;
            if doc_id < offset + segment_docs {
                let local_doc_id = doc_id - offset;
                return segment.doc(local_doc_id).await;
            }
            offset += segment_docs;
        }

        Ok(None)
    }
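
    // Worked example for the mapping above: with two segments holding 100 and
    // 50 docs, global doc_id 120 first satisfies `doc_id < offset + segment_docs`
    // at the second segment (120 < 100 + 50), giving local doc_id 120 - 100 = 20.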

    /// Get posting lists for a term across all segments (async)
    pub async fn get_postings(
        &self,
        field: Field,
        term: &[u8],
    ) -> Result<Vec<(Arc<SegmentReader>, BlockPostingList)>> {
        let segments = self.segments.read().clone();
        let mut results = Vec::new();

        for segment in segments.iter() {
            if let Some(postings) = segment.get_postings(field, term).await? {
                results.push((Arc::clone(segment), postings));
            }
        }

        Ok(results)
    }

    /// Execute CPU-intensive work on the thread pool (native only)
    #[cfg(feature = "native")]
    pub async fn spawn_blocking<F, R>(&self, f: F) -> R
    where
        F: FnOnce() -> R + Send + 'static,
        R: Send + 'static,
    {
        let (tx, rx) = tokio::sync::oneshot::channel();
        self.thread_pool.spawn(move || {
            let result = f();
            let _ = tx.send(result);
        });
        rx.await.expect("Thread pool task panicked")
    }
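
    // Sketch of typical use (the closure body is illustrative only): offload
    // CPU-bound work so the async executor threads are not blocked.
    //
    //     let sum = index
    //         .spawn_blocking(move || (0u64..1_000_000).sum::<u64>())
    //         .await;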

    /// Get segment readers for query execution
    pub fn segment_readers(&self) -> Vec<Arc<SegmentReader>> {
        self.segments.read().clone()
    }

    /// Reload segments from the directory (after new segments are added)
    pub async fn reload(&self) -> Result<()> {
        let new_segments = Self::load_segments(&self.directory, &self.schema, &self.config).await?;
        *self.segments.write() = new_segments;
        Ok(())
    }

    /// Search across all segments
    pub async fn search(
        &self,
        query: &dyn crate::query::Query,
        limit: usize,
    ) -> Result<Vec<crate::query::SearchResult>> {
        let segments = self.segments.read().clone();
        let mut all_results = Vec::new();

        for segment in &segments {
            let results = crate::query::search_segment(segment.as_ref(), query, limit).await?;
            all_results.extend(results);
        }

        // Sort by score descending
        all_results.sort_by(|a, b| {
            b.score
                .partial_cmp(&a.score)
                .unwrap_or(std::cmp::Ordering::Equal)
        });
        all_results.truncate(limit);

        Ok(all_results)
    }

    /// Search and return results with document addresses (no document content)
    pub async fn search_with_addresses(
        &self,
        query: &dyn crate::query::Query,
        limit: usize,
    ) -> Result<crate::query::SearchResponse> {
        self.search_with_addresses_offset(query, limit, 0).await
    }

    /// Search with offset for pagination
    pub async fn search_with_addresses_offset(
        &self,
        query: &dyn crate::query::Query,
        limit: usize,
        offset: usize,
    ) -> Result<crate::query::SearchResponse> {
        let segments = self.segments.read().clone();
        let mut all_results: Vec<(u128, crate::query::SearchResult)> = Vec::new();

        // Fetch enough results from each segment to cover offset + limit
        let fetch_limit = offset + limit;
        for segment in &segments {
            let segment_id = segment.meta().id;
            let results =
                crate::query::search_segment(segment.as_ref(), query, fetch_limit).await?;
            for result in results {
                all_results.push((segment_id, result));
            }
        }

        // Sort by score descending
        all_results.sort_by(|a, b| {
            b.1.score
                .partial_cmp(&a.1.score)
                .unwrap_or(std::cmp::Ordering::Equal)
        });
        // Count of fetched hits before pagination; each segment returns at most
        // fetch_limit results, so this is a lower bound on the true match count
        let total_hits = all_results.len() as u32;

        // Apply offset and limit
        let hits: Vec<crate::query::SearchHit> = all_results
            .into_iter()
            .skip(offset)
            .take(limit)
            .map(|(segment_id, result)| crate::query::SearchHit {
                address: crate::query::DocAddress::new(segment_id, result.doc_id),
                score: result.score,
            })
            .collect();

        Ok(crate::query::SearchResponse { hits, total_hits })
    }
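
    // Pagination sketch: page 3 with 10 hits per page means offset = 20, e.g.
    //
    //     index.query_offset("rust", 10, 20).await?;
    //
    // Each segment is asked for offset + limit = 30 candidates so that, after
    // the merged sort, skipping the first 20 still leaves a full page.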

    /// Get a document by its unique address (segment_id + local doc_id)
    pub async fn get_document(
        &self,
        address: &crate::query::DocAddress,
    ) -> Result<Option<Document>> {
        let segment_id = address
            .segment_id_u128()
            .ok_or_else(|| Error::Query(format!("Invalid segment ID: {}", address.segment_id)))?;

        let segments = self.segments.read().clone();
        for segment in &segments {
            if segment.meta().id == segment_id {
                return segment.doc(address.doc_id).await;
            }
        }

        Ok(None)
    }

    /// Get the default fields for this index
    pub fn default_fields(&self) -> &[crate::Field] {
        &self.default_fields
    }

    /// Set the default fields for query parsing
    pub fn set_default_fields(&mut self, fields: Vec<crate::Field>) {
        self.default_fields = fields;
    }

    /// Get the tokenizer registry
    pub fn tokenizers(&self) -> &Arc<crate::tokenizer::TokenizerRegistry> {
        &self.tokenizers
    }

    /// Create a query parser for this index
    ///
    /// If the schema contains query router rules, they will be used to route
    /// queries to specific fields based on regex patterns.
    pub fn query_parser(&self) -> crate::dsl::QueryLanguageParser {
        // Check if schema has query routers
        let query_routers = self.schema.query_routers();
        if !query_routers.is_empty() {
            // Try to create a router from the schema's rules
            if let Ok(router) = crate::dsl::QueryFieldRouter::from_rules(query_routers) {
                return crate::dsl::QueryLanguageParser::with_router(
                    Arc::clone(&self.schema),
                    self.default_fields.clone(),
                    Arc::clone(&self.tokenizers),
                    router,
                );
            }
        }

        // Fall back to parser without router
        crate::dsl::QueryLanguageParser::new(
            Arc::clone(&self.schema),
            self.default_fields.clone(),
            Arc::clone(&self.tokenizers),
        )
    }

    /// Parse and search using a query string
    ///
    /// Accepts both query language syntax (field:term, AND, OR, NOT, grouping)
    /// and simple text (tokenized and searched across default fields).
    /// Returns document addresses (segment_id + doc_id) without document content.
    pub async fn query(
        &self,
        query_str: &str,
        limit: usize,
    ) -> Result<crate::query::SearchResponse> {
        self.query_offset(query_str, limit, 0).await
    }

    /// Query with offset for pagination
    pub async fn query_offset(
        &self,
        query_str: &str,
        limit: usize,
        offset: usize,
    ) -> Result<crate::query::SearchResponse> {
        let parser = self.query_parser();
        let query = parser.parse(query_str).map_err(Error::Query)?;
        self.search_with_addresses_offset(query.as_ref(), limit, offset)
            .await
    }
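
    // Example query strings accepted by `query` / `query_offset`, using the
    // syntax described above (the exact grammar lives in `crate::dsl`):
    //
    //     index.query("rust", 10).await?;                      // default fields
    //     index.query("title:rust AND body:async", 10).await?; // fielded + boolean
    //     index.query("(title:rust OR title:go) NOT legacy", 10).await?;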
}

/// Methods for opening an index with slice caching
impl<D: Directory> Index<SliceCachingDirectory<D>> {
    /// Open an index with slice caching, automatically loading the cache file if present
    ///
    /// This wraps the directory in a SliceCachingDirectory and attempts to load
    /// any existing slice cache file to prefill the cache with hot data.
    pub async fn open_with_cache(
        directory: D,
        config: IndexConfig,
        cache_max_bytes: usize,
    ) -> Result<Self> {
        let caching_dir = SliceCachingDirectory::new(directory, cache_max_bytes);

        // Try to load an existing slice cache
        let cache_path = Path::new(SLICE_CACHE_FILENAME);
        if let Ok(true) = caching_dir.inner().exists(cache_path).await
            && let Ok(slice) = caching_dir.inner().open_read(cache_path).await
            && let Ok(bytes) = slice.read_bytes().await
        {
            let _ = caching_dir.deserialize(bytes.as_slice());
        }

        Self::open(caching_dir, config).await
    }

    /// Serialize the current slice cache to the index directory
    ///
    /// This saves all cached slices to a single file that can be loaded
    /// on subsequent index opens for faster startup.
    #[cfg(feature = "native")]
    pub async fn save_slice_cache(&self) -> Result<()>
    where
        D: crate::directories::DirectoryWriter,
    {
        let cache_data = self.directory.serialize();
        let cache_path = Path::new(SLICE_CACHE_FILENAME);
        self.directory
            .inner()
            .write(cache_path, &cache_data)
            .await?;
        Ok(())
    }

    /// Get slice cache statistics
    pub fn slice_cache_stats(&self) -> crate::directories::SliceCacheStats {
        self.directory.stats()
    }
}
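
// Cache workflow sketch (mirrors `test_slice_cache_warmup_and_load` below):
//
//     let index = Index::open_with_cache(dir.clone(), config, 1 << 20).await?;
//     index.query("rust", 10).await?;   // reads during search populate the cache
//     index.save_slice_cache().await?;  // persists SLICE_CACHE_FILENAME
//     // later opens via open_with_cache start from the saved cache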

/// Warm up the slice cache by opening an index and performing typical read operations
///
/// This function opens an index using a SliceCachingDirectory, performs operations
/// that would typically be done during search (reading term dictionaries, posting lists),
/// and then serializes the cache to a file for future use.
///
/// The resulting cache file contains all the "hot" data that was read during warmup,
/// allowing subsequent index opens to prefill the cache and avoid cold-start latency.
#[cfg(feature = "native")]
pub async fn warmup_and_save_slice_cache<D: crate::directories::DirectoryWriter>(
    directory: D,
    config: IndexConfig,
    cache_max_bytes: usize,
) -> Result<()> {
    let caching_dir = SliceCachingDirectory::new(directory, cache_max_bytes);
    let index = Index::open(caching_dir, config).await?;

    // Warm up by loading segment metadata and term dictionaries.
    // SegmentReader::open already reads the essential metadata;
    // additional warmup can be done by iterating terms or running sample queries.

    // Save the cache
    index.save_slice_cache().await?;

    Ok(())
}
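
// Typical call (a sketch; `fs_dir` stands in for any DirectoryWriter and the
// 64 MiB budget is illustrative):
//
//     warmup_and_save_slice_cache(fs_dir, IndexConfig::default(), 64 << 20).await?;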

#[cfg(feature = "native")]
impl<D: Directory> Clone for Index<D> {
    fn clone(&self) -> Self {
        Self {
            directory: Arc::clone(&self.directory),
            schema: Arc::clone(&self.schema),
            config: self.config.clone(),
            segments: RwLock::new(self.segments.read().clone()),
            default_fields: self.default_fields.clone(),
            tokenizers: Arc::clone(&self.tokenizers),
            thread_pool: Arc::clone(&self.thread_pool),
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::directories::RamDirectory;
    use crate::dsl::SchemaBuilder;

    #[tokio::test]
    async fn test_index_create_and_search() {
        let mut schema_builder = SchemaBuilder::default();
        let title = schema_builder.add_text_field("title", true, true);
        let body = schema_builder.add_text_field("body", true, true);
        let schema = schema_builder.build();

        let dir = RamDirectory::new();
        let config = IndexConfig::default();

        // Create index and add documents
        let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
            .await
            .unwrap();

        let mut doc1 = Document::new();
        doc1.add_text(title, "Hello World");
        doc1.add_text(body, "This is the first document");
        writer.add_document(doc1).await.unwrap();

        let mut doc2 = Document::new();
        doc2.add_text(title, "Goodbye World");
        doc2.add_text(body, "This is the second document");
        writer.add_document(doc2).await.unwrap();

        writer.commit().await.unwrap();

        // Open for reading
        let index = Index::open(dir, config).await.unwrap();
        assert_eq!(index.num_docs(), 2);

        // Check postings
        let postings = index.get_postings(title, b"world").await.unwrap();
        assert_eq!(postings.len(), 1); // One segment
        assert_eq!(postings[0].1.doc_count(), 2); // Two docs with "world"

        // Retrieve document
        let doc = index.doc(0).await.unwrap().unwrap();
        assert_eq!(doc.get_first(title).unwrap().as_text(), Some("Hello World"));
    }

    #[tokio::test]
    async fn test_multiple_segments() {
        let mut schema_builder = SchemaBuilder::default();
        let title = schema_builder.add_text_field("title", true, true);
        let schema = schema_builder.build();

        let dir = RamDirectory::new();
        let config = IndexConfig {
            max_docs_per_segment: 5, // Small segments for testing
            ..Default::default()
        };

        let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
            .await
            .unwrap();

        // Add documents in batches to create multiple segments
        for batch in 0..3 {
            for i in 0..5 {
                let mut doc = Document::new();
                doc.add_text(title, format!("Document {} batch {}", i, batch));
                writer.add_document(doc).await.unwrap();
            }
            writer.commit().await.unwrap();
        }

        // Open and check
        let index = Index::open(dir, config).await.unwrap();
        assert_eq!(index.num_docs(), 15);
        assert_eq!(index.segment_readers().len(), 3);
    }

    #[tokio::test]
    async fn test_segment_merge() {
        let mut schema_builder = SchemaBuilder::default();
        let title = schema_builder.add_text_field("title", true, true);
        let schema = schema_builder.build();

        let dir = RamDirectory::new();
        let config = IndexConfig {
            max_docs_per_segment: 3,
            ..Default::default()
        };

        let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
            .await
            .unwrap();

        // Create multiple segments
        for i in 0..9 {
            let mut doc = Document::new();
            doc.add_text(title, format!("Document {}", i));
            writer.add_document(doc).await.unwrap();
        }
        writer.commit().await.unwrap();

        // Should have 3 segments
        let index = Index::open(dir.clone(), config.clone()).await.unwrap();
        assert_eq!(index.segment_readers().len(), 3);

        // Force merge
        let writer = IndexWriter::open(dir.clone(), config.clone())
            .await
            .unwrap();
        writer.force_merge().await.unwrap();

        // Should have 1 segment now
        let index = Index::open(dir, config).await.unwrap();
        assert_eq!(index.segment_readers().len(), 1);
        assert_eq!(index.num_docs(), 9);

        // Verify all documents accessible
        for i in 0..9 {
            let doc = index.doc(i).await.unwrap().unwrap();
            assert_eq!(
                doc.get_first(title).unwrap().as_text(),
                Some(format!("Document {}", i).as_str())
            );
        }
    }

    #[tokio::test]
    async fn test_match_query() {
        let mut schema_builder = SchemaBuilder::default();
        let title = schema_builder.add_text_field("title", true, true);
        let body = schema_builder.add_text_field("body", true, true);
        let schema = schema_builder.build();

        let dir = RamDirectory::new();
        let config = IndexConfig::default();

        let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
            .await
            .unwrap();

        let mut doc1 = Document::new();
        doc1.add_text(title, "rust programming");
        doc1.add_text(body, "Learn rust language");
        writer.add_document(doc1).await.unwrap();

        let mut doc2 = Document::new();
        doc2.add_text(title, "python programming");
        doc2.add_text(body, "Learn python language");
        writer.add_document(doc2).await.unwrap();

        writer.commit().await.unwrap();

        let index = Index::open(dir, config).await.unwrap();

        // Test match query with multiple default fields
        let results = index.query("rust", 10).await.unwrap();
        assert_eq!(results.hits.len(), 1);

        // Test match query with multiple tokens
        let results = index.query("rust programming", 10).await.unwrap();
        assert!(!results.hits.is_empty());

        // Verify hit has address (segment_id + doc_id)
        let hit = &results.hits[0];
        assert!(!hit.address.segment_id.is_empty(), "Should have segment_id");

        // Verify document retrieval by address
        let doc = index.get_document(&hit.address).await.unwrap().unwrap();
        assert!(
            !doc.field_values().is_empty(),
            "Doc should have field values"
        );

        // Also verify doc retrieval directly by global doc_id
        let doc = index.doc(0).await.unwrap().unwrap();
        assert!(
            !doc.field_values().is_empty(),
            "Doc should have field values"
        );
    }

    #[tokio::test]
    async fn test_slice_cache_warmup_and_load() {
        use crate::directories::SliceCachingDirectory;

        let mut schema_builder = SchemaBuilder::default();
        let title = schema_builder.add_text_field("title", true, true);
        let body = schema_builder.add_text_field("body", true, true);
        let schema = schema_builder.build();

        let dir = RamDirectory::new();
        let config = IndexConfig::default();

        // Create index with some documents
        let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
            .await
            .unwrap();

        for i in 0..10 {
            let mut doc = Document::new();
            doc.add_text(title, format!("Document {} about rust", i));
            doc.add_text(body, format!("This is body text number {}", i));
            writer.add_document(doc).await.unwrap();
        }
        writer.commit().await.unwrap();

        // Open with slice caching and perform some operations to warm up cache
        let caching_dir = SliceCachingDirectory::new(dir.clone(), 1024 * 1024);
        let index = Index::open(caching_dir, config.clone()).await.unwrap();

        // Perform a search to warm up the cache
        let results = index.query("rust", 10).await.unwrap();
        assert!(!results.hits.is_empty());

        // Check cache stats - should have cached some data
        let stats = index.slice_cache_stats();
        assert!(stats.total_bytes > 0, "Cache should have data after search");

        // Save the cache
        index.save_slice_cache().await.unwrap();

        // Verify cache file was written
        assert!(dir.exists(Path::new(SLICE_CACHE_FILENAME)).await.unwrap());

        // Now open with cache loading
        let index2 = Index::open_with_cache(dir.clone(), config.clone(), 1024 * 1024)
            .await
            .unwrap();

        // Cache should be prefilled
        let stats2 = index2.slice_cache_stats();
        assert!(
            stats2.total_bytes > 0,
            "Cache should be prefilled from file"
        );

        // Search should still work
        let results2 = index2.query("rust", 10).await.unwrap();
        assert_eq!(results.hits.len(), results2.hits.len());
    }

    #[tokio::test]
    async fn test_multivalue_field_indexing_and_search() {
        let mut schema_builder = SchemaBuilder::default();
        let uris = schema_builder.add_text_field("uris", true, true);
        let title = schema_builder.add_text_field("title", true, true);
        let schema = schema_builder.build();

        let dir = RamDirectory::new();
        let config = IndexConfig::default();

        // Create index and add document with multi-value field
        let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
            .await
            .unwrap();

        let mut doc = Document::new();
        doc.add_text(uris, "one");
        doc.add_text(uris, "two");
        doc.add_text(title, "Test Document");
        writer.add_document(doc).await.unwrap();

        // Add another document with different uris
        let mut doc2 = Document::new();
        doc2.add_text(uris, "three");
        doc2.add_text(title, "Another Document");
        writer.add_document(doc2).await.unwrap();

        writer.commit().await.unwrap();

        // Open for reading
        let index = Index::open(dir, config).await.unwrap();
        assert_eq!(index.num_docs(), 2);

        // Verify document retrieval preserves all values
        let doc = index.doc(0).await.unwrap().unwrap();
        let all_uris: Vec<_> = doc.get_all(uris).collect();
        assert_eq!(all_uris.len(), 2, "Should have 2 uris values");
        assert_eq!(all_uris[0].as_text(), Some("one"));
        assert_eq!(all_uris[1].as_text(), Some("two"));

        // Verify to_json returns array for multi-value field
        let json = doc.to_json(index.schema());
        let uris_json = json.get("uris").unwrap();
        assert!(uris_json.is_array(), "Multi-value field should be an array");
        let uris_arr = uris_json.as_array().unwrap();
        assert_eq!(uris_arr.len(), 2);
        assert_eq!(uris_arr[0].as_str(), Some("one"));
        assert_eq!(uris_arr[1].as_str(), Some("two"));

        // Verify both values are searchable
        let results = index.query("uris:one", 10).await.unwrap();
        assert_eq!(results.hits.len(), 1, "Should find doc with 'one'");
        assert_eq!(results.hits[0].address.doc_id, 0);

        let results = index.query("uris:two", 10).await.unwrap();
        assert_eq!(results.hits.len(), 1, "Should find doc with 'two'");
        assert_eq!(results.hits[0].address.doc_id, 0);

        let results = index.query("uris:three", 10).await.unwrap();
        assert_eq!(results.hits.len(), 1, "Should find doc with 'three'");
        assert_eq!(results.hits[0].address.doc_id, 1);

        // Verify searching for non-existent value returns no results
        let results = index.query("uris:nonexistent", 10).await.unwrap();
        assert_eq!(results.hits.len(), 0, "Should not find non-existent value");
    }
}
815}