ipfrs_semantic/
hybrid.rs

1//! Hybrid search combining vector similarity with metadata filtering
2//!
3//! This module provides a unified search interface that combines
4//! semantic vector search with attribute-based filtering.
5
6use crate::hnsw::{DistanceMetric, SearchResult, VectorIndex};
7use crate::metadata::{Metadata, MetadataFilter, MetadataStore, TemporalOptions};
8use crate::stats::{IndexHealth, IndexStats, MemoryUsage, PerfTimer, StatsSnapshot};
9use ipfrs_core::{Cid, Error, Result};
10use lru::LruCache;
11use serde::{Deserialize, Serialize};
12use std::collections::HashSet;
13use std::num::NonZeroUsize;
14use std::sync::{Arc, RwLock};
15
16/// Hybrid search configuration
17#[derive(Debug, Clone)]
18pub struct HybridConfig {
19    /// Vector dimension
20    pub dimension: usize,
21    /// Distance metric
22    pub metric: DistanceMetric,
23    /// HNSW max connections
24    pub max_connections: usize,
25    /// HNSW ef_construction
26    pub ef_construction: usize,
27    /// Default ef_search
28    pub ef_search: usize,
29    /// Query cache size
30    pub cache_size: usize,
31    /// Enable statistics collection
32    pub collect_stats: bool,
33    /// Filtering strategy
34    pub filter_strategy: FilterStrategy,
35}
36
37impl Default for HybridConfig {
38    fn default() -> Self {
39        Self {
40            dimension: 768,
41            metric: DistanceMetric::Cosine,
42            max_connections: 16,
43            ef_construction: 200,
44            ef_search: 50,
45            cache_size: 1000,
46            collect_stats: true,
47            filter_strategy: FilterStrategy::Auto,
48        }
49    }
50}
51
52/// Strategy for applying filters
53#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
54pub enum FilterStrategy {
55    /// Automatically choose based on selectivity
56    Auto,
57    /// Filter before vector search (pre-filtering)
58    PreFilter,
59    /// Filter after vector search (post-filtering)
60    PostFilter,
61}
62
63/// Hybrid search query
64#[derive(Debug, Clone)]
65pub struct HybridQuery {
66    /// Query vector
67    pub vector: Vec<f32>,
68    /// Number of results to return
69    pub k: usize,
70    /// Metadata filter (optional)
71    pub filter: Option<MetadataFilter>,
72    /// Temporal options (optional)
73    pub temporal: Option<TemporalOptions>,
74    /// Minimum similarity score
75    pub min_score: Option<f32>,
76    /// Override ef_search parameter
77    pub ef_search: Option<usize>,
78    /// Include metadata in results
79    pub include_metadata: bool,
80}
81
82impl HybridQuery {
83    /// Create a simple k-NN query
84    pub fn knn(vector: Vec<f32>, k: usize) -> Self {
85        Self {
86            vector,
87            k,
88            filter: None,
89            temporal: None,
90            min_score: None,
91            ef_search: None,
92            include_metadata: false,
93        }
94    }
95
96    /// Add a metadata filter
97    pub fn with_filter(mut self, filter: MetadataFilter) -> Self {
98        self.filter = Some(filter);
99        self
100    }
101
102    /// Add temporal options
103    pub fn with_temporal(mut self, temporal: TemporalOptions) -> Self {
104        self.temporal = Some(temporal);
105        self
106    }
107
108    /// Set minimum score threshold
109    pub fn with_min_score(mut self, min_score: f32) -> Self {
110        self.min_score = Some(min_score);
111        self
112    }
113
114    /// Include metadata in results
115    pub fn with_metadata(mut self) -> Self {
116        self.include_metadata = true;
117        self
118    }
119
120    /// Override ef_search parameter
121    pub fn with_ef_search(mut self, ef_search: usize) -> Self {
122        self.ef_search = Some(ef_search);
123        self
124    }
125}
126
127/// Hybrid search result with optional metadata
128#[derive(Debug, Clone)]
129pub struct HybridResult {
130    /// Content identifier
131    pub cid: Cid,
132    /// Similarity score
133    pub score: f32,
134    /// Metadata (if requested)
135    pub metadata: Option<Metadata>,
136}
137
138impl From<SearchResult> for HybridResult {
139    fn from(result: SearchResult) -> Self {
140        Self {
141            cid: result.cid,
142            score: result.score,
143            metadata: None,
144        }
145    }
146}
147
148/// Hybrid search response
149#[derive(Debug, Clone)]
150pub struct HybridResponse {
151    /// Search results
152    pub results: Vec<HybridResult>,
153    /// Total candidates evaluated
154    pub total_evaluated: usize,
155    /// Search latency in microseconds
156    pub latency_us: u64,
157    /// Filter strategy used
158    pub strategy_used: FilterStrategy,
159}
160
161/// Hybrid search index combining HNSW with metadata
162pub struct HybridIndex {
163    /// Vector index
164    vector_index: Arc<RwLock<VectorIndex>>,
165    /// Metadata store
166    metadata_store: Arc<MetadataStore>,
167    /// Configuration
168    config: HybridConfig,
169    /// Statistics
170    stats: Arc<IndexStats>,
171    /// Query cache
172    cache: Arc<RwLock<LruCache<u64, Vec<HybridResult>>>>,
173}
174
175impl HybridIndex {
176    /// Create a new hybrid index
177    pub fn new(config: HybridConfig) -> Result<Self> {
178        let vector_index = VectorIndex::new(
179            config.dimension,
180            config.metric,
181            config.max_connections,
182            config.ef_construction,
183        )?;
184
185        let cache_size =
186            NonZeroUsize::new(config.cache_size).unwrap_or(NonZeroUsize::new(1000).unwrap());
187
188        Ok(Self {
189            vector_index: Arc::new(RwLock::new(vector_index)),
190            metadata_store: Arc::new(MetadataStore::new()),
191            config,
192            stats: Arc::new(IndexStats::new()),
193            cache: Arc::new(RwLock::new(LruCache::new(cache_size))),
194        })
195    }
196
197    /// Create with default configuration
198    pub fn with_defaults() -> Result<Self> {
199        Self::new(HybridConfig::default())
200    }
201
202    /// Insert a vector with metadata
203    pub fn insert(&self, cid: &Cid, vector: &[f32], metadata: Option<Metadata>) -> Result<()> {
204        let timer = PerfTimer::start();
205
206        // Insert into vector index
207        self.vector_index.write().unwrap().insert(cid, vector)?;
208
209        // Insert metadata if provided
210        if let Some(meta) = metadata {
211            self.metadata_store.insert(*cid, meta)?;
212        } else {
213            // Create minimal metadata with timestamp
214            self.metadata_store.insert(*cid, Metadata::new())?;
215        }
216
217        if self.config.collect_stats {
218            self.stats.record_insert(timer.stop());
219        }
220
221        // Invalidate cache
222        self.cache.write().unwrap().clear();
223
224        Ok(())
225    }
226
227    /// Insert multiple vectors with metadata in batch
228    pub fn insert_batch(&self, items: &[(Cid, Vec<f32>, Option<Metadata>)]) -> Result<()> {
229        for (cid, vector, metadata) in items {
230            self.insert(cid, vector, metadata.clone())?;
231        }
232        Ok(())
233    }
234
235    /// Delete a vector and its metadata
236    pub fn delete(&self, cid: &Cid) -> Result<()> {
237        self.vector_index.write().unwrap().delete(cid)?;
238        self.metadata_store.remove(cid)?;
239
240        if self.config.collect_stats {
241            self.stats.record_delete();
242        }
243
244        // Invalidate cache
245        self.cache.write().unwrap().clear();
246
247        Ok(())
248    }
249
250    /// Perform hybrid search
251    pub async fn search(&self, query: HybridQuery) -> Result<HybridResponse> {
252        let timer = PerfTimer::start();
253
254        // Determine filter strategy
255        let strategy = self.determine_strategy(&query);
256        let mut total_evaluated = 0;
257
258        let results = match strategy {
259            FilterStrategy::PreFilter => {
260                self.search_pre_filter(&query, &mut total_evaluated).await?
261            }
262            FilterStrategy::PostFilter | FilterStrategy::Auto => {
263                self.search_post_filter(&query, &mut total_evaluated)
264                    .await?
265            }
266        };
267
268        let latency = timer.stop();
269
270        if self.config.collect_stats {
271            self.stats.record_search(latency, query.k, results.len());
272        }
273
274        Ok(HybridResponse {
275            results,
276            total_evaluated,
277            latency_us: latency.as_micros() as u64,
278            strategy_used: strategy,
279        })
280    }
281
282    /// Pre-filter strategy: filter first, then search on subset
283    async fn search_pre_filter(
284        &self,
285        query: &HybridQuery,
286        total_evaluated: &mut usize,
287    ) -> Result<Vec<HybridResult>> {
288        // Get candidate CIDs from filter
289        let candidates: HashSet<Cid> = if let Some(ref filter) = query.filter {
290            self.metadata_store.filter(filter).into_iter().collect()
291        } else {
292            // No filter, use all CIDs
293            self.metadata_store.cids().into_iter().collect()
294        };
295
296        // Apply temporal filter if present
297        let candidates = if let Some(ref temporal) = query.temporal {
298            let time_filtered = self
299                .metadata_store
300                .get_by_time_range(temporal.start, temporal.end);
301            candidates
302                .intersection(&time_filtered.into_iter().collect())
303                .copied()
304                .collect()
305        } else {
306            candidates
307        };
308
309        *total_evaluated = candidates.len();
310
311        if candidates.is_empty() {
312            return Ok(Vec::new());
313        }
314
315        // Search vector index
316        let ef_search = query.ef_search.unwrap_or(self.config.ef_search);
317        let fetch_k = (query.k * 3).max(100); // Fetch more to account for filtering
318
319        let search_results =
320            self.vector_index
321                .read()
322                .unwrap()
323                .search(&query.vector, fetch_k, ef_search)?;
324
325        // Filter results to candidates
326        let mut results: Vec<HybridResult> = search_results
327            .into_iter()
328            .filter(|r| candidates.contains(&r.cid))
329            .map(|r| {
330                let mut hr = HybridResult::from(r);
331                // Apply recency boost
332                if let Some(ref temporal) = query.temporal {
333                    if let Some(meta) = self.metadata_store.get(&hr.cid) {
334                        let boost = temporal.recency_multiplier(meta.created_at);
335                        hr.score *= boost;
336                    }
337                }
338                hr
339            })
340            .collect();
341
342        // Apply min score filter
343        if let Some(min_score) = query.min_score {
344            results.retain(|r| r.score >= min_score);
345        }
346
347        // Sort by score and truncate
348        results.sort_by(|a, b| {
349            b.score
350                .partial_cmp(&a.score)
351                .unwrap_or(std::cmp::Ordering::Equal)
352        });
353        results.truncate(query.k);
354
355        // Add metadata if requested
356        if query.include_metadata {
357            for result in &mut results {
358                result.metadata = self.metadata_store.get(&result.cid);
359            }
360        }
361
362        Ok(results)
363    }
364
365    /// Post-filter strategy: search first, then filter results
366    async fn search_post_filter(
367        &self,
368        query: &HybridQuery,
369        total_evaluated: &mut usize,
370    ) -> Result<Vec<HybridResult>> {
371        let ef_search = query.ef_search.unwrap_or(self.config.ef_search);
372
373        // Fetch more results to account for filtering
374        let fetch_k = if query.filter.is_some() || query.temporal.is_some() {
375            (query.k * 5).max(100)
376        } else {
377            query.k
378        };
379
380        let search_results =
381            self.vector_index
382                .read()
383                .unwrap()
384                .search(&query.vector, fetch_k, ef_search)?;
385
386        *total_evaluated = search_results.len();
387
388        let mut results: Vec<HybridResult> = search_results
389            .into_iter()
390            .filter_map(|r| {
391                // Apply metadata filter
392                if let Some(ref filter) = query.filter {
393                    if let Some(meta) = self.metadata_store.get(&r.cid) {
394                        if !filter.matches(&meta) {
395                            return None;
396                        }
397                    } else {
398                        return None; // No metadata, filter out
399                    }
400                }
401
402                // Apply temporal filter
403                if let Some(ref temporal) = query.temporal {
404                    if let Some(meta) = self.metadata_store.get(&r.cid) {
405                        if let (Some(start), Some(end)) = (temporal.start, temporal.end) {
406                            if meta.created_at < start || meta.created_at > end {
407                                return None;
408                            }
409                        }
410                    }
411                }
412
413                let mut hr = HybridResult::from(r);
414
415                // Apply recency boost
416                if let Some(ref temporal) = query.temporal {
417                    if let Some(meta) = self.metadata_store.get(&hr.cid) {
418                        let boost = temporal.recency_multiplier(meta.created_at);
419                        hr.score *= boost;
420                    }
421                }
422
423                Some(hr)
424            })
425            .collect();
426
427        // Apply min score filter
428        if let Some(min_score) = query.min_score {
429            results.retain(|r| r.score >= min_score);
430        }
431
432        // Re-sort if recency boost was applied
433        if query.temporal.is_some() {
434            results.sort_by(|a, b| {
435                b.score
436                    .partial_cmp(&a.score)
437                    .unwrap_or(std::cmp::Ordering::Equal)
438            });
439        }
440
441        results.truncate(query.k);
442
443        // Add metadata if requested
444        if query.include_metadata {
445            for result in &mut results {
446                result.metadata = self.metadata_store.get(&result.cid);
447            }
448        }
449
450        Ok(results)
451    }
452
453    /// Determine the best filter strategy
454    fn determine_strategy(&self, query: &HybridQuery) -> FilterStrategy {
455        if self.config.filter_strategy != FilterStrategy::Auto {
456            return self.config.filter_strategy;
457        }
458
459        // Estimate selectivity
460        let total_count = self.metadata_store.len();
461        if total_count == 0 {
462            return FilterStrategy::PostFilter;
463        }
464
465        // If no filter, use post-filter (simpler path)
466        if query.filter.is_none() && query.temporal.is_none() {
467            return FilterStrategy::PostFilter;
468        }
469
470        // Estimate filter selectivity
471        let filtered_count = if let Some(ref filter) = query.filter {
472            self.metadata_store.filter(filter).len()
473        } else {
474            total_count
475        };
476
477        let selectivity = filtered_count as f64 / total_count as f64;
478
479        // Pre-filter if highly selective (< 10% of data)
480        // Post-filter if less selective (more data passes)
481        if selectivity < 0.1 {
482            FilterStrategy::PreFilter
483        } else {
484            FilterStrategy::PostFilter
485        }
486    }
487
488    /// Get the number of indexed vectors
489    pub fn len(&self) -> usize {
490        self.vector_index.read().unwrap().len()
491    }
492
493    /// Check if the index is empty
494    pub fn is_empty(&self) -> bool {
495        self.len() == 0
496    }
497
498    /// Check if a CID exists
499    pub fn contains(&self, cid: &Cid) -> bool {
500        self.vector_index.read().unwrap().contains(cid)
501    }
502
503    /// Get metadata for a CID
504    pub fn get_metadata(&self, cid: &Cid) -> Option<Metadata> {
505        self.metadata_store.get(cid)
506    }
507
508    /// Update metadata for a CID (without changing the vector)
509    pub fn update_metadata(&self, cid: &Cid, metadata: Metadata) -> Result<()> {
510        if !self.contains(cid) {
511            return Err(Error::NotFound(format!("CID not in index: {}", cid)));
512        }
513        self.metadata_store.insert(*cid, metadata)?;
514        Ok(())
515    }
516
517    /// Get statistics snapshot
518    pub fn stats(&self) -> StatsSnapshot {
519        self.stats.snapshot()
520    }
521
522    /// Get index health metrics
523    pub fn health(&self) -> IndexHealth {
524        let stats = self.stats.snapshot();
525        IndexHealth::analyze(self.len(), self.config.dimension, Some(&stats))
526    }
527
528    /// Get memory usage estimate
529    pub fn memory_usage(&self) -> MemoryUsage {
530        MemoryUsage::estimate(
531            self.len(),
532            self.config.dimension,
533            self.metadata_store.len(),
534            self.config.cache_size,
535        )
536    }
537
538    /// Get facet counts for a field
539    pub fn facet_counts(&self, field: &str) -> std::collections::HashMap<String, usize> {
540        self.metadata_store.get_facet_counts(field)
541    }
542
543    /// Clear the search cache
544    pub fn clear_cache(&self) {
545        self.cache.write().unwrap().clear();
546    }
547
548    /// Reset statistics
549    pub fn reset_stats(&self) {
550        self.stats.reset();
551    }
552
553    /// Save the index to a path
554    pub async fn save(&self, path: impl AsRef<std::path::Path>) -> Result<()> {
555        self.vector_index.read().unwrap().save(path)
556    }
557
558    /// Clear all data
559    pub fn clear(&self) -> Result<()> {
560        // Create new empty vector index
561        let new_index = VectorIndex::new(
562            self.config.dimension,
563            self.config.metric,
564            self.config.max_connections,
565            self.config.ef_construction,
566        )?;
567
568        *self.vector_index.write().unwrap() = new_index;
569        self.metadata_store.clear();
570        self.cache.write().unwrap().clear();
571        self.stats.reset();
572
573        Ok(())
574    }
575
576    /// Prune entries older than the given TTL (time-to-live in seconds)
577    ///
578    /// Removes vectors and metadata for entries that were created more than
579    /// `ttl_seconds` ago.
580    ///
581    /// # Arguments
582    /// * `ttl_seconds` - Maximum age in seconds for entries to keep
583    ///
584    /// # Returns
585    /// Number of entries pruned
586    pub fn prune_by_ttl(&self, ttl_seconds: u64) -> Result<usize> {
587        let now = std::time::SystemTime::now()
588            .duration_since(std::time::UNIX_EPOCH)
589            .unwrap_or_default()
590            .as_secs();
591
592        let cutoff = now.saturating_sub(ttl_seconds);
593
594        self.prune_older_than(cutoff)
595    }
596
597    /// Prune entries created before a specific timestamp
598    ///
599    /// # Arguments
600    /// * `timestamp` - Unix timestamp; entries created before this are removed
601    ///
602    /// # Returns
603    /// Number of entries pruned
604    pub fn prune_older_than(&self, timestamp: u64) -> Result<usize> {
605        // Find CIDs to remove
606        let cids_to_remove: Vec<Cid> = self
607            .metadata_store
608            .cids()
609            .into_iter()
610            .filter(|cid| {
611                self.metadata_store
612                    .get(cid)
613                    .map(|m| m.created_at < timestamp)
614                    .unwrap_or(false)
615            })
616            .collect();
617
618        let count = cids_to_remove.len();
619
620        // Remove from both indexes
621        for cid in &cids_to_remove {
622            let _ = self.vector_index.write().unwrap().delete(cid);
623            let _ = self.metadata_store.remove(cid);
624        }
625
626        // Clear cache since data has changed
627        self.cache.write().unwrap().clear();
628
629        Ok(count)
630    }
631
632    /// Prune entries keeping only the N most recently created
633    ///
634    /// # Arguments
635    /// * `max_entries` - Maximum number of entries to keep
636    ///
637    /// # Returns
638    /// Number of entries pruned
639    pub fn prune_to_max_entries(&self, max_entries: usize) -> Result<usize> {
640        let current_count = self.len();
641        if current_count <= max_entries {
642            return Ok(0);
643        }
644
645        // Get all CIDs with their creation timestamps
646        let mut entries: Vec<(Cid, u64)> = self
647            .metadata_store
648            .cids()
649            .into_iter()
650            .filter_map(|cid| self.metadata_store.get(&cid).map(|m| (cid, m.created_at)))
651            .collect();
652
653        // Sort by creation time (oldest first)
654        entries.sort_by_key(|(_, ts)| *ts);
655
656        // Calculate how many to remove
657        let to_remove = current_count - max_entries;
658
659        // Remove the oldest entries
660        for (cid, _) in entries.iter().take(to_remove) {
661            let _ = self.vector_index.write().unwrap().delete(cid);
662            let _ = self.metadata_store.remove(cid);
663        }
664
665        // Clear cache
666        self.cache.write().unwrap().clear();
667
668        Ok(to_remove)
669    }
670
671    /// Prune entries by LRU (Least Recently Updated)
672    ///
673    /// Removes entries that haven't been updated recently, keeping
674    /// only the most recently updated entries.
675    ///
676    /// # Arguments
677    /// * `max_entries` - Maximum number of entries to keep
678    ///
679    /// # Returns
680    /// Number of entries pruned
681    pub fn prune_lru(&self, max_entries: usize) -> Result<usize> {
682        let current_count = self.len();
683        if current_count <= max_entries {
684            return Ok(0);
685        }
686
687        // Get all CIDs with their update timestamps
688        let mut entries: Vec<(Cid, u64)> = self
689            .metadata_store
690            .cids()
691            .into_iter()
692            .filter_map(|cid| self.metadata_store.get(&cid).map(|m| (cid, m.updated_at)))
693            .collect();
694
695        // Sort by update time (least recent first)
696        entries.sort_by_key(|(_, ts)| *ts);
697
698        // Calculate how many to remove
699        let to_remove = current_count - max_entries;
700
701        // Remove the least recently updated entries
702        for (cid, _) in entries.iter().take(to_remove) {
703            let _ = self.vector_index.write().unwrap().delete(cid);
704            let _ = self.metadata_store.remove(cid);
705        }
706
707        // Clear cache
708        self.cache.write().unwrap().clear();
709
710        Ok(to_remove)
711    }
712
713    /// Get pruning statistics
714    pub fn pruning_stats(&self) -> PruningStats {
715        let now = std::time::SystemTime::now()
716            .duration_since(std::time::UNIX_EPOCH)
717            .unwrap_or_default()
718            .as_secs();
719
720        let entries: Vec<(u64, u64)> = self
721            .metadata_store
722            .cids()
723            .into_iter()
724            .filter_map(|cid| {
725                self.metadata_store
726                    .get(&cid)
727                    .map(|m| (m.created_at, m.updated_at))
728            })
729            .collect();
730
731        if entries.is_empty() {
732            return PruningStats::default();
733        }
734
735        let oldest_created = entries.iter().map(|(c, _)| *c).min().unwrap_or(now);
736        let newest_created = entries.iter().map(|(c, _)| *c).max().unwrap_or(now);
737        let oldest_updated = entries.iter().map(|(_, u)| *u).min().unwrap_or(now);
738
739        let age_1day = entries.iter().filter(|(c, _)| now - *c < 86400).count();
740        let age_7days = entries.iter().filter(|(c, _)| now - *c < 86400 * 7).count();
741        let age_30days = entries
742            .iter()
743            .filter(|(c, _)| now - *c < 86400 * 30)
744            .count();
745
746        PruningStats {
747            total_entries: entries.len(),
748            oldest_entry_age: now.saturating_sub(oldest_created),
749            newest_entry_age: now.saturating_sub(newest_created),
750            oldest_update_age: now.saturating_sub(oldest_updated),
751            entries_last_day: age_1day,
752            entries_last_week: age_7days,
753            entries_last_month: age_30days,
754        }
755    }
756}
757
758/// Pruning statistics
759#[derive(Debug, Clone, Default, Serialize, Deserialize)]
760pub struct PruningStats {
761    /// Total number of entries
762    pub total_entries: usize,
763    /// Age of the oldest entry in seconds
764    pub oldest_entry_age: u64,
765    /// Age of the newest entry in seconds
766    pub newest_entry_age: u64,
767    /// Age of the least recently updated entry in seconds
768    pub oldest_update_age: u64,
769    /// Number of entries created in the last day
770    pub entries_last_day: usize,
771    /// Number of entries created in the last week
772    pub entries_last_week: usize,
773    /// Number of entries created in the last month
774    pub entries_last_month: usize,
775}
776
777impl PruningStats {
778    /// Get a summary string
779    pub fn summary(&self) -> String {
780        format!(
781            "Total: {}, Last day: {}, Last week: {}, Last month: {}, Oldest: {}s ago",
782            self.total_entries,
783            self.entries_last_day,
784            self.entries_last_week,
785            self.entries_last_month,
786            self.oldest_entry_age
787        )
788    }
789
790    /// Estimate entries that would be pruned for a given TTL
791    pub fn would_prune_for_ttl(&self, ttl_seconds: u64) -> usize {
792        // Approximate based on time buckets
793        if ttl_seconds < 86400 {
794            self.total_entries - self.entries_last_day
795        } else if ttl_seconds < 86400 * 7 {
796            self.total_entries - self.entries_last_week
797        } else if ttl_seconds < 86400 * 30 {
798            self.total_entries - self.entries_last_month
799        } else {
800            0
801        }
802    }
803}
804
#[cfg(test)]
mod tests {
    use super::*;
    use crate::metadata::MetadataValue;

    /// Returns one of three fixed, valid CIDs, selected by `n` modulo three.
    fn test_cid(n: u8) -> Cid {
        const CIDS: [&str; 3] = [
            "bafybeigdyrzt5sfp7udm7hu76uh7y26nf3efuylqabf3oclgtqy55fbzdi",
            "bafybeiczsscdsbs7ffqz55asqdf3smv6klcw3gofszvwlyarci47bgf354",
            "bafybeibvfkifsqbapirjrj7zbfwddz5qz5awvbftjgktpcqcxjkzstszlm",
        ];
        CIDS[usize::from(n) % CIDS.len()].parse().unwrap()
    }

    /// Builds an empty index over 4-dimensional vectors with otherwise
    /// default settings.
    fn index_4d() -> HybridIndex {
        HybridIndex::new(HybridConfig {
            dimension: 4,
            ..Default::default()
        })
        .unwrap()
    }

    #[tokio::test]
    async fn test_hybrid_index_basic() {
        let index = index_4d();

        let cid = test_cid(0);
        let meta = Metadata::new().with_string("type", "image");
        index
            .insert(&cid, &[1.0, 0.0, 0.0, 0.0], Some(meta))
            .unwrap();

        assert_eq!(index.len(), 1);
        assert!(index.contains(&cid));
    }

    #[tokio::test]
    async fn test_hybrid_search() {
        let index = index_4d();

        // Three vectors with distinct metadata; the first is the exact match
        // for the query below.
        let cid1 = test_cid(0);
        let entries = [
            (cid1, vec![1.0, 0.0, 0.0, 0.0], "image", 1024),
            (test_cid(1), vec![0.9, 0.1, 0.0, 0.0], "document", 2048),
            (test_cid(2), vec![0.0, 1.0, 0.0, 0.0], "audio", 512),
        ];
        for (cid, vector, kind, size) in &entries {
            let meta = Metadata::new()
                .with_string("type", kind)
                .with_integer("size", *size);
            index.insert(cid, vector, Some(meta)).unwrap();
        }

        // Plain k-NN search with an explicit ef_search.
        let query = HybridQuery::knn(vec![1.0, 0.0, 0.0, 0.0], 2).with_ef_search(50);
        let response = index.search(query).await.unwrap();

        let found = response.results.len();
        assert!(
            !response.results.is_empty(),
            "Expected at least 1 result, got {}",
            found
        );
        // k = 2 caps the result count; at least one hit is required.
        assert!((1..=2).contains(&found), "Expected 1-2 results, got {}", found);
        // The exact match must rank first.
        assert_eq!(response.results[0].cid, cid1);
    }

    #[tokio::test]
    async fn test_filtered_search() {
        let index = index_4d();

        let tech_cid = test_cid(0);
        let science_cid = test_cid(1);

        index
            .insert(
                &tech_cid,
                &[1.0, 0.0, 0.0, 0.0],
                Some(Metadata::new().with_string("category", "tech")),
            )
            .unwrap();
        index
            .insert(
                &science_cid,
                &[0.9, 0.1, 0.0, 0.0],
                Some(Metadata::new().with_string("category", "science")),
            )
            .unwrap();

        // The query vector equals the "science" vector, but the filter only
        // admits "tech".
        let only_tech = MetadataFilter::eq("category", MetadataValue::String("tech".to_string()));
        let query = HybridQuery::knn(vec![0.9, 0.1, 0.0, 0.0], 10).with_filter(only_tech);
        let response = index.search(query).await.unwrap();

        assert_eq!(response.results.len(), 1);
        assert_eq!(response.results[0].cid, tech_cid);
    }

    #[tokio::test]
    async fn test_search_with_metadata() {
        let index = index_4d();

        let cid = test_cid(0);
        let meta = Metadata::new().with_string("title", "Test Document");
        index
            .insert(&cid, &[1.0, 0.0, 0.0, 0.0], Some(meta))
            .unwrap();

        let query = HybridQuery::knn(vec![1.0, 0.0, 0.0, 0.0], 1).with_metadata();
        let response = index.search(query).await.unwrap();

        assert_eq!(response.results.len(), 1);
        assert!(response.results[0].metadata.is_some());

        let returned = response.results[0].metadata.as_ref().unwrap();
        assert_eq!(
            returned.get("title"),
            Some(&MetadataValue::String("Test Document".to_string()))
        );
    }

    #[test]
    fn test_health_and_stats() {
        let index = index_4d();

        assert_eq!(index.health().size, 0);
        assert_eq!(index.stats().search_count, 0);
    }

    #[test]
    fn test_pruning_to_max_entries() {
        let index = index_4d();

        // Three entries, one per available test CID.
        for i in 0u8..3 {
            let meta = Metadata::new().with_integer("order", i64::from(i));
            index
                .insert(&test_cid(i), &[f32::from(i), 0.0, 0.0, 0.0], Some(meta))
                .unwrap();
        }
        assert_eq!(index.len(), 3);

        // Keeping two entries must remove exactly one.
        let pruned = index.prune_to_max_entries(2).unwrap();
        assert_eq!(pruned, 1);
        assert_eq!(index.len(), 2);
    }

    #[test]
    fn test_pruning_stats() {
        let index = index_4d();

        // Entries without explicit metadata still get a metadata record.
        for i in 0u8..3 {
            index
                .insert(&test_cid(i), &[f32::from(i), 0.0, 0.0, 0.0], None)
                .unwrap();
        }

        let stats = index.pruning_stats();
        assert_eq!(stats.total_entries, 3);
        // Everything was created just now, so it falls into every window.
        assert_eq!(stats.entries_last_day, 3);
        assert_eq!(stats.entries_last_week, 3);
    }
}