Skip to main content

allsource_core/infrastructure/search/
hybrid_search_engine.rs

1//! HybridSearchEngine - Combined semantic and keyword search orchestrator
2//!
3//! This module provides:
4//! - Unified search interface combining VectorSearchEngine (semantic) and KeywordSearchEngine (keyword)
5//! - Score combination and re-ranking using Reciprocal Rank Fusion (RRF)
6//! - Metadata filtering (event_type, entity_id, time range)
7//! - Configurable search modes (semantic-only, keyword-only, hybrid)
8
9use crate::error::Result;
10use chrono::{DateTime, Utc};
11use serde::{Deserialize, Serialize};
12use std::{collections::HashMap, sync::Arc};
13use uuid::Uuid;
14
15use super::{
16    KeywordQuery, KeywordSearchEngine, KeywordSearchResult, SimilarityQuery, SimilarityResult,
17    VectorSearchEngine,
18};
19
20/// Configuration for the HybridSearchEngine
21#[derive(Debug, Clone, Serialize, Deserialize)]
22pub struct HybridSearchEngineConfig {
23    /// Weight for semantic search scores (0.0 to 1.0)
24    pub semantic_weight: f32,
25    /// Weight for keyword search scores (0.0 to 1.0)
26    pub keyword_weight: f32,
27    /// RRF constant k (typically 60)
28    pub rrf_k: f32,
29    /// Default number of results to return
30    pub default_limit: usize,
31    /// Minimum score threshold for final results (0.0 to 1.0)
32    pub min_score_threshold: f32,
33    /// Over-fetch multiplier for filtering (fetch N * multiplier before filtering)
34    pub over_fetch_multiplier: usize,
35}
36
37impl Default for HybridSearchEngineConfig {
38    fn default() -> Self {
39        Self {
40            semantic_weight: 0.5,
41            keyword_weight: 0.5,
42            rrf_k: 60.0,
43            default_limit: 10,
44            min_score_threshold: 0.0,
45            over_fetch_multiplier: 3,
46        }
47    }
48}
49
50/// Search mode for the hybrid search
51#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
52pub enum SearchMode {
53    /// Use only semantic (vector) search
54    SemanticOnly,
55    /// Use only keyword (BM25) search
56    KeywordOnly,
57    /// Combine both semantic and keyword search (default)
58    #[default]
59    Hybrid,
60}
61
62/// Metadata filters for search queries
63#[derive(Debug, Clone, Default, Serialize, Deserialize)]
64pub struct MetadataFilters {
65    /// Filter by event type (exact match)
66    pub event_type: Option<String>,
67    /// Filter by entity ID (exact match)
68    pub entity_id: Option<String>,
69    /// Filter by start time (events on or after this time)
70    pub time_from: Option<DateTime<Utc>>,
71    /// Filter by end time (events on or before this time)
72    pub time_to: Option<DateTime<Utc>>,
73}
74
75impl MetadataFilters {
76    pub fn new() -> Self {
77        Self::default()
78    }
79
80    pub fn with_event_type(mut self, event_type: impl Into<String>) -> Self {
81        self.event_type = Some(event_type.into());
82        self
83    }
84
85    pub fn with_entity_id(mut self, entity_id: impl Into<String>) -> Self {
86        self.entity_id = Some(entity_id.into());
87        self
88    }
89
90    pub fn with_time_range(mut self, from: DateTime<Utc>, to: DateTime<Utc>) -> Self {
91        self.time_from = Some(from);
92        self.time_to = Some(to);
93        self
94    }
95
96    pub fn with_time_from(mut self, from: DateTime<Utc>) -> Self {
97        self.time_from = Some(from);
98        self
99    }
100
101    pub fn with_time_to(mut self, to: DateTime<Utc>) -> Self {
102        self.time_to = Some(to);
103        self
104    }
105
106    /// Check if any filters are set
107    pub fn has_filters(&self) -> bool {
108        self.event_type.is_some()
109            || self.entity_id.is_some()
110            || self.time_from.is_some()
111            || self.time_to.is_some()
112    }
113}
114
115/// Unified search query combining all search options
116#[derive(Debug, Clone, Serialize, Deserialize)]
117pub struct SearchQuery {
118    /// The search query string (natural language or keywords)
119    pub query: String,
120    /// Maximum number of results to return
121    pub limit: usize,
122    /// Optional tenant filter for multi-tenancy
123    pub tenant_id: Option<String>,
124    /// Search mode (semantic, keyword, or hybrid)
125    pub mode: SearchMode,
126    /// Metadata filters
127    pub filters: MetadataFilters,
128    /// Minimum similarity threshold for semantic search (0.0 to 1.0)
129    pub min_similarity: Option<f32>,
130}
131
132impl SearchQuery {
133    pub fn new(query: impl Into<String>) -> Self {
134        Self {
135            query: query.into(),
136            limit: 10,
137            tenant_id: None,
138            mode: SearchMode::Hybrid,
139            filters: MetadataFilters::default(),
140            min_similarity: None,
141        }
142    }
143
144    pub fn with_limit(mut self, limit: usize) -> Self {
145        self.limit = limit;
146        self
147    }
148
149    pub fn with_tenant(mut self, tenant_id: impl Into<String>) -> Self {
150        self.tenant_id = Some(tenant_id.into());
151        self
152    }
153
154    pub fn with_mode(mut self, mode: SearchMode) -> Self {
155        self.mode = mode;
156        self
157    }
158
159    pub fn with_filters(mut self, filters: MetadataFilters) -> Self {
160        self.filters = filters;
161        self
162    }
163
164    pub fn with_event_type(mut self, event_type: impl Into<String>) -> Self {
165        self.filters.event_type = Some(event_type.into());
166        self
167    }
168
169    pub fn with_entity_id(mut self, entity_id: impl Into<String>) -> Self {
170        self.filters.entity_id = Some(entity_id.into());
171        self
172    }
173
174    pub fn with_time_range(mut self, from: DateTime<Utc>, to: DateTime<Utc>) -> Self {
175        self.filters.time_from = Some(from);
176        self.filters.time_to = Some(to);
177        self
178    }
179
180    pub fn with_min_similarity(mut self, threshold: f32) -> Self {
181        self.min_similarity = Some(threshold);
182        self
183    }
184
185    /// Create a semantic-only search query
186    pub fn semantic(query: impl Into<String>) -> Self {
187        Self::new(query).with_mode(SearchMode::SemanticOnly)
188    }
189
190    /// Create a keyword-only search query
191    pub fn keyword(query: impl Into<String>) -> Self {
192        Self::new(query).with_mode(SearchMode::KeywordOnly)
193    }
194}
195
196/// Result of a hybrid search
197#[derive(Debug, Clone, Serialize, Deserialize)]
198pub struct HybridSearchResult {
199    /// Event ID of the matching event
200    pub event_id: Uuid,
201    /// Combined score (0.0 to 1.0)
202    pub score: f32,
203    /// Individual semantic similarity score (if available)
204    pub semantic_score: Option<f32>,
205    /// Individual keyword relevance score (if available)
206    pub keyword_score: Option<f32>,
207    /// Event type (from keyword search if available)
208    pub event_type: Option<String>,
209    /// Entity ID (from keyword search if available)
210    pub entity_id: Option<String>,
211    /// Source text snippet (from vector search if available)
212    pub source_text: Option<String>,
213}
214
215/// Event metadata for filtering (stored alongside indexed data)
216#[derive(Debug, Clone, Default)]
217pub struct EventMetadata {
218    pub event_type: Option<String>,
219    pub entity_id: Option<String>,
220    pub timestamp: Option<DateTime<Utc>>,
221}
222
223/// HybridSearchEngine - Orchestrates semantic and keyword search
224///
225/// Features:
226/// - Combines VectorSearchEngine (semantic) and KeywordSearchEngine (keyword)
227/// - Score fusion using Reciprocal Rank Fusion (RRF)
228/// - Metadata filtering for event_type, entity_id, and time range
229/// - Configurable search modes and weights
230pub struct HybridSearchEngine {
231    config: HybridSearchEngineConfig,
232    vector_engine: Arc<VectorSearchEngine>,
233    keyword_engine: Arc<KeywordSearchEngine>,
234    /// Event metadata cache for filtering
235    metadata_cache: parking_lot::RwLock<HashMap<Uuid, EventMetadata>>,
236    /// Statistics
237    stats: parking_lot::RwLock<EngineStats>,
238}
239
/// Counters incremented once per call to `HybridSearchEngine::search`.
#[derive(Debug, Default, Clone)]
struct EngineStats {
    /// Every search, regardless of mode.
    total_searches: u64,
    /// Searches issued with `SearchMode::SemanticOnly`.
    semantic_searches: u64,
    /// Searches issued with `SearchMode::KeywordOnly`.
    keyword_searches: u64,
    /// Searches issued with `SearchMode::Hybrid`.
    hybrid_searches: u64,
}
247
248impl HybridSearchEngine {
249    /// Create a new HybridSearchEngine with existing engines
250    pub fn new(
251        vector_engine: Arc<VectorSearchEngine>,
252        keyword_engine: Arc<KeywordSearchEngine>,
253    ) -> Self {
254        Self::with_config(
255            vector_engine,
256            keyword_engine,
257            HybridSearchEngineConfig::default(),
258        )
259    }
260
261    /// Create a new HybridSearchEngine with custom configuration
262    pub fn with_config(
263        vector_engine: Arc<VectorSearchEngine>,
264        keyword_engine: Arc<KeywordSearchEngine>,
265        config: HybridSearchEngineConfig,
266    ) -> Self {
267        Self {
268            config,
269            vector_engine,
270            keyword_engine,
271            metadata_cache: parking_lot::RwLock::new(HashMap::new()),
272            stats: parking_lot::RwLock::new(EngineStats::default()),
273        }
274    }
275
276    /// Get the engine configuration
277    pub fn config(&self) -> &HybridSearchEngineConfig {
278        &self.config
279    }
280
281    /// Store metadata for an event (for post-search filtering)
282    pub fn store_metadata(&self, event_id: Uuid, metadata: EventMetadata) {
283        let mut cache = self.metadata_cache.write();
284        cache.insert(event_id, metadata);
285    }
286
287    /// Index an event in both engines
288    ///
289    /// This is a convenience method that indexes the event in both
290    /// the vector and keyword engines, and stores metadata for filtering.
291    #[cfg(any(feature = "vector-search", feature = "keyword-search"))]
292    pub async fn index_event(
293        &self,
294        event_id: Uuid,
295        tenant_id: &str,
296        event_type: &str,
297        entity_id: Option<&str>,
298        payload: &serde_json::Value,
299        timestamp: DateTime<Utc>,
300    ) -> Result<()> {
301        // Store metadata for filtering
302        self.store_metadata(
303            event_id,
304            EventMetadata {
305                event_type: Some(event_type.to_string()),
306                entity_id: entity_id.map(std::string::ToString::to_string),
307                timestamp: Some(timestamp),
308            },
309        );
310
311        // Index in keyword engine
312        #[cfg(feature = "keyword-search")]
313        self.keyword_engine
314            .index_event(event_id, tenant_id, event_type, entity_id, payload)?;
315
316        // Index in vector engine (requires embedding generation)
317        #[cfg(feature = "vector-search")]
318        {
319            let embedding = self.vector_engine.embed_event(payload)?;
320            let source_text = extract_source_text(payload);
321            self.vector_engine
322                .index_event(event_id, tenant_id, embedding, source_text)
323                .await?;
324        }
325
326        Ok(())
327    }
328
329    /// Stub for when features are not enabled
330    #[cfg(not(any(feature = "vector-search", feature = "keyword-search")))]
331    pub async fn index_event(
332        &self,
333        event_id: Uuid,
334        _tenant_id: &str,
335        event_type: &str,
336        entity_id: Option<&str>,
337        _payload: &serde_json::Value,
338        timestamp: DateTime<Utc>,
339    ) -> Result<()> {
340        // Store metadata for filtering
341        self.store_metadata(
342            event_id,
343            EventMetadata {
344                event_type: Some(event_type.to_string()),
345                entity_id: entity_id.map(|s| s.to_string()),
346                timestamp: Some(timestamp),
347            },
348        );
349        Ok(())
350    }
351
352    /// Commit changes to both engines
353    #[cfg(feature = "keyword-search")]
354    pub fn commit(&self) -> Result<()> {
355        self.keyword_engine.commit()
356    }
357
358    #[cfg(not(feature = "keyword-search"))]
359    pub fn commit(&self) -> Result<()> {
360        Ok(())
361    }
362
363    /// Perform a search using the configured mode
364    ///
365    /// This is the main entry point for searching. It:
366    /// 1. Runs semantic search (if natural language query and mode allows)
367    /// 2. Runs keyword search (if keywords present and mode allows)
368    /// 3. Combines and re-ranks scores using RRF
369    /// 4. Applies metadata filters
370    /// 5. Returns top-k results
371    pub fn search(&self, query: &SearchQuery) -> Result<Vec<HybridSearchResult>> {
372        // Update stats
373        {
374            let mut stats = self.stats.write();
375            stats.total_searches += 1;
376            match query.mode {
377                SearchMode::SemanticOnly => stats.semantic_searches += 1,
378                SearchMode::KeywordOnly => stats.keyword_searches += 1,
379                SearchMode::Hybrid => stats.hybrid_searches += 1,
380            }
381        }
382
383        // Calculate how many results to fetch (over-fetch for filtering)
384        let fetch_limit = if query.filters.has_filters() {
385            query.limit * self.config.over_fetch_multiplier
386        } else {
387            query.limit
388        };
389
390        match query.mode {
391            SearchMode::SemanticOnly => self.search_semantic_only(query, fetch_limit),
392            SearchMode::KeywordOnly => self.search_keyword_only(query, fetch_limit),
393            SearchMode::Hybrid => self.search_hybrid(query, fetch_limit),
394        }
395    }
396
397    /// Search using only semantic (vector) search
398    fn search_semantic_only(
399        &self,
400        query: &SearchQuery,
401        fetch_limit: usize,
402    ) -> Result<Vec<HybridSearchResult>> {
403        let semantic_results = self.run_semantic_search(query, fetch_limit)?;
404
405        let mut results: Vec<HybridSearchResult> = semantic_results
406            .into_iter()
407            .map(|r| HybridSearchResult {
408                event_id: r.event_id,
409                score: r.score,
410                semantic_score: Some(r.score),
411                keyword_score: None,
412                event_type: self.get_cached_event_type(r.event_id),
413                entity_id: self.get_cached_entity_id(r.event_id),
414                source_text: r.source_text,
415            })
416            .collect();
417
418        // Apply metadata filters
419        self.apply_filters(&mut results, &query.filters);
420
421        // Apply minimum score threshold
422        results.retain(|r| r.score >= self.config.min_score_threshold);
423
424        // Truncate to requested limit
425        results.truncate(query.limit);
426
427        Ok(results)
428    }
429
430    /// Search using only keyword (BM25) search
431    fn search_keyword_only(
432        &self,
433        query: &SearchQuery,
434        fetch_limit: usize,
435    ) -> Result<Vec<HybridSearchResult>> {
436        let keyword_results = self.run_keyword_search(query, fetch_limit)?;
437
438        // Normalize keyword scores to 0-1 range
439        let max_score = keyword_results
440            .iter()
441            .map(|r| r.score)
442            .fold(0.0_f32, f32::max);
443
444        let mut results: Vec<HybridSearchResult> = keyword_results
445            .into_iter()
446            .map(|r| {
447                let normalized_score = if max_score > 0.0 {
448                    r.score / max_score
449                } else {
450                    0.0
451                };
452                HybridSearchResult {
453                    event_id: r.event_id,
454                    score: normalized_score,
455                    semantic_score: None,
456                    keyword_score: Some(r.score),
457                    event_type: Some(r.event_type),
458                    entity_id: r.entity_id,
459                    source_text: None,
460                }
461            })
462            .collect();
463
464        // Apply metadata filters
465        self.apply_filters(&mut results, &query.filters);
466
467        // Apply minimum score threshold
468        results.retain(|r| r.score >= self.config.min_score_threshold);
469
470        // Truncate to requested limit
471        results.truncate(query.limit);
472
473        Ok(results)
474    }
475
476    /// Search using hybrid (combined) search with RRF score fusion
477    fn search_hybrid(
478        &self,
479        query: &SearchQuery,
480        fetch_limit: usize,
481    ) -> Result<Vec<HybridSearchResult>> {
482        // Run both searches
483        let semantic_results = self.run_semantic_search(query, fetch_limit)?;
484        let keyword_results = self.run_keyword_search(query, fetch_limit)?;
485
486        // Build score maps with ranks
487        let mut combined_scores: HashMap<Uuid, HybridSearchResult> = HashMap::new();
488
489        // Process semantic results
490        for (rank, result) in semantic_results.into_iter().enumerate() {
491            let rrf_score = 1.0 / (self.config.rrf_k + rank as f32 + 1.0);
492            let weighted_score = rrf_score * self.config.semantic_weight;
493
494            combined_scores.insert(
495                result.event_id,
496                HybridSearchResult {
497                    event_id: result.event_id,
498                    score: weighted_score,
499                    semantic_score: Some(result.score),
500                    keyword_score: None,
501                    event_type: self.get_cached_event_type(result.event_id),
502                    entity_id: self.get_cached_entity_id(result.event_id),
503                    source_text: result.source_text,
504                },
505            );
506        }
507
508        // Process keyword results and combine with semantic
509        for (rank, result) in keyword_results.into_iter().enumerate() {
510            let rrf_score = 1.0 / (self.config.rrf_k + rank as f32 + 1.0);
511            let weighted_score = rrf_score * self.config.keyword_weight;
512
513            if let Some(existing) = combined_scores.get_mut(&result.event_id) {
514                // Combine scores
515                existing.score += weighted_score;
516                existing.keyword_score = Some(result.score);
517                // Prefer keyword engine's event_type and entity_id as they're stored
518                existing.event_type = Some(result.event_type);
519                existing.entity_id = result.entity_id;
520            } else {
521                combined_scores.insert(
522                    result.event_id,
523                    HybridSearchResult {
524                        event_id: result.event_id,
525                        score: weighted_score,
526                        semantic_score: None,
527                        keyword_score: Some(result.score),
528                        event_type: Some(result.event_type),
529                        entity_id: result.entity_id,
530                        source_text: None,
531                    },
532                );
533            }
534        }
535
536        // Convert to sorted vector
537        let mut results: Vec<HybridSearchResult> = combined_scores.into_values().collect();
538        results.sort_by(|a, b| {
539            b.score
540                .partial_cmp(&a.score)
541                .unwrap_or(std::cmp::Ordering::Equal)
542        });
543
544        // Apply metadata filters
545        self.apply_filters(&mut results, &query.filters);
546
547        // Apply minimum score threshold
548        results.retain(|r| r.score >= self.config.min_score_threshold);
549
550        // Truncate to requested limit
551        results.truncate(query.limit);
552
553        Ok(results)
554    }
555
556    /// Run semantic search
557    fn run_semantic_search(
558        &self,
559        query: &SearchQuery,
560        limit: usize,
561    ) -> Result<Vec<SimilarityResult>> {
562        // Generate query embedding
563        let query_embedding = self.vector_engine.embed_text(&query.query)?;
564
565        let similarity_query = SimilarityQuery::new(query_embedding, limit)
566            .with_min_similarity(query.min_similarity.unwrap_or(0.0));
567
568        let similarity_query = if let Some(ref tenant_id) = query.tenant_id {
569            similarity_query.with_tenant(tenant_id.clone())
570        } else {
571            similarity_query
572        };
573
574        self.vector_engine.search_similar(&similarity_query)
575    }
576
577    /// Run keyword search
578    fn run_keyword_search(
579        &self,
580        query: &SearchQuery,
581        limit: usize,
582    ) -> Result<Vec<KeywordSearchResult>> {
583        let keyword_query = KeywordQuery::new(&query.query).with_limit(limit);
584
585        let keyword_query = if let Some(ref tenant_id) = query.tenant_id {
586            keyword_query.with_tenant(tenant_id)
587        } else {
588            keyword_query
589        };
590
591        self.keyword_engine.search_keywords(&keyword_query)
592    }
593
594    /// Apply metadata filters to results
595    fn apply_filters(&self, results: &mut Vec<HybridSearchResult>, filters: &MetadataFilters) {
596        if !filters.has_filters() {
597            return;
598        }
599
600        let metadata_cache = self.metadata_cache.read();
601
602        results.retain(|result| {
603            // Get cached metadata
604            let cached = metadata_cache.get(&result.event_id);
605
606            // Filter by event_type
607            if let Some(ref filter_type) = filters.event_type {
608                let event_type = result
609                    .event_type
610                    .as_ref()
611                    .or_else(|| cached.and_then(|m| m.event_type.as_ref()));
612                match event_type {
613                    Some(t) if t == filter_type => {}
614                    _ => return false,
615                }
616            }
617
618            // Filter by entity_id
619            if let Some(ref filter_entity) = filters.entity_id {
620                let entity_id = result
621                    .entity_id
622                    .as_ref()
623                    .or_else(|| cached.and_then(|m| m.entity_id.as_ref()));
624                match entity_id {
625                    Some(e) if e == filter_entity => {}
626                    _ => return false,
627                }
628            }
629
630            // Filter by time range
631            if filters.time_from.is_some() || filters.time_to.is_some() {
632                if let Some(timestamp) = cached.and_then(|m| m.timestamp) {
633                    if let Some(from) = filters.time_from
634                        && timestamp < from
635                    {
636                        return false;
637                    }
638                    if let Some(to) = filters.time_to
639                        && timestamp > to
640                    {
641                        return false;
642                    }
643                } else {
644                    // No timestamp available, exclude if time filter is required
645                    return false;
646                }
647            }
648
649            true
650        });
651    }
652
653    /// Get cached event type for an event
654    fn get_cached_event_type(&self, event_id: Uuid) -> Option<String> {
655        self.metadata_cache
656            .read()
657            .get(&event_id)
658            .and_then(|m| m.event_type.clone())
659    }
660
661    /// Get cached entity ID for an event
662    fn get_cached_entity_id(&self, event_id: Uuid) -> Option<String> {
663        self.metadata_cache
664            .read()
665            .get(&event_id)
666            .and_then(|m| m.entity_id.clone())
667    }
668
669    /// Get engine statistics
670    pub fn stats(&self) -> (u64, u64, u64, u64) {
671        let stats = self.stats.read();
672        (
673            stats.total_searches,
674            stats.semantic_searches,
675            stats.keyword_searches,
676            stats.hybrid_searches,
677        )
678    }
679
680    /// Health check for both engines
681    pub fn health_check(&self) -> Result<()> {
682        self.vector_engine.health_check()?;
683        self.keyword_engine.health_check()?;
684        Ok(())
685    }
686
687    /// Get the number of events in the metadata cache
688    pub fn cached_metadata_count(&self) -> usize {
689        self.metadata_cache.read().len()
690    }
691
692    /// Clear the metadata cache
693    pub fn clear_metadata_cache(&self) {
694        self.metadata_cache.write().clear();
695    }
696}
697
/// Picks the text stored alongside an event's embedding in the vector search engine.
///
/// For an object payload, the first non-empty string among a fixed list of well-known keys
/// is used. Otherwise (or if none matches) the whole payload is serialized to a string, so
/// the result is always `Some`.
#[cfg(feature = "vector-search")]
fn extract_source_text(payload: &serde_json::Value) -> Option<String> {
    // Checked in this order; the first non-empty string wins.
    const PRIORITY_FIELDS: [&str; 7] = [
        "content",
        "text",
        "body",
        "message",
        "description",
        "title",
        "name",
    ];

    let preferred = payload.as_object().and_then(|fields| {
        PRIORITY_FIELDS.iter().find_map(|key| {
            fields
                .get(*key)
                .and_then(serde_json::Value::as_str)
                .filter(|text| !text.is_empty())
        })
    });

    Some(match preferred {
        Some(text) => text.to_owned(),
        // Fallback: serialize the entire payload.
        None => payload.to_string(),
    })
}
724
#[cfg(test)]
mod tests {
    use super::*;

    /// Explicitly spelled-out configuration with balanced weights.
    fn create_test_config() -> HybridSearchEngineConfig {
        HybridSearchEngineConfig {
            over_fetch_multiplier: 3,
            min_score_threshold: 0.0,
            default_limit: 10,
            rrf_k: 60.0,
            keyword_weight: 0.5,
            semantic_weight: 0.5,
        }
    }

    /// The default configuration exposes the documented weights, RRF constant and limit.
    #[test]
    fn test_config_default() {
        let HybridSearchEngineConfig {
            semantic_weight,
            keyword_weight,
            rrf_k,
            default_limit,
            ..
        } = HybridSearchEngineConfig::default();

        assert_eq!(semantic_weight, 0.5);
        assert_eq!(keyword_weight, 0.5);
        assert_eq!(rrf_k, 60.0);
        assert_eq!(default_limit, 10);
    }

    /// Every builder method lands in the expected field.
    #[test]
    fn test_search_query_builder() {
        let built = SearchQuery::new("test query")
            .with_limit(20)
            .with_tenant("tenant-1")
            .with_mode(SearchMode::Hybrid)
            .with_event_type("UserCreated")
            .with_entity_id("user-123")
            .with_min_similarity(0.7);

        assert_eq!(built.query, "test query");
        assert_eq!(built.limit, 20);
        assert_eq!(built.tenant_id.as_deref(), Some("tenant-1"));
        assert_eq!(built.mode, SearchMode::Hybrid);
        assert_eq!(built.filters.event_type.as_deref(), Some("UserCreated"));
        assert_eq!(built.filters.entity_id.as_deref(), Some("user-123"));
        assert_eq!(built.min_similarity, Some(0.7));
    }

    /// `SearchQuery::semantic` selects the semantic-only mode.
    #[test]
    fn test_search_query_semantic_constructor() {
        let mode = SearchQuery::semantic("natural language query").mode;
        assert_eq!(mode, SearchMode::SemanticOnly);
    }

    /// `SearchQuery::keyword` selects the keyword-only mode.
    #[test]
    fn test_search_query_keyword_constructor() {
        let mode = SearchQuery::keyword("keyword search").mode;
        assert_eq!(mode, SearchMode::KeywordOnly);
    }

    /// The filter builder sets every field and reports that filters are active.
    #[test]
    fn test_metadata_filters_builder() {
        let upper = Utc::now();
        let lower = upper - chrono::Duration::hours(1);

        let filters = MetadataFilters::new()
            .with_event_type("OrderPlaced")
            .with_entity_id("order-456")
            .with_time_range(lower, upper);

        assert_eq!(filters.event_type.as_deref(), Some("OrderPlaced"));
        assert_eq!(filters.entity_id.as_deref(), Some("order-456"));
        assert!(filters.time_from.is_some());
        assert!(filters.time_to.is_some());
        assert!(filters.has_filters());
    }

    /// `has_filters` is false for an empty set and true as soon as any one field is set.
    #[test]
    fn test_metadata_filters_has_filters() {
        assert!(!MetadataFilters::new().has_filters());

        assert!(MetadataFilters::new().with_event_type("Test").has_filters());
        assert!(MetadataFilters::new().with_entity_id("e1").has_filters());
        assert!(
            MetadataFilters::new()
                .with_time_from(Utc::now())
                .has_filters()
        );
    }

    /// Hybrid is the default search mode.
    #[test]
    fn test_search_mode_default() {
        assert_eq!(SearchMode::default(), SearchMode::Hybrid);
    }

    /// A fully populated result exposes its score and both component scores.
    #[test]
    fn test_hybrid_search_result_structure() {
        let hit = HybridSearchResult {
            event_id: Uuid::new_v4(),
            score: 0.85,
            semantic_score: Some(0.9),
            keyword_score: Some(0.8),
            event_type: Some(String::from("UserCreated")),
            entity_id: Some(String::from("user-123")),
            source_text: Some(String::from("Test content")),
        };

        assert!(hit.score > 0.0);
        assert!(hit.semantic_score.is_some());
        assert!(hit.keyword_score.is_some());
    }

    /// Default metadata carries no information at all.
    #[test]
    fn test_event_metadata_default() {
        let EventMetadata {
            event_type,
            entity_id,
            timestamp,
        } = EventMetadata::default();

        assert!(event_type.is_none());
        assert!(entity_id.is_none());
        assert!(timestamp.is_none());
    }

    /// Sanity check of the RRF formula `1 / (k + rank + 1)` for the first two ranks.
    #[test]
    fn test_rrf_score_calculation() {
        let k = 60.0_f32;
        let rrf = |rank: f32| 1.0 / (k + rank + 1.0);

        // First result (rank 0).
        let first = rrf(0.0);
        assert!((first - 0.01639344).abs() < 0.0001);

        // Second result (rank 1).
        let second = rrf(1.0);
        assert!((second - 0.01612903).abs() < 0.0001);

        // A better rank must score higher.
        assert!(first > second);
    }

    /// The test configuration's weights add up to one, i.e. the blend is balanced.
    #[test]
    fn test_config_weights_validation() {
        let HybridSearchEngineConfig {
            semantic_weight,
            keyword_weight,
            ..
        } = create_test_config();

        let deviation = (semantic_weight + keyword_weight - 1.0).abs();
        assert!(deviation < f32::EPSILON);
    }

    /// `with_time_range` on the query sets both filter bounds.
    #[test]
    fn test_search_query_with_time_range() {
        let upper = Utc::now();
        let lower = upper - chrono::Duration::days(7);

        let filters = SearchQuery::new("test")
            .with_time_range(lower, upper)
            .filters;

        assert_eq!(filters.time_from, Some(lower));
        assert_eq!(filters.time_to, Some(upper));
    }

    /// A query survives a JSON round trip with its text and limit intact.
    #[test]
    fn test_search_query_serialization() {
        let original = SearchQuery::new("test query")
            .with_limit(5)
            .with_tenant("tenant-1")
            .with_mode(SearchMode::Hybrid);

        let json = serde_json::to_string(&original).expect("SearchQuery should serialize");
        let restored: SearchQuery =
            serde_json::from_str(&json).expect("SearchQuery should deserialize");

        assert_eq!(restored.query, "test query");
        assert_eq!(restored.limit, 5);
    }

    /// A result with missing optional fields still serializes to JSON.
    #[test]
    fn test_hybrid_search_result_serialization() {
        let hit = HybridSearchResult {
            event_id: Uuid::new_v4(),
            score: 0.85,
            semantic_score: Some(0.9),
            keyword_score: Some(0.8),
            event_type: Some(String::from("Test")),
            entity_id: None,
            source_text: None,
        };

        assert!(serde_json::to_string(&hit).is_ok());
    }
}
916
/// Integration tests that drive `HybridSearchEngine` with real vector and keyword engines.
///
/// Compiled only for test builds that enable both the `vector-search` and
/// `keyword-search` features.
#[cfg(all(test, feature = "vector-search", feature = "keyword-search"))]
mod integration_tests {
    use super::*;
    use crate::infrastructure::search::{KeywordSearchEngineConfig, VectorSearchEngineConfig};

    /// Builds a fresh `(vector, keyword)` engine pair for a single test.
    ///
    /// The vector engine uses a default similarity threshold of `0.0` and the keyword
    /// engine has `auto_commit` enabled; every other setting is the config default.
    fn create_test_engines() -> (Arc<VectorSearchEngine>, Arc<KeywordSearchEngine>) {
        let vector_config = VectorSearchEngineConfig {
            default_similarity_threshold: 0.0,
            ..Default::default()
        };
        let keyword_config = KeywordSearchEngineConfig {
            auto_commit: true,
            ..Default::default()
        };

        let vectors = VectorSearchEngine::with_config(vector_config).unwrap();
        let keywords = KeywordSearchEngine::with_config(keyword_config).unwrap();

        (Arc::new(vectors), Arc::new(keywords))
    }

    /// Indexes two unrelated events and expects the one mentioning "Alice" to rank
    /// first for the query "Alice user".
    #[tokio::test]
    async fn test_hybrid_search_integration() {
        let (vectors, keywords) = create_test_engines();
        let engine = HybridSearchEngine::new(vectors, keywords);

        let alice_id = Uuid::new_v4();
        let order_id = Uuid::new_v4();

        // Index events
        let alice_payload = serde_json::json!({"name": "Alice", "email": "alice@example.com"});
        engine
            .index_event(
                alice_id,
                "tenant-1",
                "UserCreated",
                Some("user-123"),
                &alice_payload,
                Utc::now(),
            )
            .await
            .unwrap();

        let order_payload = serde_json::json!({"product": "Widget", "quantity": 5});
        engine
            .index_event(
                order_id,
                "tenant-1",
                "OrderPlaced",
                Some("order-456"),
                &order_payload,
                Utc::now(),
            )
            .await
            .unwrap();

        // Hybrid search
        let query = SearchQuery::new("Alice user").with_tenant("tenant-1");
        let hits = engine.search(&query).unwrap();

        assert!(!hits.is_empty());
        // Alice result should be first
        assert_eq!(hits[0].event_id, alice_id);
    }

    /// Indexes one `UserCreated` and one `OrderPlaced` event and expects an
    /// event-type filter on `UserCreated` to return exactly one matching hit.
    #[tokio::test]
    async fn test_search_with_event_type_filter() {
        let (vectors, keywords) = create_test_engines();
        let engine = HybridSearchEngine::new(vectors, keywords);

        let user_event_id = Uuid::new_v4();
        let order_event_id = Uuid::new_v4();

        let user_payload = serde_json::json!({"data": "test user"});
        engine
            .index_event(
                user_event_id,
                "tenant-1",
                "UserCreated",
                None,
                &user_payload,
                Utc::now(),
            )
            .await
            .unwrap();

        let order_payload = serde_json::json!({"data": "test order"});
        engine
            .index_event(
                order_event_id,
                "tenant-1",
                "OrderPlaced",
                None,
                &order_payload,
                Utc::now(),
            )
            .await
            .unwrap();

        // Search with event_type filter
        let query = SearchQuery::new("test")
            .with_tenant("tenant-1")
            .with_event_type("UserCreated");
        let hits = engine.search(&query).unwrap();

        assert_eq!(hits.len(), 1);
        assert_eq!(hits[0].event_type.as_deref(), Some("UserCreated"));
    }

    /// Indexes one event timestamped two hours ago and one thirty minutes ago, then
    /// expects a one-hour time-range filter to return only the more recent event.
    #[tokio::test]
    async fn test_search_with_time_range_filter() {
        let (vectors, keywords) = create_test_engines();
        let engine = HybridSearchEngine::new(vectors, keywords);

        // All timestamps are derived from a single `now` so the window is consistent.
        let now = Utc::now();
        let two_hours_ago = now - chrono::Duration::hours(2);
        let thirty_minutes_ago = now - chrono::Duration::minutes(30);
        let one_hour_ago = now - chrono::Duration::hours(1);

        let old_id = Uuid::new_v4();
        let recent_id = Uuid::new_v4();

        // Old event
        let old_payload = serde_json::json!({"data": "old event"});
        engine
            .index_event(
                old_id,
                "tenant-1",
                "Event",
                None,
                &old_payload,
                two_hours_ago,
            )
            .await
            .unwrap();

        // Recent event
        let recent_payload = serde_json::json!({"data": "recent event"});
        engine
            .index_event(
                recent_id,
                "tenant-1",
                "Event",
                None,
                &recent_payload,
                thirty_minutes_ago,
            )
            .await
            .unwrap();

        // Search for events in the last hour
        let query = SearchQuery::new("event")
            .with_tenant("tenant-1")
            .with_time_range(one_hour_ago, now);
        let hits = engine.search(&query).unwrap();

        assert_eq!(hits.len(), 1);
        assert_eq!(hits[0].event_id, recent_id);
    }

    /// A freshly constructed hybrid engine must report a successful health check.
    #[test]
    fn test_health_check() {
        let (vectors, keywords) = create_test_engines();
        let engine = HybridSearchEngine::new(vectors, keywords);

        assert!(engine.health_check().is_ok());
    }

    /// A freshly constructed hybrid engine must report zero for all four counters
    /// returned by `stats()`.
    #[test]
    fn test_stats() {
        let (vectors, keywords) = create_test_engines();
        let engine = HybridSearchEngine::new(vectors, keywords);

        // Tuple order as destructured by callers: (total, semantic, keyword, hybrid).
        assert_eq!(engine.stats(), (0, 0, 0, 0));
    }
}