rlm_rs/search/
mod.rs

1//! Hybrid search with semantic and lexical retrieval.
2//!
3//! Combines vector similarity search with FTS5 BM25 using Reciprocal Rank Fusion (RRF).
4//!
5//! ## Features
6//!
7//! - **Semantic Search**: Vector similarity using embeddings
8//! - **BM25 Search**: Full-text search using `SQLite` `FTS5`
9//! - **Hybrid Search**: Combines both using Reciprocal Rank Fusion
10//! - **HNSW Index**: Optional scalable approximate nearest neighbor search (requires `usearch-hnsw` feature)
11
12pub mod hnsw;
13mod rrf;
14
15pub use hnsw::{HnswConfig, HnswIndex, HnswResult};
16pub use rrf::{RrfConfig, reciprocal_rank_fusion, weighted_rrf};
17
18use crate::embedding::{Embedder, cosine_similarity};
19use crate::error::Result;
20use crate::storage::{SqliteStorage, Storage};
21
/// Default similarity threshold for semantic search.
///
/// Used as the `similarity_threshold` of `SearchConfig::default()`; semantic search keeps only
/// chunks whose cosine similarity to the query is at least this value.
pub const DEFAULT_SIMILARITY_THRESHOLD: f32 = 0.3;

/// Default number of results to return.
///
/// Used as the `top_k` of `SearchConfig::default()`.
pub const DEFAULT_TOP_K: usize = 10;
27
/// Search result with chunk ID and combined score.
///
/// Carries the chunk's identity and position together with the ranking score and, where
/// available, the per-retriever scores behind it.
#[derive(Debug, Clone)]
pub struct SearchResult {
    /// Chunk ID.
    pub chunk_id: i64,
    /// Buffer ID this chunk belongs to.
    pub buffer_id: i64,
    /// Sequential index within the buffer (0-based, for temporal ordering).
    pub index: usize,
    /// Ranking score (higher is better).
    ///
    /// This is the fused RRF score when both retrievers ran. When only one retriever is
    /// enabled, `hybrid_search` stores that retriever's raw score here instead.
    pub score: f64,
    /// Semantic similarity score (if available).
    pub semantic_score: Option<f32>,
    /// BM25 score (if available).
    pub bm25_score: Option<f64>,
    /// Content preview (if requested).
    ///
    /// `None` when the result is first built; filled in by `populate_previews`.
    pub content_preview: Option<String>,
}
46
/// Configuration for hybrid search.
///
/// Build one with `SearchConfig::new()` / `SearchConfig::default()` and adjust it through the
/// `with_*` builder methods.
#[derive(Debug, Clone)]
pub struct SearchConfig {
    /// Maximum number of results to return.
    ///
    /// Each retriever is asked for up to twice this many candidates before results are cut
    /// down to `top_k`.
    pub top_k: usize,
    /// Minimum similarity threshold for semantic results.
    ///
    /// Chunks whose cosine similarity to the query is below this value are discarded.
    pub similarity_threshold: f32,
    /// RRF k parameter (default 60), passed to `RrfConfig::new` when both retrievers run.
    pub rrf_k: u32,
    /// Whether to include semantic search.
    pub use_semantic: bool,
    /// Whether to include BM25 search.
    pub use_bm25: bool,
}
61
62impl Default for SearchConfig {
63    fn default() -> Self {
64        Self {
65            top_k: DEFAULT_TOP_K,
66            similarity_threshold: DEFAULT_SIMILARITY_THRESHOLD,
67            rrf_k: 60,
68            use_semantic: true,
69            use_bm25: true,
70        }
71    }
72}
73
/// Default preview length in characters.
///
/// NOTE(review): `populate_previews` compares its `preview_len` argument against the content's
/// byte length, so if this constant is passed there it acts as a byte budget rather than a
/// character count for non-ASCII text — confirm the intended unit.
pub const DEFAULT_PREVIEW_LEN: usize = 150;
76
77impl SearchResult {
78    /// Creates a new search result, looking up chunk metadata from storage.
79    ///
80    /// Returns `None` if the chunk cannot be found.
81    fn from_chunk_id(
82        storage: &SqliteStorage,
83        chunk_id: i64,
84        score: f64,
85        semantic_score: Option<f32>,
86        bm25_score: Option<f64>,
87    ) -> Option<Self> {
88        storage
89            .get_chunk(chunk_id)
90            .ok()
91            .flatten()
92            .map(|chunk| Self {
93                chunk_id,
94                buffer_id: chunk.buffer_id,
95                index: chunk.index,
96                score,
97                semantic_score,
98                bm25_score,
99                content_preview: None,
100            })
101    }
102}
103
104/// Populates content previews for search results.
105///
106/// # Arguments
107///
108/// * `storage` - The storage backend.
109/// * `results` - Search results to populate.
110/// * `preview_len` - Maximum preview length in bytes.
111///
112/// # Errors
113///
114/// Returns an error if chunk retrieval fails.
115pub fn populate_previews(
116    storage: &SqliteStorage,
117    results: &mut [SearchResult],
118    preview_len: usize,
119) -> Result<()> {
120    if results.is_empty() {
121        return Ok(());
122    }
123
124    // Batch-fetch all chunks in a single query instead of N individual lookups.
125    let ids: Vec<i64> = results.iter().map(|r| r.chunk_id).collect();
126    let chunk_map = storage.get_chunks_by_ids(&ids)?;
127
128    for result in results.iter_mut() {
129        if let Some(chunk) = chunk_map.get(&result.chunk_id) {
130            let content = &chunk.content;
131            let preview = if content.len() <= preview_len {
132                content.clone()
133            } else {
134                // Find a valid UTF-8 boundary
135                let end = crate::io::find_char_boundary(content, preview_len);
136                let mut preview = content[..end].to_string();
137                if end < content.len() {
138                    preview.push_str("...");
139                }
140                preview
141            };
142            result.content_preview = Some(preview);
143        }
144    }
145    Ok(())
146}
147
148impl SearchConfig {
149    /// Creates a new search config with default values.
150    #[must_use]
151    pub fn new() -> Self {
152        Self::default()
153    }
154
155    /// Sets the top-k limit.
156    #[must_use]
157    pub const fn with_top_k(mut self, top_k: usize) -> Self {
158        self.top_k = top_k;
159        self
160    }
161
162    /// Sets the similarity threshold.
163    #[must_use]
164    pub const fn with_threshold(mut self, threshold: f32) -> Self {
165        self.similarity_threshold = threshold;
166        self
167    }
168
169    /// Sets the RRF k parameter.
170    #[must_use]
171    pub const fn with_rrf_k(mut self, k: u32) -> Self {
172        self.rrf_k = k;
173        self
174    }
175
176    /// Enables or disables semantic search.
177    #[must_use]
178    pub const fn with_semantic(mut self, enabled: bool) -> Self {
179        self.use_semantic = enabled;
180        self
181    }
182
183    /// Enables or disables BM25 search.
184    #[must_use]
185    pub const fn with_bm25(mut self, enabled: bool) -> Self {
186        self.use_bm25 = enabled;
187        self
188    }
189}
190
191/// Performs hybrid search combining semantic and BM25 results.
192///
193/// # Arguments
194///
195/// * `storage` - The storage backend.
196/// * `embedder` - The embedding generator.
197/// * `query` - The search query text.
198/// * `config` - Search configuration.
199///
200/// # Errors
201///
202/// Returns an error if search operations fail.
203pub fn hybrid_search(
204    storage: &SqliteStorage,
205    embedder: &dyn Embedder,
206    query: &str,
207    config: &SearchConfig,
208) -> Result<Vec<SearchResult>> {
209    let mut semantic_results: Vec<(i64, f32)> = Vec::new();
210    let mut bm25_results: Vec<(i64, f64)> = Vec::new();
211
212    // Semantic search
213    if config.use_semantic {
214        semantic_results = semantic_search(storage, embedder, query, config)?;
215    }
216
217    // BM25 search
218    if config.use_bm25 {
219        bm25_results = storage.search_fts(query, config.top_k * 2)?;
220    }
221
222    // If only one type of search is enabled, return those results directly
223    if !config.use_semantic {
224        return Ok(bm25_results
225            .into_iter()
226            .take(config.top_k)
227            .filter_map(|(chunk_id, score)| {
228                SearchResult::from_chunk_id(storage, chunk_id, score, None, Some(score))
229            })
230            .collect());
231    }
232
233    if !config.use_bm25 {
234        return Ok(semantic_results
235            .into_iter()
236            .take(config.top_k)
237            .filter_map(|(chunk_id, score)| {
238                SearchResult::from_chunk_id(storage, chunk_id, f64::from(score), Some(score), None)
239            })
240            .collect());
241    }
242
243    // Combine using RRF
244    let rrf_config = RrfConfig::new(config.rrf_k);
245
246    // Convert to ranked lists (already sorted by score descending)
247    let semantic_ranked: Vec<i64> = semantic_results.iter().map(|(id, _)| *id).collect();
248    let bm25_ranked: Vec<i64> = bm25_results.iter().map(|(id, _)| *id).collect();
249
250    let fused = reciprocal_rank_fusion(&[&semantic_ranked, &bm25_ranked], &rrf_config);
251
252    // Build result with original scores
253    let semantic_map: std::collections::HashMap<i64, f32> = semantic_results.into_iter().collect();
254    let bm25_map: std::collections::HashMap<i64, f64> = bm25_results.into_iter().collect();
255
256    let results: Vec<SearchResult> = fused
257        .into_iter()
258        .take(config.top_k)
259        .filter_map(|(chunk_id, rrf_score)| {
260            SearchResult::from_chunk_id(
261                storage,
262                chunk_id,
263                rrf_score,
264                semantic_map.get(&chunk_id).copied(),
265                bm25_map.get(&chunk_id).copied(),
266            )
267        })
268        .collect();
269
270    Ok(results)
271}
272
273/// Performs semantic similarity search.
274///
275/// Uses cosine similarity between query embedding and stored chunk embeddings.
276fn semantic_search(
277    storage: &SqliteStorage,
278    embedder: &dyn Embedder,
279    query: &str,
280    config: &SearchConfig,
281) -> Result<Vec<(i64, f32)>> {
282    use rayon::prelude::*;
283
284    // Generate query embedding
285    let query_embedding = embedder.embed(query)?;
286
287    // Get all embeddings from storage
288    let all_embeddings = storage.get_all_embeddings()?;
289
290    if all_embeddings.is_empty() {
291        return Ok(Vec::new());
292    }
293
294    // Calculate similarities in parallel (rayon data parallelism)
295    let mut similarities: Vec<(i64, f32)> = all_embeddings
296        .par_iter()
297        .map(|(chunk_id, embedding)| {
298            let sim = cosine_similarity(&query_embedding, embedding);
299            (*chunk_id, sim)
300        })
301        .filter(|(_, sim)| *sim >= config.similarity_threshold)
302        .collect();
303
304    // Sort by similarity descending
305    similarities.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
306
307    // Limit results
308    similarities.truncate(config.top_k * 2);
309
310    Ok(similarities)
311}
312
313/// Performs semantic-only search.
314///
315/// # Arguments
316///
317/// * `storage` - The storage backend.
318/// * `embedder` - The embedding generator.
319/// * `query` - The search query text.
320/// * `top_k` - Maximum number of results.
321/// * `threshold` - Minimum similarity threshold.
322///
323/// # Errors
324///
325/// Returns an error if search fails.
326pub fn search_semantic(
327    storage: &SqliteStorage,
328    embedder: &dyn Embedder,
329    query: &str,
330    top_k: usize,
331    threshold: f32,
332) -> Result<Vec<SearchResult>> {
333    let config = SearchConfig::new()
334        .with_top_k(top_k)
335        .with_threshold(threshold)
336        .with_semantic(true)
337        .with_bm25(false);
338
339    hybrid_search(storage, embedder, query, &config)
340}
341
342/// Performs BM25-only search.
343///
344/// # Arguments
345///
346/// * `storage` - The storage backend.
347/// * `query` - The search query text.
348/// * `top_k` - Maximum number of results.
349///
350/// # Errors
351///
352/// Returns an error if search fails.
353pub fn search_bm25(
354    storage: &SqliteStorage,
355    query: &str,
356    top_k: usize,
357) -> Result<Vec<SearchResult>> {
358    let results = storage.search_fts(query, top_k)?;
359
360    Ok(results
361        .into_iter()
362        .filter_map(|(chunk_id, score)| {
363            SearchResult::from_chunk_id(storage, chunk_id, score, None, Some(score))
364        })
365        .collect())
366}
367
368/// Generates and stores embeddings for all chunks in a buffer.
369///
370/// # Arguments
371///
372/// * `storage` - The storage backend (mutable for storing embeddings).
373/// * `embedder` - The embedding generator.
374/// * `buffer_id` - The buffer ID to process.
375///
376/// # Returns
377///
378/// The number of chunks embedded.
379///
380/// # Errors
381///
382/// Returns an error if embedding generation or storage fails.
383pub fn embed_buffer_chunks(
384    storage: &mut SqliteStorage,
385    embedder: &dyn Embedder,
386    buffer_id: i64,
387) -> Result<usize> {
388    let chunks = storage.get_chunks(buffer_id)?;
389
390    if chunks.is_empty() {
391        return Ok(0);
392    }
393
394    // Collect chunk texts for batch embedding
395    let texts: Vec<&str> = chunks.iter().map(|c| c.content.as_str()).collect();
396
397    // Generate embeddings in batch
398    let embeddings = embedder.embed_batch(&texts)?;
399
400    // Prepare batch for storage
401    let batch: Vec<(i64, Vec<f32>)> = chunks
402        .iter()
403        .zip(embeddings)
404        .filter_map(|(chunk, embedding)| chunk.id.map(|id| (id, embedding)))
405        .collect();
406
407    let count = batch.len();
408
409    // Store embeddings with model name for version tracking
410    storage.store_embeddings_batch(&batch, Some(embedder.model_name()))?;
411
412    Ok(count)
413}
414
415/// Checks if a buffer has all chunks embedded.
416///
417/// Uses a single SQL query (`NOT EXISTS`) instead of per-chunk lookups, reducing database
418/// round-trips from O(n) to O(1) for a buffer with n chunks.
419///
420/// # Errors
421///
422/// Returns an error if the check fails.
423pub fn buffer_fully_embedded(storage: &SqliteStorage, buffer_id: i64) -> Result<bool> {
424    storage.all_chunks_have_embeddings(buffer_id)
425}
426
427/// Checks if existing embeddings were created with a different model.
428///
429/// Returns `Some(existing_model)` if there's a model mismatch, `None` otherwise.
430///
431/// # Errors
432///
433/// Returns an error if the query fails.
434pub fn check_model_mismatch(
435    storage: &SqliteStorage,
436    buffer_id: i64,
437    current_model: &str,
438) -> Result<Option<String>> {
439    let models = storage.get_embedding_models(buffer_id)?;
440
441    // If no embeddings exist, no mismatch
442    if models.is_empty() {
443        return Ok(None);
444    }
445
446    // Check if any existing model differs from the current one
447    for model in models {
448        if model != current_model {
449            return Ok(Some(model));
450        }
451    }
452
453    Ok(None)
454}
455
/// Information about embedding model versions for a buffer.
///
/// Produced by `get_embedding_model_info`.
#[derive(Debug, Clone)]
pub struct EmbeddingModelInfo {
    /// Model names and their embedding counts.
    ///
    /// NOTE(review): a `None` name presumably marks embeddings stored without a model name —
    /// confirm against the storage layer.
    pub models: Vec<(Option<String>, i64)>,
    /// Total number of embeddings (sum of the counts in `models`).
    pub total_embeddings: i64,
    /// Whether there are mixed model versions, i.e. `models` contains more than one distinct
    /// name (`None` counts as a name of its own).
    pub has_mixed_models: bool,
}
466
467/// Gets embedding model information for a buffer.
468///
469/// # Errors
470///
471/// Returns an error if the query fails.
472pub fn get_embedding_model_info(
473    storage: &SqliteStorage,
474    buffer_id: i64,
475) -> Result<EmbeddingModelInfo> {
476    let models = storage.get_embedding_model_counts(buffer_id)?;
477    let total_embeddings: i64 = models.iter().map(|(_, count)| count).sum();
478    let distinct_models: std::collections::HashSet<_> =
479        models.iter().map(|(name, _)| name.as_deref()).collect();
480    let has_mixed_models = distinct_models.len() > 1;
481
482    Ok(EmbeddingModelInfo {
483        models,
484        total_embeddings,
485        has_mixed_models,
486    })
487}
488
/// Result of an incremental embedding operation.
///
/// Returned by `embed_buffer_chunks_incremental`.
#[derive(Debug, Clone)]
pub struct IncrementalEmbedResult {
    /// Number of new embeddings created (stored embeddings that did not replace an existing
    /// one).
    pub embedded_count: usize,
    /// Number of chunks that were skipped (already embedded with correct model).
    pub skipped_count: usize,
    /// Number of embeddings that were replaced (different model).
    pub replaced_count: usize,
    /// Total chunks in the buffer, as reported by the storage embedding stats.
    pub total_chunks: usize,
    /// Model name used for embedding.
    pub model_name: String,
}
503
504impl IncrementalEmbedResult {
505    /// Returns true if any embeddings were created or updated.
506    #[must_use]
507    pub const fn had_changes(&self) -> bool {
508        self.embedded_count > 0 || self.replaced_count > 0
509    }
510
511    /// Returns the percentage of chunks now embedded.
512    #[must_use]
513    #[allow(clippy::cast_precision_loss)] // Acceptable for percentage calculation
514    pub fn completion_percentage(&self) -> f64 {
515        if self.total_chunks == 0 {
516            100.0
517        } else {
518            let completed = self.embedded_count + self.skipped_count + self.replaced_count;
519            (completed as f64 / self.total_chunks as f64) * 100.0
520        }
521    }
522}
523
524/// Incrementally embeds chunks in a buffer.
525///
526/// Only embeds chunks that:
527/// - Have no embedding, OR
528/// - Have an embedding from a different model (if `force_reembed` is true)
529///
530/// This is more efficient than `embed_buffer_chunks` for large buffers
531/// where only a few chunks have changed.
532///
533/// # Arguments
534///
535/// * `storage` - The storage backend.
536/// * `embedder` - The embedder to use.
537/// * `buffer_id` - The buffer to embed.
538/// * `force_reembed` - If true, re-embeds chunks with different models.
539///
540/// # Returns
541///
542/// An `IncrementalEmbedResult` with statistics about what was done.
543///
544/// # Errors
545///
546/// Returns an error if embedding generation or storage fails.
547pub fn embed_buffer_chunks_incremental(
548    storage: &mut SqliteStorage,
549    embedder: &dyn Embedder,
550    buffer_id: i64,
551    force_reembed: bool,
552) -> Result<IncrementalEmbedResult> {
553    let current_model = embedder.model_name();
554    let stats = storage.get_embedding_stats(buffer_id)?;
555    let total_chunks = stats.total_chunks;
556
557    // Determine which chunks need embedding
558    let model_to_check = if force_reembed {
559        Some(current_model)
560    } else {
561        None
562    };
563
564    let chunk_ids_to_embed = storage.get_chunks_needing_embedding(buffer_id, model_to_check)?;
565
566    if chunk_ids_to_embed.is_empty() {
567        return Ok(IncrementalEmbedResult {
568            embedded_count: 0,
569            skipped_count: total_chunks,
570            replaced_count: 0,
571            total_chunks,
572            model_name: current_model.to_string(),
573        });
574    }
575
576    // Load the chunks we need to embed
577    let all_chunks = storage.get_chunks(buffer_id)?;
578    let chunks_to_embed: Vec<_> = all_chunks
579        .iter()
580        .filter(|c| c.id.is_some_and(|id| chunk_ids_to_embed.contains(&id)))
581        .collect();
582
583    // Count how many are replacements (had embeddings before)
584    let mut replaced_count = 0;
585    for chunk in &chunks_to_embed {
586        if let Some(id) = chunk.id
587            && storage.has_embedding(id)?
588        {
589            replaced_count += 1;
590        }
591    }
592
593    // Generate embeddings
594    let texts: Vec<&str> = chunks_to_embed.iter().map(|c| c.content.as_str()).collect();
595    let embeddings = embedder.embed_batch(&texts)?;
596
597    // Store embeddings
598    let batch: Vec<(i64, Vec<f32>)> = chunks_to_embed
599        .iter()
600        .zip(embeddings)
601        .filter_map(|(chunk, embedding)| chunk.id.map(|id| (id, embedding)))
602        .collect();
603
604    let embedded_count = batch.len();
605    storage.store_embeddings_batch(&batch, Some(current_model))?;
606
607    let new_embeddings = embedded_count - replaced_count;
608    let skipped_count = total_chunks - embedded_count;
609
610    Ok(IncrementalEmbedResult {
611        embedded_count: new_embeddings,
612        skipped_count,
613        replaced_count,
614        total_chunks,
615        model_name: current_model.to_string(),
616    })
617}
618
619#[cfg(test)]
620mod tests {
621    use super::*;
622    use crate::core::{Buffer, Chunk};
623    use crate::embedding::{DEFAULT_DIMENSIONS, FallbackEmbedder};
624    use crate::storage::Storage;
625
626    fn setup_storage() -> SqliteStorage {
627        let mut storage = SqliteStorage::in_memory().unwrap();
628        storage.init().unwrap();
629        storage
630    }
631
632    fn setup_storage_with_chunks() -> SqliteStorage {
633        let mut storage = setup_storage();
634
635        // Create a buffer
636        let buffer = Buffer::from_named(
637            "test.txt".to_string(),
638            "Test content for searching".to_string(),
639        );
640        let buffer_id = storage.add_buffer(&buffer).unwrap();
641
642        // Create chunks with different content
643        let chunks = vec![
644            Chunk::new(
645                buffer_id,
646                "The quick brown fox jumps over the lazy dog".to_string(),
647                0..44,
648                0,
649            ),
650            Chunk::new(
651                buffer_id,
652                "Machine learning is a subset of artificial intelligence".to_string(),
653                44..100,
654                1,
655            ),
656            Chunk::new(
657                buffer_id,
658                "Rust is a systems programming language".to_string(),
659                100..139,
660                2,
661            ),
662        ];
663
664        storage.add_chunks(buffer_id, &chunks).unwrap();
665
666        storage
667    }
668
669    #[test]
670    fn test_search_config_default() {
671        let config = SearchConfig::default();
672        assert_eq!(config.top_k, DEFAULT_TOP_K);
673        assert!((config.similarity_threshold - DEFAULT_SIMILARITY_THRESHOLD).abs() < f32::EPSILON);
674        assert_eq!(config.rrf_k, 60);
675        assert!(config.use_semantic);
676        assert!(config.use_bm25);
677    }
678
679    #[test]
680    fn test_search_config_builder() {
681        let config = SearchConfig::new()
682            .with_top_k(20)
683            .with_threshold(0.5)
684            .with_rrf_k(30)
685            .with_semantic(false)
686            .with_bm25(true);
687
688        assert_eq!(config.top_k, 20);
689        assert!((config.similarity_threshold - 0.5).abs() < f32::EPSILON);
690        assert_eq!(config.rrf_k, 30);
691        assert!(!config.use_semantic);
692        assert!(config.use_bm25);
693    }
694
695    #[test]
696    fn test_search_bm25() {
697        let storage = setup_storage_with_chunks();
698
699        // Search for "fox" - should find the first chunk
700        let results = search_bm25(&storage, "fox", 10).unwrap();
701        assert!(!results.is_empty());
702        assert!(results[0].bm25_score.is_some());
703        assert!(results[0].semantic_score.is_none());
704    }
705
706    #[test]
707    fn test_search_bm25_no_results() {
708        let storage = setup_storage_with_chunks();
709
710        // Search for something not in the content
711        let results = search_bm25(&storage, "xyz123nonexistent", 10).unwrap();
712        assert!(results.is_empty());
713    }
714
715    #[test]
716    fn test_embed_buffer_chunks() {
717        let mut storage = setup_storage_with_chunks();
718        let embedder = FallbackEmbedder::new(DEFAULT_DIMENSIONS);
719
720        // Embed chunks for buffer 1
721        let count = embed_buffer_chunks(&mut storage, &embedder, 1).unwrap();
722        assert_eq!(count, 3); // We created 3 chunks
723    }
724
725    #[test]
726    fn test_embed_buffer_chunks_empty() {
727        let mut storage = setup_storage();
728        let embedder = FallbackEmbedder::new(DEFAULT_DIMENSIONS);
729
730        // Create buffer with no chunks
731        let buffer = Buffer::from_named("empty.txt".to_string(), String::new());
732        let buffer_id = storage.add_buffer(&buffer).unwrap();
733
734        let count = embed_buffer_chunks(&mut storage, &embedder, buffer_id).unwrap();
735        assert_eq!(count, 0);
736    }
737
738    #[test]
739    fn test_buffer_fully_embedded_empty() {
740        let mut storage = setup_storage();
741
742        // Create buffer with no chunks
743        let buffer = Buffer::from_named("empty.txt".to_string(), String::new());
744        let buffer_id = storage.add_buffer(&buffer).unwrap();
745
746        // Empty buffer should be "fully embedded"
747        let result = buffer_fully_embedded(&storage, buffer_id).unwrap();
748        assert!(result);
749    }
750
751    #[test]
752    fn test_buffer_fully_embedded_with_embeddings() {
753        let mut storage = setup_storage_with_chunks();
754        let embedder = FallbackEmbedder::new(DEFAULT_DIMENSIONS);
755
756        // Before embedding
757        let result = buffer_fully_embedded(&storage, 1).unwrap();
758        assert!(!result);
759
760        // Embed all chunks
761        embed_buffer_chunks(&mut storage, &embedder, 1).unwrap();
762
763        // After embedding
764        let result = buffer_fully_embedded(&storage, 1).unwrap();
765        assert!(result);
766    }
767
768    #[test]
769    fn test_hybrid_search_bm25_only() {
770        let storage = setup_storage_with_chunks();
771        let embedder = FallbackEmbedder::new(DEFAULT_DIMENSIONS);
772
773        let config = SearchConfig::new().with_semantic(false).with_bm25(true);
774
775        let results = hybrid_search(&storage, &embedder, "programming", &config).unwrap();
776        // Should find "Rust is a systems programming language"
777        assert!(!results.is_empty());
778        assert!(results[0].bm25_score.is_some());
779        assert!(results[0].semantic_score.is_none());
780    }
781
782    #[test]
783    fn test_hybrid_search_semantic_only() {
784        let mut storage = setup_storage_with_chunks();
785        let embedder = FallbackEmbedder::new(DEFAULT_DIMENSIONS);
786
787        // First embed the chunks
788        embed_buffer_chunks(&mut storage, &embedder, 1).unwrap();
789
790        let config = SearchConfig::new()
791            .with_semantic(true)
792            .with_bm25(false)
793            .with_threshold(0.0); // Low threshold for fallback embedder
794
795        let results = hybrid_search(&storage, &embedder, "programming language", &config).unwrap();
796        assert!(!results.is_empty());
797        assert!(results[0].semantic_score.is_some());
798        assert!(results[0].bm25_score.is_none());
799    }
800
801    #[test]
802    fn test_hybrid_search_both() {
803        let mut storage = setup_storage_with_chunks();
804        let embedder = FallbackEmbedder::new(DEFAULT_DIMENSIONS);
805
806        // First embed the chunks
807        embed_buffer_chunks(&mut storage, &embedder, 1).unwrap();
808
809        let config = SearchConfig::new()
810            .with_semantic(true)
811            .with_bm25(true)
812            .with_threshold(0.0); // Low threshold for fallback embedder
813
814        let results = hybrid_search(&storage, &embedder, "programming", &config).unwrap();
815        assert!(!results.is_empty());
816    }
817
818    #[test]
819    fn test_search_semantic() {
820        let mut storage = setup_storage_with_chunks();
821        let embedder = FallbackEmbedder::new(DEFAULT_DIMENSIONS);
822
823        // First embed the chunks
824        embed_buffer_chunks(&mut storage, &embedder, 1).unwrap();
825
826        let results = search_semantic(&storage, &embedder, "test query", 10, 0.0).unwrap();
827        // Should return results with semantic scores only
828        for result in &results {
829            assert!(result.semantic_score.is_some());
830            assert!(result.bm25_score.is_none());
831        }
832    }
833
834    #[test]
835    fn test_search_semantic_empty_embeddings() {
836        let storage = setup_storage_with_chunks();
837        let embedder = FallbackEmbedder::new(DEFAULT_DIMENSIONS);
838
839        // Don't embed chunks - search should return empty
840        let results = search_semantic(&storage, &embedder, "test query", 10, 0.5).unwrap();
841        assert!(results.is_empty());
842    }
843
844    #[test]
845    fn test_incremental_embed_new_chunks() {
846        let mut storage = setup_storage_with_chunks();
847        let embedder = FallbackEmbedder::new(DEFAULT_DIMENSIONS);
848
849        // First incremental embed - should embed all 3 chunks
850        let result = embed_buffer_chunks_incremental(&mut storage, &embedder, 1, false).unwrap();
851        assert_eq!(result.embedded_count, 3);
852        assert_eq!(result.skipped_count, 0);
853        assert_eq!(result.replaced_count, 0);
854        assert_eq!(result.total_chunks, 3);
855        assert!(result.had_changes());
856
857        // Second incremental embed - should skip all (already embedded)
858        let result2 = embed_buffer_chunks_incremental(&mut storage, &embedder, 1, false).unwrap();
859        assert_eq!(result2.embedded_count, 0);
860        assert_eq!(result2.skipped_count, 3);
861        assert_eq!(result2.replaced_count, 0);
862        assert!(!result2.had_changes());
863    }
864
865    #[test]
866    fn test_incremental_embed_force_reembed() {
867        let mut storage = setup_storage_with_chunks();
868        let embedder = FallbackEmbedder::new(DEFAULT_DIMENSIONS);
869
870        // First embed normally
871        embed_buffer_chunks_incremental(&mut storage, &embedder, 1, false).unwrap();
872
873        // Force re-embed - should replace all 3
874        let result = embed_buffer_chunks_incremental(&mut storage, &embedder, 1, true).unwrap();
875        // All chunks already have correct model, so no changes needed even with force
876        // (force only affects different-model embeddings)
877        assert_eq!(result.skipped_count, 3);
878        assert!(!result.had_changes());
879    }
880
881    #[test]
882    fn test_parallel_semantic_search() {
883        let mut storage = setup_storage_with_chunks();
884        let embedder = FallbackEmbedder::new(DEFAULT_DIMENSIONS);
885
886        // Embed all chunks so semantic_search has data to work with
887        embed_buffer_chunks(&mut storage, &embedder, 1).unwrap();
888
889        // Use a zero threshold to capture all results from the fallback embedder
890        let config = SearchConfig::new().with_top_k(10).with_threshold(0.0);
891
892        let results = semantic_search(&storage, &embedder, "test query", &config).unwrap();
893
894        // Should return results (we embedded 3 chunks)
895        assert!(!results.is_empty());
896
897        // Verify results are sorted by descending similarity
898        for window in results.windows(2) {
899            assert!(
900                window[0].1 >= window[1].1,
901                "Results should be sorted by descending similarity: {} >= {}",
902                window[0].1,
903                window[1].1,
904            );
905        }
906
907        // Verify threshold filtering: with a very high threshold, results should be excluded
908        let strict_config = SearchConfig::new().with_top_k(10).with_threshold(0.99);
909        let strict_results =
910            semantic_search(&storage, &embedder, "test query", &strict_config).unwrap();
911
912        // Strict threshold should return fewer (or no) results
913        assert!(
914            strict_results.len() <= results.len(),
915            "Strict threshold should not return more results than lenient threshold",
916        );
917    }
918
919    #[test]
920    fn test_incremental_embed_result_completion() {
921        let result = IncrementalEmbedResult {
922            embedded_count: 2,
923            skipped_count: 3,
924            replaced_count: 0,
925            total_chunks: 5,
926            model_name: "test".to_string(),
927        };
928        assert!(result.had_changes());
929        assert!((result.completion_percentage() - 100.0).abs() < f64::EPSILON);
930    }
931
932    #[test]
933    fn test_completion_percentage_zero_chunks() {
934        let result = IncrementalEmbedResult {
935            embedded_count: 0,
936            skipped_count: 0,
937            replaced_count: 0,
938            total_chunks: 0,
939            model_name: "test".to_string(),
940        };
941        // Zero-chunk buffer is considered 100% complete
942        assert!((result.completion_percentage() - 100.0).abs() < f64::EPSILON);
943        assert!(!result.had_changes());
944    }
945
946    #[test]
947    fn test_completion_percentage_partial() {
948        let result = IncrementalEmbedResult {
949            embedded_count: 1,
950            skipped_count: 1,
951            replaced_count: 0,
952            total_chunks: 4,
953            model_name: "test".to_string(),
954        };
955        assert!((result.completion_percentage() - 50.0).abs() < f64::EPSILON);
956        assert!(result.had_changes());
957    }
958
959    #[test]
960    fn test_had_changes_replaced_only() {
961        let result = IncrementalEmbedResult {
962            embedded_count: 0,
963            skipped_count: 2,
964            replaced_count: 1,
965            total_chunks: 3,
966            model_name: "test".to_string(),
967        };
968        assert!(result.had_changes());
969    }
970
971    #[test]
972    fn test_check_model_mismatch_no_embeddings() {
973        let mut storage = setup_storage();
974        let buffer = crate::core::Buffer::from_named("x.txt".to_string(), "hi".to_string());
975        let buffer_id = storage.add_buffer(&buffer).unwrap();
976
977        // No embeddings → no mismatch
978        let result = check_model_mismatch(&storage, buffer_id, "model-a").unwrap();
979        assert!(result.is_none());
980    }
981
982    #[test]
983    fn test_check_model_mismatch_same_model() {
984        let mut storage = setup_storage_with_chunks();
985        let embedder = FallbackEmbedder::new(DEFAULT_DIMENSIONS);
986
987        embed_buffer_chunks(&mut storage, &embedder, 1).unwrap();
988
989        // Same model → no mismatch
990        let result = check_model_mismatch(&storage, 1, embedder.model_name()).unwrap();
991        assert!(result.is_none());
992    }
993
994    #[test]
995    fn test_check_model_mismatch_different_model() {
996        let mut storage = setup_storage_with_chunks();
997        let embedder = FallbackEmbedder::new(DEFAULT_DIMENSIONS);
998
999        embed_buffer_chunks(&mut storage, &embedder, 1).unwrap();
1000
1001        // Ask with a different model name → returns the existing model
1002        let result = check_model_mismatch(&storage, 1, "some-other-model").unwrap();
1003        assert!(result.is_some());
1004        assert_eq!(result.unwrap(), embedder.model_name());
1005    }
1006
1007    #[test]
1008    fn test_get_embedding_model_info_no_embeddings() {
1009        let mut storage = setup_storage();
1010        let buffer = crate::core::Buffer::from_named("x.txt".to_string(), "hi".to_string());
1011        let buffer_id = storage.add_buffer(&buffer).unwrap();
1012
1013        let info = get_embedding_model_info(&storage, buffer_id).unwrap();
1014        assert_eq!(info.total_embeddings, 0);
1015        assert!(info.models.is_empty());
1016        assert!(!info.has_mixed_models);
1017    }
1018
1019    #[test]
1020    fn test_get_embedding_model_info_single_model() {
1021        let mut storage = setup_storage_with_chunks();
1022        let embedder = FallbackEmbedder::new(DEFAULT_DIMENSIONS);
1023
1024        embed_buffer_chunks(&mut storage, &embedder, 1).unwrap();
1025
1026        let info = get_embedding_model_info(&storage, 1).unwrap();
1027        assert_eq!(info.total_embeddings, 3);
1028        assert!(!info.has_mixed_models);
1029    }
1030
1031    #[test]
1032    fn test_populate_previews_short_content() {
1033        let storage = setup_storage_with_chunks();
1034
1035        // Get a result with a known chunk_id
1036        let results_raw = search_bm25(&storage, "fox", 1).unwrap();
1037        assert!(!results_raw.is_empty());
1038
1039        let mut results = results_raw;
1040        populate_previews(&storage, &mut results, 200).unwrap();
1041
1042        // Content is shorter than preview_len → no ellipsis
1043        let preview = results[0].content_preview.as_ref().unwrap();
1044        assert!(!preview.ends_with("..."));
1045        assert!(!preview.is_empty());
1046    }
1047
1048    #[test]
1049    fn test_populate_previews_truncates_long_content() {
1050        let mut storage = setup_storage();
1051        let buffer = crate::core::Buffer::from_named("long.txt".to_string(), String::new());
1052        let buffer_id = storage.add_buffer(&buffer).unwrap();
1053
1054        // Create a chunk with long content
1055        let long_content = "word ".repeat(100); // 500 chars
1056        let chunk =
1057            crate::core::Chunk::new(buffer_id, long_content.clone(), 0..long_content.len(), 0);
1058        storage.add_chunks(buffer_id, &[chunk]).unwrap();
1059
1060        let chunks = storage.get_chunks(buffer_id).unwrap();
1061        let chunk_id = chunks[0].id.unwrap();
1062
1063        // Build a synthetic SearchResult pointing at this chunk
1064        let mut results = vec![SearchResult {
1065            chunk_id,
1066            buffer_id,
1067            index: 0,
1068            score: 1.0,
1069            semantic_score: None,
1070            bm25_score: None,
1071            content_preview: None,
1072        }];
1073
1074        populate_previews(&storage, &mut results, 20).unwrap();
1075
1076        let preview = results[0].content_preview.as_ref().unwrap();
1077        assert!(
1078            preview.ends_with("..."),
1079            "Expected ellipsis, got: {preview}"
1080        );
1081        // Should be no longer than preview_len + "..."
1082        assert!(preview.len() <= 23);
1083    }
1084
1085    #[test]
1086    fn test_populate_previews_utf8_boundary() {
1087        let mut storage = setup_storage();
1088        let buffer = crate::core::Buffer::from_named("utf8.txt".to_string(), String::new());
1089        let buffer_id = storage.add_buffer(&buffer).unwrap();
1090
1091        // Content where a naive byte truncation would split a multi-byte char
1092        // '日' is 3 bytes; place it so it straddles the preview boundary
1093        let content = "hello \u{65E5}\u{672C}\u{8A9E}"; // "hello 日本語"
1094        let chunk = crate::core::Chunk::new(buffer_id, content.to_string(), 0..content.len(), 0);
1095        storage.add_chunks(buffer_id, &[chunk]).unwrap();
1096
1097        let chunks = storage.get_chunks(buffer_id).unwrap();
1098        let chunk_id = chunks[0].id.unwrap();
1099
1100        let mut results = vec![SearchResult {
1101            chunk_id,
1102            buffer_id,
1103            index: 0,
1104            score: 1.0,
1105            semantic_score: None,
1106            bm25_score: None,
1107            content_preview: None,
1108        }];
1109
1110        // preview_len=7 bytes falls inside '日' (which occupies bytes 6-8),
1111        // so the implementation must back up to the nearest valid UTF-8 boundary.
1112        populate_previews(&storage, &mut results, 7).unwrap();
1113
1114        let preview = results[0].content_preview.as_ref().unwrap();
1115        // Must end with ellipsis because the content is longer than preview_len
1116        assert!(preview.ends_with("..."), "Expected ellipsis in: {preview}");
1117        // The prefix before "..." must be exactly "hello " — truncated at the
1118        // char boundary before '日', not mid-way through its bytes.
1119        let body = preview.trim_end_matches("...");
1120        assert_eq!(
1121            body, "hello ",
1122            "Expected truncation at char boundary, got: {body:?}"
1123        );
1124    }
1125}
rlm_rs/search/mod.rs

rlm_rs/search/
mod.rs