brainwires-storage 0.7.0

Backend-agnostic storage, tiered memory, and document management for the Brainwires Agent Framework
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
use anyhow::{Context, Result};
use std::collections::HashMap;
use std::path::Path;
use std::sync::Mutex;
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::{Index, IndexWriter, ReloadPolicy, TantivyDocument, doc};

/// BM25-based keyword search using Tantivy
///
/// Bundles a Tantivy [`Index`] stored in a directory on disk with the field
/// handles used to write and query documents, plus an in-process mutex that
/// serializes every operation that opens an `IndexWriter`.
pub struct BM25Search {
    /// The Tantivy index that all reads and writes go through.
    index: Index,
    /// Handle of the `id` field (u64 document identifier).
    id_field: Field,
    /// Handle of the `content` field (the field queries are parsed against).
    content_field: Field,
    /// Handle of the `file_path` field (used to delete documents by path).
    file_path_field: Field,
    /// Path to the index directory (needed for lock cleanup)
    index_path: std::path::PathBuf,
    /// Mutex to ensure only one IndexWriter is created at a time
    writer_lock: Mutex<()>,
}

/// Search result from BM25
///
/// One hit produced by [`BM25Search::search`]: the stored `id` of the matching
/// document together with the score reported for it.
#[derive(Debug, Clone)]
pub struct BM25Result {
    /// Document identifier.
    pub id: u64,
    /// BM25 relevance score.
    pub score: f32,
}

impl BM25Search {
    /// Create a new BM25 search index
    ///
    /// Opens the index stored in `index_path`, creating the directory and a
    /// fresh index when none exists yet. An existing index is reused only if
    /// its schema contains the `id`, `content` and `file_path` fields;
    /// otherwise the directory is wiped and the index is recreated.
    ///
    /// The field handles kept on the returned value are resolved from the
    /// schema of the index that was actually opened, so they stay valid even
    /// if the on-disk schema declares its fields in a different order or
    /// carries additional fields.
    ///
    /// # Errors
    ///
    /// Returns an error if the directory cannot be created or removed, if the
    /// index cannot be opened or created, or if a required field cannot be
    /// resolved from the final schema.
    pub fn new<P: AsRef<Path>>(index_path: P) -> Result<Self> {
        // Names of the fields every method of this type relies on.
        const REQUIRED_FIELDS: [&str; 3] = ["id", "content", "file_path"];

        let index_path = index_path.as_ref().to_path_buf();

        // Schema used whenever a brand-new index has to be created.
        // content is TEXT | STORED so documents can be retrieved after indexing.
        let build_schema = || {
            let mut schema_builder = Schema::builder();
            schema_builder.add_u64_field("id", STORED | INDEXED);
            schema_builder.add_text_field("content", TEXT | STORED);
            schema_builder.add_text_field("file_path", STRING | STORED);
            schema_builder.build()
        };

        // Create or open index, validating schema on reopen to detect drift.
        std::fs::create_dir_all(&index_path).context("Failed to create BM25 index directory")?;

        let index = if index_path.join("meta.json").exists() {
            let existing =
                Index::open_in_dir(&index_path).context("Failed to open existing BM25 index")?;
            // Validate that the on-disk schema contains the expected fields.
            // If it doesn't (e.g. after a schema change), recreate the index.
            let on_disk = existing.schema();
            let schema_ok = REQUIRED_FIELDS
                .iter()
                .all(|&name| on_disk.get_field(name).is_ok());
            if schema_ok {
                existing
            } else {
                tracing::warn!(
                    "BM25 index schema mismatch at {:?} — recreating index",
                    index_path
                );
                // Release our handle on the old index before deleting its files.
                drop(existing);
                std::fs::remove_dir_all(&index_path)
                    .context("Failed to remove stale BM25 index")?;
                std::fs::create_dir_all(&index_path)
                    .context("Failed to recreate BM25 index directory")?;
                Index::create_in_dir(&index_path, build_schema())
                    .context("Failed to recreate BM25 index")?
            }
        } else {
            Index::create_in_dir(&index_path, build_schema())
                .context("Failed to create BM25 index")?
        };

        // Resolve the handles from the schema the index really uses rather
        // than from a freshly built one: a `Field` is only meaningful relative
        // to the schema it came from.
        let schema = index.schema();
        let id_field = schema
            .get_field("id")
            .context("BM25 index schema is missing the 'id' field")?;
        let content_field = schema
            .get_field("content")
            .context("BM25 index schema is missing the 'content' field")?;
        let file_path_field = schema
            .get_field("file_path")
            .context("BM25 index schema is missing the 'file_path' field")?;

        Ok(Self {
            index,
            id_field,
            content_field,
            file_path_field,
            index_path,
            writer_lock: Mutex::new(()),
        })
    }

    /// Report whether the lock file at `lock_path` looks abandoned.
    ///
    /// A lock counts as stale only when the file exists and its modification
    /// time lies more than 5 minutes (300 seconds) in the past. A missing
    /// file, or any failure to read the metadata, the modification time, or
    /// the elapsed time, yields `false`.
    fn is_lock_stale(lock_path: &Path) -> bool {
        if !lock_path.exists() {
            return false;
        }

        // Any failure along the way collapses to `None`, i.e. "not stale".
        let age_secs = std::fs::metadata(lock_path)
            .ok()
            .and_then(|meta| meta.modified().ok())
            .and_then(|mtime| mtime.elapsed().ok())
            .map(|age| age.as_secs());

        match age_secs {
            // Consider lock stale if older than 5 minutes
            Some(secs) => secs > 300,
            None => false,
        }
    }

    /// Try to clean up stale lock files only if they appear to be from crashed processes
    fn try_cleanup_stale_locks(index_path: &Path) -> Result<bool> {
        let writer_lock = index_path.join(".tantivy-writer.lock");
        let meta_lock = index_path.join(".tantivy-meta.lock");

        let writer_stale = Self::is_lock_stale(&writer_lock);
        let meta_stale = Self::is_lock_stale(&meta_lock);

        if !writer_stale && !meta_stale {
            return Ok(false); // Locks appear to be active
        }

        if writer_stale && writer_lock.exists() {
            tracing::warn!(
                "Removing stale Tantivy writer lock file (>5min old): {:?}",
                writer_lock
            );
            std::fs::remove_file(&writer_lock)
                .context("Failed to remove stale writer lock file")?;
        }

        if meta_stale && meta_lock.exists() {
            tracing::warn!(
                "Removing stale Tantivy meta lock file (>5min old): {:?}",
                meta_lock
            );
            std::fs::remove_file(&meta_lock).context("Failed to remove stale meta lock file")?;
        }

        Ok(true) // Cleaned up stale locks
    }

    /// Add documents to the index
    ///
    /// Arguments:
    /// * `documents` - Vec of (id, content, file_path) tuples
    pub fn add_documents(&self, documents: Vec<(u64, String, String)>) -> Result<()> {
        // Lock to ensure only one writer at a time (within this process)
        let _guard = self
            .writer_lock
            .lock()
            .map_err(|e| anyhow::anyhow!("Failed to acquire writer lock: {}", e))?;

        // Try to create the index writer
        let mut index_writer: IndexWriter<TantivyDocument> = match self.index.writer(50_000_000) {
            Ok(writer) => writer,
            Err(e) => {
                // Check if this is a lock error
                let error_msg = format!("{}", e);
                if error_msg.contains("lock") || error_msg.contains("Lock") {
                    tracing::warn!(
                        "Index writer creation failed (possibly locked), checking for stale locks..."
                    );

                    // Try to cleanup stale locks
                    match Self::try_cleanup_stale_locks(&self.index_path) {
                        Ok(true) => {
                            // Stale locks were cleaned up, retry once
                            tracing::info!("Stale locks cleaned up, retrying writer creation...");
                            self.index.writer(50_000_000).context(
                                "Failed to create index writer after cleaning stale locks",
                            )?
                        }
                        Ok(false) => {
                            // Locks exist but are not stale (another process is actively using the index)
                            return Err(anyhow::anyhow!(
                                "BM25 index is currently being used by another process. Please wait and try again later."
                            ));
                        }
                        Err(cleanup_err) => {
                            // Failed to cleanup locks
                            return Err(anyhow::anyhow!(
                                "Failed to create index writer (locked) and failed to cleanup stale locks: {}. Original error: {}",
                                cleanup_err,
                                e
                            ));
                        }
                    }
                } else {
                    // Not a lock error, propagate original error
                    return Err(e).context("Failed to create index writer");
                }
            }
        };

        for (id, content, file_path) in documents {
            let doc = doc!(
                self.id_field => id,
                self.content_field => content,
                self.file_path_field => file_path,
            );
            index_writer
                .add_document(doc)
                .context("Failed to add document")?;
        }

        index_writer
            .commit()
            .context("Failed to commit documents")?;

        Ok(())
    }

    /// Search the index with BM25 scoring
    ///
    /// `query_text` is parsed leniently against the `content` field; parse
    /// problems are logged as warnings and whatever could be parsed is still
    /// executed.
    ///
    /// Returns at most `limit` hits, in the order produced by Tantivy's
    /// `TopDocs` collector. Hits whose stored `id` is missing or is not a
    /// `u64` are logged and skipped. A `limit` of 0 returns an empty vector
    /// without touching the index.
    ///
    /// # Errors
    ///
    /// Returns an error if the index reader cannot be created, the search
    /// itself fails, or a matching document cannot be loaded.
    pub fn search(&self, query_text: &str, limit: usize) -> Result<Vec<BM25Result>> {
        // `TopDocs::with_limit` panics on a limit of 0, so answer that case here.
        if limit == 0 {
            return Ok(Vec::new());
        }

        let reader = self
            .index
            .reader_builder()
            .reload_policy(ReloadPolicy::Manual)
            .try_into()
            .context("Failed to create index reader")?;

        let searcher = reader.searcher();

        // Parse query using lenient mode to handle special characters like :: in code
        // (e.g., "Tool::new" would fail strict parsing since : is a field separator)
        let query_parser = QueryParser::for_index(&self.index, vec![self.content_field]);
        let (query, errors) = query_parser.parse_query_lenient(query_text);
        if !errors.is_empty() {
            tracing::warn!(
                "BM25 query parse issues for {:?} (terms may have been dropped): {:?}",
                query_text,
                errors
            );
        }

        // Search with BM25
        let top_docs = searcher
            .search(&query, &TopDocs::with_limit(limit))
            .context("Failed to execute search")?;

        let mut results = Vec::with_capacity(top_docs.len());
        for (score, doc_address) in top_docs {
            let retrieved_doc: TantivyDocument = searcher
                .doc(doc_address)
                .context("Failed to retrieve document")?;

            match retrieved_doc
                .get_first(self.id_field)
                .and_then(|v| v.as_u64())
            {
                Some(id) => results.push(BM25Result { id, score }),
                None => tracing::warn!(
                    "BM25: document at {:?} is missing or has corrupt 'id' field — skipping",
                    doc_address
                ),
            }
        }

        Ok(results)
    }

    /// Remove every document whose `id` field equals `id`.
    ///
    /// Takes the in-process writer mutex, opens an `IndexWriter`, issues a
    /// delete for the matching term and commits it.
    ///
    /// # Errors
    ///
    /// Returns an error if the writer mutex is poisoned, the index writer
    /// cannot be created, or the commit fails.
    pub fn delete_by_id(&self, id: u64) -> Result<()> {
        // Only one writer may exist at a time within this process.
        let _writer_guard = match self.writer_lock.lock() {
            Ok(guard) => guard,
            Err(e) => return Err(anyhow::anyhow!("Failed to acquire writer lock: {}", e)),
        };

        let mut writer = self
            .index
            .writer::<TantivyDocument>(50_000_000)
            .context("Failed to create index writer")?;

        writer.delete_term(Term::from_field_u64(self.id_field, id));
        writer.commit().context("Failed to commit deletion")?;

        Ok(())
    }

    /// Delete all documents with a specific file_path
    ///
    /// This is used for incremental updates when files are deleted or modified.
    pub fn delete_by_file_path(&self, file_path: &str) -> Result<usize> {
        // Lock to ensure only one writer at a time
        let _guard = self
            .writer_lock
            .lock()
            .map_err(|e| anyhow::anyhow!("Failed to acquire writer lock: {}", e))?;

        let mut index_writer: IndexWriter<TantivyDocument> = self
            .index
            .writer(50_000_000)
            .context("Failed to create index writer")?;

        let term = Term::from_field_text(self.file_path_field, file_path);
        index_writer.delete_term(term);

        index_writer
            .commit()
            .context("Failed to commit file_path deletion")?;

        // Note: Tantivy doesn't return count of deleted documents
        // Return 0 as placeholder
        Ok(0)
    }

    /// Remove every document from the index.
    ///
    /// Takes the in-process writer mutex, opens an `IndexWriter`, deletes all
    /// documents and commits the result.
    ///
    /// # Errors
    ///
    /// Returns an error if the writer mutex is poisoned, the index writer
    /// cannot be created, the delete-all request fails, or the commit fails.
    pub fn clear(&self) -> Result<()> {
        // Only one writer may exist at a time within this process.
        let _writer_guard = match self.writer_lock.lock() {
            Ok(guard) => guard,
            Err(e) => return Err(anyhow::anyhow!("Failed to acquire writer lock: {}", e)),
        };

        let mut writer = self
            .index
            .writer::<TantivyDocument>(50_000_000)
            .context("Failed to create index writer")?;

        writer
            .delete_all_documents()
            .context("Failed to delete all documents")?;
        writer.commit().context("Failed to commit clear")?;

        Ok(())
    }

    /// Report statistics about the index.
    ///
    /// Opens a reader with a manual reload policy and returns the document
    /// count of its searcher as [`BM25Stats::total_documents`].
    ///
    /// # Errors
    ///
    /// Returns an error if the index reader cannot be created.
    pub fn get_stats(&self) -> Result<BM25Stats> {
        let reader = self
            .index
            .reader_builder()
            .reload_policy(ReloadPolicy::Manual)
            .try_into()
            .context("Failed to create index reader")?;

        let total_documents = reader.searcher().num_docs() as usize;

        Ok(BM25Stats { total_documents })
    }
}

/// Statistics about the BM25 index
///
/// Returned by [`BM25Search::get_stats`].
#[derive(Debug, Clone)]
pub struct BM25Stats {
    /// Total number of indexed documents.
    pub total_documents: usize,
}

/// Trait for custom search scoring/fusion strategies.
///
/// Implement this trait to replace the default Reciprocal Rank Fusion (RRF) with
/// your own fusion algorithm (e.g., weighted linear combination, learned fusion,
/// cross-encoder reranking).
///
/// Implementors must be `Send + Sync`.
///
/// # Example
///
/// ```rust,ignore
/// use brainwires_storage::bm25_search::{SearchScorer, BM25Result};
///
/// struct WeightedFusion { vector_weight: f32, keyword_weight: f32 }
///
/// impl SearchScorer for WeightedFusion {
///     fn fuse(
///         &self,
///         vector_results: Vec<(u64, f32)>,
///         bm25_results: Vec<BM25Result>,
///         limit: usize,
///     ) -> Vec<(u64, f32)> {
///         // Your custom fusion logic here
///         vec![]
///     }
/// }
/// ```
pub trait SearchScorer: Send + Sync {
    /// Combine vector search and BM25 keyword results into a single ranked list.
    ///
    /// - `vector_results`: (id, similarity_score) pairs from vector search, sorted by score desc
    /// - `bm25_results`: keyword search results with raw BM25 scores
    /// - `limit`: maximum number of combined results to return
    ///
    /// Returns (id, combined_score) pairs sorted by score descending.
    fn fuse(
        &self,
        vector_results: Vec<(u64, f32)>,
        bm25_results: Vec<BM25Result>,
        limit: usize,
    ) -> Vec<(u64, f32)>;
}

/// Standard RRF constant (60.0 is the commonly used value from the RRF paper)
///
/// Added to the 1-based rank in the denominator of every RRF term, i.e. an
/// item at 1-based rank `r` contributes `1 / (RRF_K_CONSTANT + r)`.
pub const RRF_K_CONSTANT: f32 = 60.0;

/// Default scorer using Reciprocal Rank Fusion (RRF).
///
/// The standard RRF approach from the paper, using k=60.
pub struct RrfScorer;

impl SearchScorer for RrfScorer {
    /// Fuse the two result lists by delegating to [`reciprocal_rank_fusion`],
    /// passing `limit` through as the maximum number of results.
    fn fuse(
        &self,
        vector_results: Vec<(u64, f32)>,
        bm25_results: Vec<BM25Result>,
        limit: usize,
    ) -> Vec<(u64, f32)> {
        reciprocal_rank_fusion(vector_results, bm25_results, limit)
    }
}

/// Reciprocal Rank Fusion (RRF) for combining vector and BM25 results
///
/// This is a convenience wrapper around `reciprocal_rank_fusion_generic` for the common case
/// of combining vector search results (u64 IDs) with BM25 results.
///
/// # Arguments
/// * `vector_results` - (id, score) pairs from vector search, best match first
/// * `bm25_results` - BM25 hits, best match first
/// * `limit` - Maximum results to return. This is *not* the RRF smoothing
///   constant; that one is fixed by [`RRF_K_CONSTANT`].
///
/// Only the position of each entry in its list matters; the original scores
/// are ignored by the fusion.
///
/// # Returns
/// Vec of (id, combined_rrf_score) sorted by score descending
pub fn reciprocal_rank_fusion(
    vector_results: Vec<(u64, f32)>,
    bm25_results: Vec<BM25Result>,
    limit: usize,
) -> Vec<(u64, f32)> {
    // Convert BM25 results to the same format as vector results
    let bm25_tuples: Vec<(u64, f32)> = bm25_results.into_iter().map(|r| (r.id, r.score)).collect();

    // Use the generic implementation
    reciprocal_rank_fusion_generic([vector_results, bm25_tuples], limit)
}

/// Generic Reciprocal Rank Fusion (RRF) for combining arbitrary ranked lists
///
/// This is a generic version that works with any type that implements Eq + Hash + Clone.
/// Useful for combining results from different search systems.
///
/// Each occurrence of an id at 0-based position `rank` in a list contributes
/// `1 / (RRF_K_CONSTANT + rank + 1)` to that id's combined score. The
/// original scores carried in the lists are ignored; only positions matter.
///
/// Ids with equal combined scores are ordered by first appearance across the
/// input (earlier lists first, then earlier positions), so both the ordering
/// and the set of ids surviving truncation are deterministic.
///
/// # Arguments
/// * `ranked_lists` - Iterator of ranked result lists, each containing (id, original_score)
/// * `limit` - Maximum results to return
///
/// # Returns
/// Vec of (id, combined_rrf_score) sorted by score descending
pub fn reciprocal_rank_fusion_generic<T, I, L>(ranked_lists: I, limit: usize) -> Vec<(T, f32)>
where
    T: Eq + std::hash::Hash + Clone,
    I: IntoIterator<Item = L>,
    L: IntoIterator<Item = (T, f32)>,
{
    // id -> (accumulated RRF score, index of first appearance)
    let mut score_map: HashMap<T, (f32, usize)> = HashMap::new();

    for list in ranked_lists {
        for (rank, (id, _score)) in list.into_iter().enumerate() {
            let rrf_score = 1.0 / (RRF_K_CONSTANT + (rank + 1) as f32);
            // The map grows by one per new id, so its length before insertion
            // is a unique, increasing "first seen" stamp.
            let first_seen = score_map.len();
            score_map.entry(id).or_insert((0.0, first_seen)).0 += rrf_score;
        }
    }

    let mut combined: Vec<(T, f32, usize)> = score_map
        .into_iter()
        .map(|(id, (score, first_seen))| (id, score, first_seen))
        .collect();
    // Score descending, then first appearance ascending. The stamp is unique,
    // so the order is total and does not depend on HashMap iteration order.
    combined.sort_unstable_by(|a, b| b.1.total_cmp(&a.1).then(a.2.cmp(&b.2)));
    combined.truncate(limit);

    combined
        .into_iter()
        .map(|(id, score, _)| (id, score))
        .collect()
}

/// Unit tests for the RRF fusion helpers and for `BM25Search` against
/// throw-away indexes created in temporary directories.
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    // ── reciprocal_rank_fusion ────────────────────────────────────────────

    #[test]
    fn rrf_empty_inputs_returns_empty() {
        let result = reciprocal_rank_fusion(vec![], vec![], 10);
        assert!(result.is_empty());
    }

    #[test]
    fn rrf_vector_only_result_ranked_first_gets_highest_score() {
        let vector_results = vec![(1u64, 0.9), (2u64, 0.8), (3u64, 0.7)];
        let result = reciprocal_rank_fusion(vector_results, vec![], 3);
        // With a single input list every score is distinct, so the fused
        // order must match the input order exactly.
        let ids: Vec<u64> = result.iter().map(|(id, _)| *id).collect();
        assert_eq!(ids, vec![1, 2, 3]);
        // Id 1 was rank 0 -> highest RRF score
        let id1_score = result.iter().find(|(id, _)| *id == 1).unwrap().1;
        let id2_score = result.iter().find(|(id, _)| *id == 2).unwrap().1;
        assert!(id1_score > id2_score);
    }

    #[test]
    fn rrf_limit_caps_result_count() {
        let vector_results = vec![(1u64, 1.0), (2u64, 0.9), (3u64, 0.8), (4u64, 0.7)];
        let result = reciprocal_rank_fusion(vector_results, vec![], 2);
        assert_eq!(result.len(), 2);
    }

    #[test]
    fn rrf_item_in_both_lists_ranks_higher() {
        // Item 10 appears in both vector and bm25 results
        let vector_results = vec![(10u64, 0.9), (20u64, 0.8)];
        let bm25_results = vec![
            BM25Result { id: 10, score: 0.9 },
            BM25Result { id: 30, score: 0.7 },
        ];
        let result = reciprocal_rank_fusion(vector_results, bm25_results, 10);
        // Item 10 should have higher combined score than items in only one list
        let score_10 = result.iter().find(|(id, _)| *id == 10).unwrap().1;
        let score_20 = result.iter().find(|(id, _)| *id == 20).unwrap().1;
        let score_30 = result.iter().find(|(id, _)| *id == 30).unwrap().1;
        assert!(
            score_10 > score_20,
            "item in both lists should beat vector-only"
        );
        assert!(
            score_10 > score_30,
            "item in both lists should beat bm25-only"
        );
        // ...and therefore come out on top of the fused ranking.
        assert_eq!(result[0].0, 10);
    }

    #[test]
    fn rrf_generic_string_ids_work() {
        let list1 = vec![("a".to_string(), 1.0f32), ("b".to_string(), 0.5)];
        let list2 = vec![("b".to_string(), 1.0f32), ("c".to_string(), 0.5)];
        let result = reciprocal_rank_fusion_generic([list1, list2], 10);
        // "b" appears in both, should have higher score
        let score_b = result.iter().find(|(id, _)| id == "b").unwrap().1;
        let score_a = result.iter().find(|(id, _)| id == "a").unwrap().1;
        let score_c = result.iter().find(|(id, _)| id == "c").unwrap().1;
        assert!(score_b > score_a);
        assert!(score_b > score_c);
    }

    #[test]
    fn rrf_k_constant_is_60() {
        assert_eq!(RRF_K_CONSTANT, 60.0);
    }

    #[test]
    fn rrf_score_for_rank_zero_is_one_over_61() {
        // At rank 0 (first item): 1 / (60 + 1) = 1/61
        let vector_results = vec![(42u64, 1.0)];
        let result = reciprocal_rank_fusion(vector_results, vec![], 1);
        let score = result[0].1;
        let expected = 1.0 / 61.0f32;
        assert!(
            (score - expected).abs() < 1e-6,
            "score={score}, expected={expected}"
        );
    }

    // ── BM25Search ────────────────────────────────────────────────────────

    #[test]
    fn bm25search_creates_index_in_temp_dir() {
        let dir = TempDir::new().unwrap();
        let search = BM25Search::new(dir.path()).unwrap();
        let stats = search.get_stats().unwrap();
        assert_eq!(stats.total_documents, 0);
    }

    #[test]
    fn bm25search_add_and_count_documents() {
        let dir = TempDir::new().unwrap();
        let search = BM25Search::new(dir.path()).unwrap();
        search
            .add_documents(vec![
                (1, "the quick brown fox".to_string(), "file1.rs".to_string()),
                (
                    2,
                    "jumps over the lazy dog".to_string(),
                    "file2.rs".to_string(),
                ),
            ])
            .unwrap();
        let stats = search.get_stats().unwrap();
        assert_eq!(stats.total_documents, 2);
    }

    #[test]
    fn bm25search_returns_relevant_results() {
        let dir = TempDir::new().unwrap();
        let search = BM25Search::new(dir.path()).unwrap();
        search
            .add_documents(vec![
                (
                    1,
                    "authentication login user password".to_string(),
                    "auth.rs".to_string(),
                ),
                (
                    2,
                    "database storage connection pool".to_string(),
                    "db.rs".to_string(),
                ),
                (
                    3,
                    "authentication oauth token".to_string(),
                    "oauth.rs".to_string(),
                ),
            ])
            .unwrap();

        let results = search.search("authentication", 10).unwrap();
        assert!(
            !results.is_empty(),
            "should find results for 'authentication'"
        );
        // All results should have positive score
        for r in &results {
            assert!(r.score > 0.0);
        }
        // Should find docs 1 and 3 but not 2
        let ids: Vec<u64> = results.iter().map(|r| r.id).collect();
        assert!(ids.contains(&1), "doc 1 mentions 'authentication'");
        assert!(ids.contains(&3), "doc 3 mentions 'authentication'");
        assert!(!ids.contains(&2), "doc 2 does not mention 'authentication'");
    }

    #[test]
    fn bm25search_search_returns_empty_for_unknown_term() {
        let dir = TempDir::new().unwrap();
        let search = BM25Search::new(dir.path()).unwrap();
        search
            .add_documents(vec![(1, "some content".to_string(), "f.rs".to_string())])
            .unwrap();
        let results = search.search("xyzabsolutelynotinindex", 10).unwrap();
        assert!(results.is_empty());
    }

    #[test]
    fn bm25search_clear_removes_all_documents() {
        let dir = TempDir::new().unwrap();
        let search = BM25Search::new(dir.path()).unwrap();
        search
            .add_documents(vec![(1, "content".to_string(), "f.rs".to_string())])
            .unwrap();
        search.clear().unwrap();
        let stats = search.get_stats().unwrap();
        assert_eq!(stats.total_documents, 0);
    }

    #[test]
    fn bm25search_delete_by_id() {
        let dir = TempDir::new().unwrap();
        let search = BM25Search::new(dir.path()).unwrap();
        search
            .add_documents(vec![
                (1, "hello world".to_string(), "a.rs".to_string()),
                (2, "goodbye world".to_string(), "b.rs".to_string()),
            ])
            .unwrap();
        search.delete_by_id(1).unwrap();
        // After deletion, searching for doc 1's unique term should not return id 1
        let results = search.search("hello", 10).unwrap();
        let ids: Vec<u64> = results.iter().map(|r| r.id).collect();
        assert!(!ids.contains(&1), "id 1 should be deleted");
        // The other document must still be there.
        let stats = search.get_stats().unwrap();
        assert_eq!(stats.total_documents, 1);
    }

    #[test]
    fn bm25search_reopen_existing_index() {
        let dir = TempDir::new().unwrap();
        // Create and index
        {
            let search = BM25Search::new(dir.path()).unwrap();
            search
                .add_documents(vec![(
                    1,
                    "persistent content".to_string(),
                    "p.rs".to_string(),
                )])
                .unwrap();
        }
        // Reopen and verify docs persist
        let search2 = BM25Search::new(dir.path()).unwrap();
        let stats = search2.get_stats().unwrap();
        assert_eq!(stats.total_documents, 1);
    }
}