pleme-codesearch 0.1.142

Fast, local semantic code search powered by Rust — BM25, vector embeddings, tree-sitter AST
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
//! Tantivy-based full-text search store
//!
//! Provides BM25 full-text search for hybrid search with RRF fusion.
//!
//! # Architecture Note
//! Always use `FtsStore::new()` which opens in R/W mode. This ensures only one
//! connection type exists, avoiding Windows file locking issues between readers
//! and writers. The writer is lazy-initialized on first write operation.

use anyhow::{anyhow, Result};
use std::path::Path;
use tantivy::{
    collector::TopDocs,
    directory::MmapDirectory,
    merge_policy::NoMergePolicy,
    query::QueryParser,
    schema::{Field, NumericOptions, Schema, Value, STORED, STRING, TEXT},
    Index, IndexReader, IndexSettings, IndexWriter, TantivyDocument, Term,
};

use crate::chunker::ChunkKind;

/// Result from FTS search
#[derive(Debug, Clone)]
pub struct FtsResult {
    /// Chunk ID that matches
    pub chunk_id: u32,
    /// BM25 score from Tantivy
    pub score: f32,
}

/// Full-text search store using Tantivy
///
/// Single connection type that supports both read and write operations.
/// Writer is lazy-initialized on first write to avoid unnecessary locks.
pub struct FtsStore {
    index: Index,
    reader: IndexReader,
    writer: Option<IndexWriter>,
    #[allow(dead_code)]
    schema: Schema,
    // Field handles
    chunk_id_field: Field,
    content_field: Field,
    path_field: Field,
    signature_field: Field,
    kind_field: Field,
}

impl FtsStore {
    /// Create or open an FTS index at the given path.
    ///
    /// Opens in a mode that supports both reading and writing.
    /// Writer is lazy-initialized on first write operation.
    pub fn new(db_path: &Path) -> Result<Self> {
        let fts_path = db_path.join("fts");
        std::fs::create_dir_all(&fts_path)?;

        // Build schema
        let mut schema_builder = Schema::builder();

        // Chunk ID - stored and indexed for retrieval and deletion
        let chunk_id_field = schema_builder.add_u64_field(
            "chunk_id",
            NumericOptions::default().set_indexed().set_stored(),
        );

        // Content - full text indexed for BM25 search
        let content_field = schema_builder.add_text_field("content", TEXT);

        // Path - stored and string indexed for filtering
        let path_field = schema_builder.add_text_field("path", STRING | STORED);

        // Signature - indexed for function/method name search
        let signature_field = schema_builder.add_text_field("signature", TEXT);

        // Kind - stored for filtering (function, class, etc)
        let kind_field = schema_builder.add_text_field("kind", STRING | STORED);

        let schema = schema_builder.build();

        // Open or create index with retry logic for Windows file locking
        let index = Self::open_or_create_index_with_retry(&fts_path, &schema)?;

        // Create reader for searching
        let reader = index.reader()?;

        Ok(Self {
            index,
            reader,
            writer: None, // Lazy-initialized on first write
            schema,
            chunk_id_field,
            content_field,
            path_field,
            signature_field,
            kind_field,
        })
    }

    /// Create or open an FTS index with writer ready for indexing.
    ///
    /// Use this when you know you'll be writing immediately (e.g., during indexing).
    /// For search-only or mixed workloads, use `new()` instead.
    pub fn new_with_writer(db_path: &Path) -> Result<Self> {
        let mut store = Self::new(db_path)?;
        store.ensure_writer()?;
        Ok(store)
    }

    /// Open or create index with retry logic for Windows file locking issues
    fn open_or_create_index_with_retry(fts_path: &Path, schema: &Schema) -> Result<Index> {
        let max_retries = 3;
        let mut last_error: Option<String> = None;

        for attempt in 0..max_retries {
            if attempt > 0 {
                // Wait before retry (exponential backoff)
                std::thread::sleep(std::time::Duration::from_millis(100 * (1 << attempt)));
            }

            let result: Result<Index, _> = if fts_path.join("meta.json").exists() {
                Index::open_in_dir(fts_path).map_err(|e| e.to_string())
            } else {
                MmapDirectory::open(fts_path)
                    .map_err(|e| e.to_string())
                    .and_then(|dir| {
                        Index::create(dir, schema.clone(), IndexSettings::default())
                            .map_err(|e| e.to_string())
                    })
            };

            match result {
                Ok(index) => return Ok(index),
                Err(e) => {
                    last_error = Some(e);
                    // On Windows, try to clear lock files if permission denied
                    if attempt < max_retries - 1 {
                        Self::try_clear_lock_files(fts_path);
                    }
                }
            }
        }

        Err(anyhow!(
            "Failed to open FTS index after {} retries: {}",
            max_retries,
            last_error.unwrap_or_default()
        ))
    }

    /// Create writer with retry logic for Windows file locking issues
    /// Increased retry count and initial wait to handle slow file handle release
    fn create_writer_with_retry(index: &Index) -> Result<IndexWriter> {
        let max_retries = 5; // Increased from 3 to handle Windows timing issues
        let mut last_error: Option<String> = None;

        for attempt in 0..max_retries {
            if attempt > 0 {
                // Wait before retry (exponential backoff)
                // Increased initial wait from 100ms to 200ms for better Windows compatibility
                std::thread::sleep(std::time::Duration::from_millis(200 * (1 << attempt)));
            }

            // 50MB writer heap (tantivy default).
            //
            // CRITICAL: Set NoMergePolicy to prevent tantivy from spawning background
            // merge threads. On Windows, these threads encounter I/O errors (antivirus
            // interference, file locking on mmap'd segment files) which panic the merge
            // thread and kill the IndexWriter — causing the intermittent
            // "An index writer was killed" error (~1/5 indexing runs).
            //
            // With NoMergePolicy, all segment management is explicit: we accumulate
            // segments during indexing and they're consolidated at commit points.
            // This trades slightly more segments for 100% reliability.
            match index.writer(50_000_000) {
                Ok(writer) => {
                    writer.set_merge_policy(Box::new(NoMergePolicy));
                    return Ok(writer);
                }
                Err(e) => {
                    last_error = Some(e.to_string());
                }
            }
        }

        Err(anyhow!(
            "Failed to create FTS writer after {} retries: {}",
            max_retries,
            last_error.unwrap_or_default()
        ))
    }

    /// Try to clear stale lock files on Windows
    fn try_clear_lock_files(fts_path: &Path) {
        // Try to remove stale lock files
        let lock_files = [".tantivy-writer.lock", ".tantivy-meta.lock"];
        for lock_file in &lock_files {
            let lock_path = fts_path.join(lock_file);
            if lock_path.exists() {
                let _ = std::fs::remove_file(&lock_path);
            }
        }
    }

    /// Ensure writer is initialized for indexing
    fn ensure_writer(&mut self) -> Result<()> {
        if self.writer.is_none() {
            // Use retry logic for Windows file locking issues
            let writer = Self::create_writer_with_retry(&self.index)?;
            self.writer = Some(writer);
        }
        Ok(())
    }

    /// Add a chunk to the FTS index
    ///
    /// Includes writer recovery: if the writer was killed (e.g., by a background
    /// merge thread panic), it will be recreated and the operation retried once.
    pub fn add_chunk(
        &mut self,
        chunk_id: u32,
        content: &str,
        path: &str,
        signature: Option<&str>,
        kind: &str,
    ) -> Result<()> {
        self.ensure_writer()?;

        // Copy field handles before mutable borrow
        let chunk_id_field = self.chunk_id_field;
        let content_field = self.content_field;
        let path_field = self.path_field;
        let signature_field = self.signature_field;
        let kind_field = self.kind_field;

        let mut doc = TantivyDocument::new();
        doc.add_u64(chunk_id_field, chunk_id as u64);
        doc.add_text(content_field, content);
        doc.add_text(path_field, path);
        doc.add_text(kind_field, kind);
        if let Some(sig) = signature {
            doc.add_text(signature_field, sig);
        }

        let writer = self.writer.as_mut().unwrap();
        match writer.add_document(doc) {
            Ok(_) => Ok(()),
            Err(e) => {
                let error_str = e.to_string();
                if error_str.contains("writer was killed")
                    || error_str.contains("index writer was killed")
                {
                    tracing::debug!(
                        "FTS writer was killed, recreating and retrying add_chunk for chunk {}",
                        chunk_id
                    );

                    // Drop the dead writer and recreate
                    self.writer = None;
                    self.ensure_writer()?;

                    // Rebuild the document for retry
                    let mut retry_doc = TantivyDocument::new();
                    retry_doc.add_u64(chunk_id_field, chunk_id as u64);
                    retry_doc.add_text(content_field, content);
                    retry_doc.add_text(path_field, path);
                    retry_doc.add_text(kind_field, kind);
                    if let Some(sig) = signature {
                        retry_doc.add_text(signature_field, sig);
                    }

                    let writer = self.writer.as_mut().unwrap();
                    writer.add_document(retry_doc).map_err(|e| {
                        anyhow!("FTS add_document failed after writer recovery: {}", e)
                    })?;
                    Ok(())
                } else {
                    Err(anyhow!("FTS add_document failed: {}", error_str))
                }
            }
        }
    }

    /// Delete a chunk by ID
    pub fn delete_chunk(&mut self, chunk_id: u32) -> Result<()> {
        self.ensure_writer()?;
        let chunk_id_field = self.chunk_id_field;
        let writer = self.writer.as_mut().unwrap();
        let term = Term::from_field_u64(chunk_id_field, chunk_id as u64);
        writer.delete_term(term);
        Ok(())
    }

    /// Delete all chunks for a file path
    #[allow(dead_code)] // Reserved for file-level deletion
    pub fn delete_by_path(&mut self, path: &str) -> Result<()> {
        self.ensure_writer()?;
        let path_field = self.path_field;
        let writer = self.writer.as_mut().unwrap();
        let term = Term::from_field_text(path_field, path);
        writer.delete_term(term);
        Ok(())
    }

    /// Commit pending changes with retry logic for Windows file locking.
    ///
    /// If the writer was killed (background merge panic), it is recreated.
    /// Data since the last successful commit will be lost in that case, but
    /// indexing can continue rather than aborting entirely.
    pub fn commit(&mut self) -> Result<()> {
        if self.writer.is_none() {
            return Ok(());
        }

        let max_retries = 5;
        let mut last_error: Option<String> = None;

        for attempt in 0..max_retries {
            if attempt > 0 {
                // Wait before retry (exponential backoff: 100ms, 200ms, 400ms, 800ms)
                std::thread::sleep(std::time::Duration::from_millis(100 * (1 << attempt)));
            }

            let writer = self.writer.as_mut().unwrap();
            match writer.commit() {
                Ok(_) => {
                    // Reload reader to see changes
                    if let Err(e) = self.reader.reload() {
                        // Non-fatal: reader will eventually catch up
                        tracing::debug!("Reader reload warning: {}", e);
                    }
                    return Ok(());
                }
                Err(e) => {
                    let error_str = e.to_string();
                    last_error = Some(error_str.clone());

                    // Writer was killed by background thread panic — recreate it
                    if error_str.contains("writer was killed")
                        || error_str.contains("index writer was killed")
                    {
                        tracing::debug!(
                            "FTS writer was killed during commit (attempt {}/{}). \
                             Recreating writer. Data since last commit may be lost.",
                            attempt + 1,
                            max_retries
                        );
                        self.writer = None;
                        self.ensure_writer()?;
                        // After recreating, the pending data is gone, so commit
                        // the new (empty) writer to ensure a clean state
                        if let Some(ref mut w) = self.writer {
                            w.commit()
                                .map_err(|e| anyhow!("FTS commit after recovery failed: {}", e))?;
                        }
                        if let Err(e) = self.reader.reload() {
                            tracing::debug!("Reader reload warning: {}", e);
                        }
                        return Ok(());
                    }

                    // File locking error — retry with backoff
                    if error_str.contains("Access is denied")
                        || error_str.contains("PermissionDenied")
                        || error_str.contains("IoError")
                    {
                        tracing::debug!(
                            "FTS commit retry {}/{}: {}",
                            attempt + 1,
                            max_retries,
                            error_str
                        );
                        // Continue to retry
                    } else {
                        // Non-recoverable error, fail immediately
                        return Err(anyhow!("FTS commit failed: {}", error_str));
                    }
                }
            }
        }

        // All retries exhausted
        Err(anyhow!(
            "FTS commit failed after {} retries: {}",
            max_retries,
            last_error.unwrap_or_default()
        ))
    }

    /// Search using BM25
    ///
    /// If `target_kind` is provided, boosts results matching that ChunkKind (e.g., "class", "function").
    pub fn search(
        &self,
        query: &str,
        limit: usize,
        target_kind: Option<ChunkKind>,
    ) -> Result<Vec<FtsResult>> {
        let searcher = self.reader.searcher();

        // Parse query against content, signature, and kind fields
        let mut query_parser = QueryParser::for_index(
            &self.index,
            vec![self.content_field, self.signature_field, self.kind_field],
        );

        // Boost signature field for better matching of function names, class names, etc.
        query_parser.set_field_boost(self.signature_field, 2.0);

        // Boost kind field when structural intent is detected
        if let Some(ref _kind) = target_kind {
            query_parser.set_field_boost(self.kind_field, 3.0); // High boost for kind field
        }

        // Parse query, fall back to match-all on error
        let parsed_query = match query_parser.parse_query(query) {
            Ok(q) => q,
            Err(_) => {
                // Try escaping special characters
                let escaped = query.replace(
                    [
                        ':', '(', ')', '[', ']', '{', '}', '^', '"', '~', '*', '?', '\\', '/',
                    ],
                    " ",
                );
                query_parser.parse_query(&escaped)?
            }
        };

        // Execute search
        let top_docs = searcher.search(&parsed_query, &TopDocs::with_limit(limit))?;

        // Convert to results
        let mut results = Vec::with_capacity(top_docs.len());
        for (score, doc_address) in top_docs {
            let doc: TantivyDocument = searcher.doc(doc_address)?;

            if let Some(chunk_id) = doc.get_first(self.chunk_id_field) {
                if let Some(id) = chunk_id.as_u64() {
                    results.push(FtsResult {
                        chunk_id: id as u32,
                        score,
                    });
                }
            }
        }

        Ok(results)
    }

    /// Search for exact identifier matches (boosted)
    ///
    /// Used for improving exact name matching (e.g., "BaseRestClient", "UserService").
    ///
    /// If `target_kind` is provided, uses selective boosting:
    /// - When both identifier AND kind are present, applies MUST constraint: items must match
    ///   the identifier in the signature field AND the kind (prevents boosting ALL items of that kind)
    /// - Otherwise, uses standard boost on the kind field
    pub fn search_exact(
        &self,
        identifier: &str,
        limit: usize,
        target_kind: Option<ChunkKind>,
    ) -> Result<Vec<FtsResult>> {
        use tantivy::query::{BooleanQuery, BoostQuery, Occur, TermQuery};
        use tantivy::schema::IndexRecordOption;

        let searcher = self.reader.searcher();

        // Search signature field with exact term
        let term = Term::from_field_text(self.signature_field, identifier);
        let term_query = TermQuery::new(term, IndexRecordOption::Basic);

        // Also search content field for the identifier as a phrase
        let content_term = Term::from_field_text(self.content_field, identifier);
        let content_query = TermQuery::new(content_term, IndexRecordOption::Basic);

        // Boost signature matches 3x over content matches
        let boosted_sig = BoostQuery::new(Box::new(term_query), 3.0);

        // Build query based on whether we have both identifier and kind
        let combined = if let Some(ref kind) = target_kind {
            // SELECTIVE MODE: When both identifier AND kind are detected,
            // create a MUST query that requires BOTH conditions.
            // This prevents boosting ALL items of that kind (e.g., all enums when searching for "ChunkKind enum")
            let kind_str = format!("{:?}", kind);
            let kind_term = Term::from_field_text(self.kind_field, &kind_str);
            let kind_query = TermQuery::new(kind_term, IndexRecordOption::Basic);

            // Combine: (signature OR content) AND kind
            // This means: only items that match the identifier AND are of the target kind get boosted
            let sig_or_content =
                BooleanQuery::union(vec![Box::new(boosted_sig), Box::new(content_query)]);
            let mut and_queries: Vec<(Occur, Box<dyn tantivy::query::Query>)> = vec![];
            and_queries.push((Occur::Must, Box::new(sig_or_content)));
            and_queries.push((Occur::Must, Box::new(kind_query)));
            BooleanQuery::new(and_queries)
        } else {
            // STANDARD MODE: Just search for the identifier in signature and content
            BooleanQuery::union(vec![Box::new(boosted_sig), Box::new(content_query)])
        };

        let top_docs = searcher.search(&combined, &TopDocs::with_limit(limit))?;

        // Convert to results
        let mut results = Vec::with_capacity(top_docs.len());
        for (score, doc_address) in top_docs {
            let doc: TantivyDocument = searcher.doc(doc_address)?;

            if let Some(chunk_id) = doc.get_first(self.chunk_id_field) {
                if let Some(id) = chunk_id.as_u64() {
                    results.push(FtsResult {
                        chunk_id: id as u32,
                        score,
                    });
                }
            }
        }

        Ok(results)
    }

    /// Get statistics about the index
    pub fn stats(&self) -> Result<FtsStats> {
        let searcher = self.reader.searcher();
        let num_docs = searcher.num_docs() as usize;

        Ok(FtsStats {
            num_documents: num_docs,
        })
    }

    /// Clear the entire index
    #[allow(dead_code)] // Reserved for index reset
    pub fn clear(&mut self) -> Result<()> {
        self.ensure_writer()?;
        let writer = self.writer.as_mut().unwrap();
        writer.delete_all_documents()?;
        writer.commit()?;
        self.reader.reload()?;
        Ok(())
    }
}

/// Statistics about the FTS index
#[derive(Debug, Clone)]
#[allow(dead_code)] // Part of public API for debugging/monitoring
pub struct FtsStats {
    #[allow(dead_code)] // Part of public API for debugging/monitoring
    pub num_documents: usize,
}

#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::tempdir;

    #[test]
    fn test_fts_basic() -> Result<()> {
        let dir = tempdir()?;
        let db_path = dir.path().to_path_buf();

        let mut store = FtsStore::new(&db_path)?;

        // Add some chunks
        store.add_chunk(
            1,
            "fn hello_world() { println!(\"Hello!\"); }",
            "src/main.rs",
            Some("hello_world"),
            "function",
        )?;
        store.add_chunk(
            2,
            "struct UserConfig { name: String, age: u32 }",
            "src/config.rs",
            Some("UserConfig"),
            "struct",
        )?;
        store.add_chunk(
            3,
            "fn process_data(data: Vec<u8>) -> Result<()>",
            "src/processor.rs",
            Some("process_data"),
            "function",
        )?;

        store.commit()?;

        // Search for hello
        let results = store.search("hello", 10, None)?;
        assert!(!results.is_empty());
        assert_eq!(results[0].chunk_id, 1);

        // Search for UserConfig
        let results = store.search("UserConfig", 10, None)?;
        assert!(!results.is_empty());
        assert_eq!(results[0].chunk_id, 2);

        // Search for process
        let results = store.search("process data", 10, None)?;
        assert!(!results.is_empty());
        assert_eq!(results[0].chunk_id, 3);

        Ok(())
    }

    #[test]
    fn test_fts_delete() -> Result<()> {
        let dir = tempdir()?;
        let db_path = dir.path().to_path_buf();

        let mut store = FtsStore::new(&db_path)?;

        store.add_chunk(1, "test content one", "file1.rs", None, "block")?;
        store.add_chunk(2, "test content two", "file2.rs", None, "block")?;
        store.commit()?;

        // Should find both
        let results = store.search("test content", 10, None)?;
        assert_eq!(results.len(), 2);

        // Delete one
        store.delete_chunk(1)?;
        store.commit()?;

        // Should find only one
        let results = store.search("test content", 10, None)?;
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].chunk_id, 2);

        Ok(())
    }
}