greppy/index/
writer.rs

1use crate::core::error::{Error, Result};
2use crate::index::schema::IndexSchema;
3use crate::index::tantivy_index::TantivyIndex;
4use crate::parse::Chunk;
5use tantivy::{doc, IndexWriter as TantivyWriter, Term};
6
/// Memory budget, in bytes, requested from Tantivy when a writer is created.
///
/// 50 MB (decimal); this value is passed unchanged to the index's `writer(..)`
/// call in `IndexWriter::new`.
const WRITER_HEAP_SIZE: usize = 50 * 1_000_000;
10
/// Thin wrapper that pairs a Tantivy index writer with the schema field
/// handles needed to build documents and delete terms for it.
pub struct IndexWriter {
    /// Underlying Tantivy writer; every add, delete and commit goes through it.
    writer: TantivyWriter,
    /// Copy of the index schema, used to look up the fields (`id`, `path`,
    /// `content`, ...) when constructing documents and terms.
    schema: IndexSchema,
}
15
16impl IndexWriter {
17    /// Create a new index writer from an existing index
18    pub fn new(index: &TantivyIndex) -> Result<Self> {
19        let writer = index
20            .index
21            .writer(WRITER_HEAP_SIZE)
22            .map_err(|e| Error::IndexError {
23                message: e.to_string(),
24            })?;
25
26        Ok(Self {
27            writer,
28            schema: index.schema.clone(),
29        })
30    }
31
32    /// Add a chunk to the index
33    pub fn add_chunk(&mut self, chunk: &Chunk) -> Result<()> {
34        let doc = doc!(
35            self.schema.id => chunk.id(),
36            self.schema.path => chunk.path.clone(),
37            self.schema.content => chunk.content.clone(),
38            self.schema.symbol_name => chunk.symbol_name.clone().unwrap_or_default(),
39            self.schema.symbol_type => chunk.symbol_type.clone().unwrap_or_default(),
40            self.schema.start_line => chunk.start_line as u64,
41            self.schema.end_line => chunk.end_line as u64,
42            self.schema.language => chunk.language.clone(),
43            self.schema.file_hash => chunk.file_hash.clone()
44        );
45
46        self.writer.add_document(doc)?;
47        Ok(())
48    }
49
50    /// Delete all chunks for a given file path
51    ///
52    /// Used for incremental updates - delete old chunks before re-indexing.
53    /// This is O(1) in Tantivy - it marks documents as deleted without scanning.
54    #[inline]
55    pub fn delete_by_path(&mut self, path: &str) -> Result<()> {
56        let term = Term::from_field_text(self.schema.path, path);
57        self.writer.delete_term(term);
58        Ok(())
59    }
60
61    /// Commit changes and return a new writer
62    ///
63    /// This is used for periodic commits during large indexing operations
64    /// to prevent unbounded memory growth in Tantivy's internal buffers.
65    /// After commit, the old writer is consumed and a fresh one is returned.
66    pub fn commit_and_reopen(mut self, index: &TantivyIndex) -> Result<Self> {
67        self.writer.commit().map_err(|e| Error::IndexError {
68            message: e.to_string(),
69        })?;
70        // Drop old writer, create fresh one
71        drop(self.writer);
72        Self::new(index)
73    }
74
75    /// Commit changes (final commit, consumes writer)
76    pub fn commit(mut self) -> Result<()> {
77        self.writer.commit().map_err(|e| Error::IndexError {
78            message: e.to_string(),
79        })?;
80        Ok(())
81    }
82}