reflex/
content_store.rs

1//! Content store for memory-mapped file access
2//!
3//! This module stores the full contents of all indexed files in a single
4//! memory-mapped file. This enables zero-copy access to file contents for:
5//! - Verifying trigram matches
6//! - Extracting context around matches
7//! - Fast content retrieval without disk I/O
8//!
9//! # Binary Format (content.bin)
10//!
11//! ```text
12//! Header (32 bytes):
13//!   magic: "RFCT" (4 bytes)
14//!   version: 1 (u32)
15//!   num_files: N (u64)
//!   index_offset: absolute byte offset of the file index, measured from the start of the file (u64)
17//!   reserved: 8 bytes
18//!
19//! File Contents (variable):
20//!   [Concatenated file contents]
21//!
22//! File Index (at index_offset):
23//!   For each file:
24//!     path_len: u32
25//!     path: UTF-8 string
//!     offset: u64 (byte offset of the file's content, relative to the end of the 32-byte header)
27//!     length: u64 (file size in bytes)
28//! ```
29
30use anyhow::{Context, Result};
31use memmap2::Mmap;
32use std::fs::{File, OpenOptions};
33use std::io::Write;
34use std::path::{Path, PathBuf};
35
/// Magic bytes at the very start of content.bin; the reader rejects files that do not begin with them.
const MAGIC: &[u8; 4] = b"RFCT";
/// Format version stored in the header; the reader rejects any other value.
const VERSION: u32 = 1;
/// Size in bytes of the fixed header that precedes the concatenated file contents.
const HEADER_SIZE: usize = 32; // 4 (magic) + 4 (version) + 8 (num_files) + 8 (index_offset) + 8 (reserved)
39
/// Metadata for a file in the content store
///
/// One entry is recorded per added file and serialized into the file index
/// at the end of content.bin.
#[derive(Debug, Clone)]
pub struct FileEntry {
    /// File path
    pub path: PathBuf,
    /// Byte offset where this file's content starts, relative to the start of
    /// the content section (the first byte after the header). The reader adds
    /// `HEADER_SIZE` to turn it into a position inside content.bin.
    pub offset: u64,
    /// Length of this file's content in bytes
    pub length: u64,
}
50
/// Writer for building content.bin
///
/// Supports two modes:
/// 1. **Streaming mode** (init() called): Writes file contents to disk incrementally to avoid RAM buildup
/// 2. **In-memory mode** (default): Accumulates content in RAM for backward compatibility with tests
pub struct ContentWriter {
    // Index entries for every file added so far, in insertion order (the position is the file_id)
    files: Vec<FileEntry>,
    // Buffered handle to the output file; `Some` only between init() and finalization
    writer: Option<std::io::BufWriter<File>>,
    // Number of content bytes handed to `writer` so far (streaming mode only); excludes the header
    current_offset: u64,
    // Path passed to init(); stays set after finalization, so it marks "streaming mode was used"
    file_path: Option<PathBuf>,
    // In-memory content buffer (only used if streaming mode not enabled)
    content: Vec<u8>,
}
64
65impl ContentWriter {
66    /// Create a new content writer (in-memory mode by default)
67    ///
68    /// Call init() to enable streaming mode before adding files.
69    pub fn new() -> Self {
70        Self {
71            files: Vec::new(),
72            writer: None,
73            current_offset: 0,
74            file_path: None,
75            content: Vec::new(),
76        }
77    }
78
79    /// Initialize the writer by creating the output file and writing header placeholder
80    pub fn init(&mut self, path: PathBuf) -> Result<()> {
81        let file = OpenOptions::new()
82            .create(true)
83            .write(true)
84            .truncate(true)
85            .open(&path)
86            .with_context(|| format!("Failed to create {}", path.display()))?;
87
88        // Use a large buffer (16MB) for better write performance
89        let mut writer = std::io::BufWriter::with_capacity(16 * 1024 * 1024, file);
90
91        // Write placeholder header (will be overwritten in finalize())
92        writer.write_all(MAGIC)?;
93        writer.write_all(&VERSION.to_le_bytes())?;
94        writer.write_all(&0u64.to_le_bytes())?; // num_files (placeholder)
95        writer.write_all(&0u64.to_le_bytes())?; // index_offset (placeholder)
96        writer.write_all(&[0u8; 8])?; // reserved
97
98        self.writer = Some(writer);
99        self.current_offset = 0; // Content starts after header
100        self.file_path = Some(path);
101
102        Ok(())
103    }
104
105    /// Add a file to the content store
106    ///
107    /// **Streaming mode** (if init() was called): Writes content to disk immediately.
108    /// **In-memory mode** (default): Accumulates content in RAM.
109    ///
110    /// Returns the file_id (index into files array)
111    pub fn add_file(&mut self, path: PathBuf, content: &str) -> u32 {
112        let file_id = self.files.len() as u32;
113        let content_bytes = content.as_bytes();
114        let length = content_bytes.len() as u64;
115
116        if let Some(ref mut w) = self.writer {
117            // Streaming mode: write content immediately to disk
118            let offset = self.current_offset;
119            w.write_all(content_bytes)
120                .expect("Failed to write file content to content.bin");
121            self.current_offset += length;
122
123            self.files.push(FileEntry {
124                path,
125                offset,
126                length,
127            });
128        } else {
129            // In-memory mode: accumulate in RAM (for backward compatibility)
130            let offset = self.content.len() as u64;
131            self.content.extend_from_slice(content_bytes);
132
133            self.files.push(FileEntry {
134                path,
135                offset,
136                length,
137            });
138        }
139
140        file_id
141    }
142
143    /// Write the content store to disk
144    ///
145    /// This is the main entry point for the old API. It initializes the writer (if needed),
146    /// and finalizes the file.
147    pub fn write(&mut self, path: impl AsRef<Path>) -> Result<()> {
148        let path = path.as_ref();
149
150        // Initialize writer if not already done
151        if self.writer.is_none() && self.file_path.is_none() {
152            // Old API: no files written yet, need to write them now in-memory
153            // This is a fallback for tests that don't call init()
154            return self.write_legacy(path);
155        }
156
157        // New streaming API: already been writing, just finalize
158        self.finalize_if_needed()?;
159
160        Ok(())
161    }
162
163    /// Legacy write path for in-memory mode (backward compatibility)
164    ///
165    /// This is only used when write() is called without init() first.
166    /// Content is accumulated in RAM and written all at once.
167    fn write_legacy(&self, path: impl AsRef<Path>) -> Result<()> {
168        let path = path.as_ref();
169        let file = OpenOptions::new()
170            .create(true)
171            .write(true)
172            .truncate(true)
173            .open(path)
174            .with_context(|| format!("Failed to create {}", path.display()))?;
175
176        // Use a large buffer (8MB) for better write performance
177        let mut writer = std::io::BufWriter::with_capacity(8 * 1024 * 1024, file);
178
179        // Calculate index offset (after header + content)
180        let index_offset = HEADER_SIZE as u64 + self.content.len() as u64;
181
182        // Write header
183        writer.write_all(MAGIC)?;
184        writer.write_all(&VERSION.to_le_bytes())?;
185        writer.write_all(&(self.files.len() as u64).to_le_bytes())?;
186        writer.write_all(&index_offset.to_le_bytes())?;
187        writer.write_all(&[0u8; 8])?; // reserved
188
189        // Write all accumulated file contents
190        writer.write_all(&self.content)?;
191
192        // Write file index
193        for entry in &self.files {
194            let path_str = entry.path.to_string_lossy();
195            let path_bytes = path_str.as_bytes();
196
197            writer.write_all(&(path_bytes.len() as u32).to_le_bytes())?;
198            writer.write_all(path_bytes)?;
199            writer.write_all(&entry.offset.to_le_bytes())?;
200            writer.write_all(&entry.length.to_le_bytes())?;
201        }
202
203        writer.flush()?;
204        Ok(())
205    }
206
207    /// Finalize the content.bin file by writing the file index and updating the header
208    fn finalize(&mut self) -> Result<()> {
209        let writer = self.writer.as_mut()
210            .ok_or_else(|| anyhow::anyhow!("ContentWriter not initialized"))?;
211
212        // Write file index at current position
213        let index_offset = HEADER_SIZE as u64 + self.current_offset;
214
215        for entry in &self.files {
216            let path_str = entry.path.to_string_lossy();
217            let path_bytes = path_str.as_bytes();
218
219            writer.write_all(&(path_bytes.len() as u32).to_le_bytes())?;
220            writer.write_all(path_bytes)?;
221            writer.write_all(&entry.offset.to_le_bytes())?;
222            writer.write_all(&entry.length.to_le_bytes())?;
223        }
224
225        // Flush all writes
226        writer.flush()?;
227
228        // Get mutable reference to underlying file
229        let file = writer.get_mut();
230
231        // Rewind to header and update with correct values
232        use std::io::Seek;
233        file.seek(std::io::SeekFrom::Start(0))?;
234
235        // Write correct header
236        file.write_all(MAGIC)?;
237        file.write_all(&VERSION.to_le_bytes())?;
238        file.write_all(&(self.files.len() as u64).to_le_bytes())?;
239        file.write_all(&index_offset.to_le_bytes())?;
240        file.write_all(&[0u8; 8])?; // reserved
241
242        // Final sync to disk
243        file.sync_all()?;
244
245        log::debug!(
246            "Finalized content.bin: {} files, {} bytes of content",
247            self.files.len(),
248            self.current_offset
249        );
250
251        Ok(())
252    }
253
    /// Get the number of files
    ///
    /// Counts every file added via add_file() so far, in either mode.
    pub fn file_count(&self) -> usize {
        self.files.len()
    }
258
259    /// Get total content size
260    pub fn content_size(&self) -> usize {
261        if self.writer.is_some() || self.file_path.is_some() {
262            // Streaming mode
263            self.current_offset as usize
264        } else {
265            // In-memory mode
266            self.content.len()
267        }
268    }
269
270    /// Finalize content store if it hasn't been finalized yet
271    ///
272    /// This is safe to call multiple times - subsequent calls are no-ops.
273    pub fn finalize_if_needed(&mut self) -> Result<()> {
274        if self.writer.is_some() {
275            self.finalize()?;
276            // Clear writer to mark as finalized
277            self.writer = None;
278        }
279        Ok(())
280    }
281}
282
/// `Default` yields the same empty, in-memory-mode writer as `ContentWriter::new()`.
impl Default for ContentWriter {
    fn default() -> Self {
        Self::new()
    }
}
288
/// Reader for memory-mapped content.bin
///
/// Provides zero-copy access to file contents.
pub struct ContentReader {
    // Handle the mapping was created from; stored but never read again (hence the underscore)
    _file: File,
    // Read-only mapping of the whole content.bin file
    mmap: Mmap,
    // File index parsed once in open(); the position in this Vec is the file_id
    files: Vec<FileEntry>,
}
297
298impl ContentReader {
299    /// Open and memory-map content.bin
300    pub fn open(path: impl AsRef<Path>) -> Result<Self> {
301        let path = path.as_ref();
302
303        let file = File::open(path)
304            .with_context(|| format!("Failed to open {}", path.display()))?;
305
306        let mmap = unsafe {
307            Mmap::map(&file)
308                .with_context(|| format!("Failed to mmap {}", path.display()))?
309        };
310
311        // Validate header
312        if mmap.len() < HEADER_SIZE {
313            anyhow::bail!("content.bin too small (expected at least {} bytes)", HEADER_SIZE);
314        }
315
316        if &mmap[0..4] != MAGIC {
317            anyhow::bail!("Invalid content.bin (wrong magic bytes)");
318        }
319
320        let version = u32::from_le_bytes([mmap[4], mmap[5], mmap[6], mmap[7]]);
321        if version != VERSION {
322            anyhow::bail!("Unsupported content.bin version: {}", version);
323        }
324
325        let num_files = u64::from_le_bytes([
326            mmap[8], mmap[9], mmap[10], mmap[11],
327            mmap[12], mmap[13], mmap[14], mmap[15],
328        ]);
329
330        let index_offset = u64::from_le_bytes([
331            mmap[16], mmap[17], mmap[18], mmap[19],
332            mmap[20], mmap[21], mmap[22], mmap[23],
333        ]) as usize;
334
335        // Read file index
336        let mut files = Vec::new();
337        let mut pos = index_offset;
338
339        for i in 0..num_files {
340            if pos + 4 > mmap.len() {
341                anyhow::bail!("Truncated file index at file {} (pos={}, mmap.len()={})", i, pos, mmap.len());
342            }
343
344            let path_len = u32::from_le_bytes([
345                mmap[pos],
346                mmap[pos + 1],
347                mmap[pos + 2],
348                mmap[pos + 3],
349            ]) as usize;
350            pos += 4;
351
352            if pos + path_len + 16 > mmap.len() {
353                anyhow::bail!("Truncated file entry at file {} (pos={}, path_len={}, need={}, mmap.len()={})",
354                    i, pos, path_len, pos + path_len + 16, mmap.len());
355            }
356
357            let path_bytes = &mmap[pos..pos + path_len];
358            let path_str = std::str::from_utf8(path_bytes)
359                .context("Invalid UTF-8 in file path")?;
360            let path = PathBuf::from(path_str);
361            pos += path_len;
362
363            let offset = u64::from_le_bytes([
364                mmap[pos],
365                mmap[pos + 1],
366                mmap[pos + 2],
367                mmap[pos + 3],
368                mmap[pos + 4],
369                mmap[pos + 5],
370                mmap[pos + 6],
371                mmap[pos + 7],
372            ]);
373            pos += 8;
374
375            let length = u64::from_le_bytes([
376                mmap[pos],
377                mmap[pos + 1],
378                mmap[pos + 2],
379                mmap[pos + 3],
380                mmap[pos + 4],
381                mmap[pos + 5],
382                mmap[pos + 6],
383                mmap[pos + 7],
384            ]);
385            pos += 8;
386
387            files.push(FileEntry {
388                path,
389                offset,
390                length,
391            });
392        }
393
394        Ok(Self {
395            _file: file,
396            mmap,
397            files,
398        })
399    }
400
401    /// Get file content by file_id
402    pub fn get_file_content(&self, file_id: u32) -> Result<&str> {
403        let entry = self.files
404            .get(file_id as usize)
405            .ok_or_else(|| anyhow::anyhow!("Invalid file_id: {}", file_id))?;
406
407        let start = HEADER_SIZE + entry.offset as usize;
408        let end = start + entry.length as usize;
409
410        if end > self.mmap.len() {
411            anyhow::bail!("File content out of bounds");
412        }
413
414        let bytes = &self.mmap[start..end];
415        std::str::from_utf8(bytes).context("Invalid UTF-8 in file content")
416    }
417
    /// Get file path by file_id
    ///
    /// Returns `None` when `file_id` is not a valid index into the file index.
    pub fn get_file_path(&self, file_id: u32) -> Option<&Path> {
        self.files.get(file_id as usize).map(|e| e.path.as_path())
    }
422
    /// Get number of files
    ///
    /// This is the number of index entries parsed by open().
    pub fn file_count(&self) -> usize {
        self.files.len()
    }
427
428    /// Get file_id (array index) by path
429    ///
430    /// This looks up a file by its path and returns the array index, which is the
431    /// correct file_id to use with get_file_content() and other methods.
432    ///
433    /// Note: This is different from database file_ids, which are AUTO INCREMENT values.
434    pub fn get_file_id_by_path(&self, path: &str) -> Option<u32> {
435        // Normalize the input path (strip ./ prefix if present)
436        let normalized_input = path.strip_prefix("./").unwrap_or(path);
437
438        self.files.iter().position(|entry| {
439            // Normalize the stored path (strip ./ prefix if present)
440            let stored_path = entry.path.to_string_lossy();
441            let normalized_stored = stored_path.strip_prefix("./").unwrap_or(&stored_path);
442            normalized_stored == normalized_input
443        }).map(|idx| idx as u32)
444    }
445
446    /// Get content at a specific byte offset
447    pub fn get_content_at_offset(&self, file_id: u32, byte_offset: u32, length: usize) -> Result<&str> {
448        let entry = self.files
449            .get(file_id as usize)
450            .ok_or_else(|| anyhow::anyhow!("Invalid file_id: {}", file_id))?;
451
452        let start = HEADER_SIZE + entry.offset as usize + byte_offset as usize;
453        let end = start + length;
454
455        if end > self.mmap.len() {
456            anyhow::bail!("Content out of bounds");
457        }
458
459        let bytes = &self.mmap[start..end];
460        std::str::from_utf8(bytes).context("Invalid UTF-8 in content")
461    }
462
463    /// Get context around a byte offset (for showing match results)
464    ///
465    /// Returns (lines_before, matching_line, lines_after)
466    pub fn get_context(&self, file_id: u32, byte_offset: u32, context_lines: usize) -> Result<(Vec<String>, String, Vec<String>)> {
467        let content = self.get_file_content(file_id)?;
468        let lines: Vec<&str> = content.lines().collect();
469
470        // Find which line contains this byte offset
471        let mut current_offset = 0;
472        let mut line_idx = 0;
473
474        for (idx, line) in lines.iter().enumerate() {
475            let line_end = current_offset + line.len() + 1; // +1 for newline
476            if byte_offset as usize >= current_offset && (byte_offset as usize) < line_end {
477                line_idx = idx;
478                break;
479            }
480            current_offset = line_end;
481        }
482
483        // Extract context
484        let start = line_idx.saturating_sub(context_lines);
485        let end = (line_idx + context_lines + 1).min(lines.len());
486
487        let before: Vec<String> = lines[start..line_idx]
488            .iter()
489            .map(|s| s.to_string())
490            .collect();
491
492        let matching = lines.get(line_idx)
493            .map(|s| s.to_string())
494            .unwrap_or_default();
495
496        let after: Vec<String> = lines[line_idx + 1..end]
497            .iter()
498            .map(|s| s.to_string())
499            .collect();
500
501        Ok((before, matching, after))
502    }
503
504    /// Get context around a specific line number (1-indexed)
505    ///
506    /// Returns (lines_before, lines_after)
507    pub fn get_context_by_line(&self, file_id: u32, line_number: usize, context_lines: usize) -> Result<(Vec<String>, Vec<String>)> {
508        let content = self.get_file_content(file_id)?;
509        let lines: Vec<&str> = content.lines().collect();
510
511        // Convert from 1-indexed to 0-indexed
512        let line_idx = line_number.saturating_sub(1);
513
514        // Extract context
515        let start = line_idx.saturating_sub(context_lines);
516        let end = (line_idx + context_lines + 1).min(lines.len());
517
518        let before: Vec<String> = lines[start..line_idx]
519            .iter()
520            .map(|s| s.to_string())
521            .collect();
522
523        let after: Vec<String> = lines[line_idx + 1..end]
524            .iter()
525            .map(|s| s.to_string())
526            .collect();
527
528        Ok((before, after))
529    }
530}
531
// Unit tests. All of them use the in-memory writer flow (no init() call) and,
// where a file is needed, a temporary directory.
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// File ids are handed out sequentially starting at 0, and file_count() tracks them.
    #[test]
    fn test_content_writer_basic() {
        let mut writer = ContentWriter::new();

        let file1_id = writer.add_file(PathBuf::from("test1.txt"), "Hello, world!");
        let file2_id = writer.add_file(PathBuf::from("test2.txt"), "Goodbye, world!");

        assert_eq!(file1_id, 0);
        assert_eq!(file2_id, 1);
        assert_eq!(writer.file_count(), 2);
    }

    /// Contents and paths written by ContentWriter come back unchanged through ContentReader.
    #[test]
    fn test_content_roundtrip() {
        let temp = TempDir::new().unwrap();
        let content_path = temp.path().join("content.bin");

        // Write
        let mut writer = ContentWriter::new();
        writer.add_file(PathBuf::from("file1.txt"), "First file content");
        writer.add_file(PathBuf::from("file2.txt"), "Second file content");
        writer.write(&content_path).unwrap();

        // Read
        let reader = ContentReader::open(&content_path).unwrap();

        assert_eq!(reader.file_count(), 2);
        assert_eq!(reader.get_file_content(0).unwrap(), "First file content");
        assert_eq!(reader.get_file_content(1).unwrap(), "Second file content");
        assert_eq!(reader.get_file_path(0).unwrap(), Path::new("file1.txt"));
        assert_eq!(reader.get_file_path(1).unwrap(), Path::new("file2.txt"));
    }

    /// get_context() finds the line containing a byte offset and returns one line of context on each side.
    #[test]
    fn test_get_context() {
        let temp = TempDir::new().unwrap();
        let content_path = temp.path().join("content.bin");

        let mut writer = ContentWriter::new();
        writer.add_file(
            PathBuf::from("test.txt"),
            "Line 1\nLine 2\nLine 3 with match\nLine 4\nLine 5",
        );
        writer.write(&content_path).unwrap();

        let reader = ContentReader::open(&content_path).unwrap();

        // Byte offset of "Line 3" (14 = "Line 1\n" + "Line 2\n")
        let (before, matching, after) = reader.get_context(0, 14, 1).unwrap();

        assert_eq!(before.len(), 1);
        assert_eq!(before[0], "Line 2");
        assert_eq!(matching, "Line 3 with match");
        assert_eq!(after.len(), 1);
        assert_eq!(after[0], "Line 4");
    }

    /// Multi-line content, including the trailing newline, survives a roundtrip byte-for-byte.
    #[test]
    fn test_multiline_file() {
        let temp = TempDir::new().unwrap();
        let content_path = temp.path().join("content.bin");

        let content = "fn main() {\n    println!(\"Hello\");\n}\n";

        let mut writer = ContentWriter::new();
        writer.add_file(PathBuf::from("main.rs"), content);
        writer.write(&content_path).unwrap();

        let reader = ContentReader::open(&content_path).unwrap();
        assert_eq!(reader.get_file_content(0).unwrap(), content);
    }
}
608}