Skip to main content

reflex/
content_store.rs

1//! Content store for memory-mapped file access
2//!
3//! This module stores the full contents of all indexed files in a single
4//! memory-mapped file. This enables zero-copy access to file contents for:
5//! - Verifying trigram matches
6//! - Extracting context around matches
7//! - Fast content retrieval without disk I/O
8//!
9//! # Binary Format (content.bin)
10//!
11//! ```text
12//! Header (32 bytes):
13//!   magic: "RFCT" (4 bytes)
14//!   version: 1 (u32)
15//!   num_files: N (u64)
16//!   index_offset: offset to file index (u64)
17//!   reserved: 8 bytes
18//!
19//! File Contents (variable):
20//!   [Concatenated file contents]
21//!
22//! File Index (at index_offset):
23//!   For each file:
24//!     path_len: u32
25//!     path: UTF-8 string
//!     offset: u64 (byte offset of the file's content, relative to the start of the File Contents section, i.e. the end of the 32-byte header)
27//!     length: u64 (file size in bytes)
28//! ```
29
30use anyhow::{Context, Result};
31use memmap2::Mmap;
32use std::fs::{File, OpenOptions};
33use std::io::Write;
34use std::path::{Path, PathBuf};
35
/// File signature stored in the first four bytes of content.bin.
const MAGIC: &[u8; 4] = b"RFCT";
/// Format version written into the header and required when reading it back.
const VERSION: u32 = 1;
/// Header size in bytes: magic (4) + version (4) + num_files (8) + index_offset (8) + reserved (8).
const HEADER_SIZE: usize = 4 + 4 + 8 + 8 + 8;
39
/// Metadata for a file in the content store
///
/// `offset` and `length` locate the file's bytes inside the content section of
/// content.bin. The offset is relative to the start of that section (the first
/// byte after the fixed-size header), not to the start of content.bin itself:
/// the writers record it starting from 0 and the reader adds the header size
/// before slicing the mapping.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FileEntry {
    /// File path as recorded when the file was added to the store
    pub path: PathBuf,
    /// Byte offset of this file's content, relative to the end of the header
    pub offset: u64,
    /// Length of this file's content in bytes
    pub length: u64,
}
50
51/// Writer for building content.bin
52///
53/// Supports two modes:
54/// 1. **Streaming mode** (init() called): Writes file contents to disk incrementally to avoid RAM buildup
55/// 2. **In-memory mode** (default): Accumulates content in RAM for backward compatibility with tests
56pub struct ContentWriter {
57    files: Vec<FileEntry>,
58    writer: Option<std::io::BufWriter<File>>,
59    current_offset: u64,
60    file_path: Option<PathBuf>,
61    // In-memory content buffer (only used if streaming mode not enabled)
62    content: Vec<u8>,
63}
64
65impl ContentWriter {
66    /// Create a new content writer (in-memory mode by default)
67    ///
68    /// Call init() to enable streaming mode before adding files.
69    pub fn new() -> Self {
70        Self {
71            files: Vec::new(),
72            writer: None,
73            current_offset: 0,
74            file_path: None,
75            content: Vec::new(),
76        }
77    }
78
79    /// Initialize the writer by creating the output file and writing header placeholder
80    pub fn init(&mut self, path: PathBuf) -> Result<()> {
81        let file = OpenOptions::new()
82            .create(true)
83            .write(true)
84            .truncate(true)
85            .open(&path)
86            .with_context(|| format!("Failed to create {}", path.display()))?;
87
88        // Use a large buffer (16MB) for better write performance
89        let mut writer = std::io::BufWriter::with_capacity(16 * 1024 * 1024, file);
90
91        // Write placeholder header (will be overwritten in finalize())
92        writer.write_all(MAGIC)?;
93        writer.write_all(&VERSION.to_le_bytes())?;
94        writer.write_all(&0u64.to_le_bytes())?; // num_files (placeholder)
95        writer.write_all(&0u64.to_le_bytes())?; // index_offset (placeholder)
96        writer.write_all(&[0u8; 8])?; // reserved
97
98        self.writer = Some(writer);
99        self.current_offset = 0; // Content starts after header
100        self.file_path = Some(path);
101
102        Ok(())
103    }
104
105    /// Add a file to the content store
106    ///
107    /// **Streaming mode** (if init() was called): Writes content to disk immediately.
108    /// **In-memory mode** (default): Accumulates content in RAM.
109    ///
110    /// Returns the file_id (index into files array)
111    pub fn add_file(&mut self, path: PathBuf, content: &str) -> u32 {
112        let file_id = self.files.len() as u32;
113        let content_bytes = content.as_bytes();
114        let length = content_bytes.len() as u64;
115
116        if let Some(ref mut w) = self.writer {
117            // Streaming mode: write content immediately to disk
118            let offset = self.current_offset;
119            w.write_all(content_bytes)
120                .expect("Failed to write file content to content.bin");
121            self.current_offset += length;
122
123            self.files.push(FileEntry {
124                path,
125                offset,
126                length,
127            });
128        } else {
129            // In-memory mode: accumulate in RAM (for backward compatibility)
130            let offset = self.content.len() as u64;
131            self.content.extend_from_slice(content_bytes);
132
133            self.files.push(FileEntry {
134                path,
135                offset,
136                length,
137            });
138        }
139
140        file_id
141    }
142
143    /// Write the content store to disk
144    ///
145    /// This is the main entry point for the old API. It initializes the writer (if needed),
146    /// and finalizes the file.
147    pub fn write(&mut self, path: impl AsRef<Path>) -> Result<()> {
148        let path = path.as_ref();
149
150        // Initialize writer if not already done
151        if self.writer.is_none() && self.file_path.is_none() {
152            // Old API: no files written yet, need to write them now in-memory
153            // This is a fallback for tests that don't call init()
154            return self.write_legacy(path);
155        }
156
157        // New streaming API: already been writing, just finalize
158        self.finalize_if_needed()?;
159
160        Ok(())
161    }
162
163    /// Legacy write path for in-memory mode (backward compatibility)
164    ///
165    /// This is only used when write() is called without init() first.
166    /// Content is accumulated in RAM and written all at once.
167    fn write_legacy(&self, path: impl AsRef<Path>) -> Result<()> {
168        let path = path.as_ref();
169        let file = OpenOptions::new()
170            .create(true)
171            .write(true)
172            .truncate(true)
173            .open(path)
174            .with_context(|| format!("Failed to create {}", path.display()))?;
175
176        // Use a large buffer (8MB) for better write performance
177        let mut writer = std::io::BufWriter::with_capacity(8 * 1024 * 1024, file);
178
179        // Calculate index offset (after header + content)
180        let index_offset = HEADER_SIZE as u64 + self.content.len() as u64;
181
182        // Write header
183        writer.write_all(MAGIC)?;
184        writer.write_all(&VERSION.to_le_bytes())?;
185        writer.write_all(&(self.files.len() as u64).to_le_bytes())?;
186        writer.write_all(&index_offset.to_le_bytes())?;
187        writer.write_all(&[0u8; 8])?; // reserved
188
189        // Write all accumulated file contents
190        writer.write_all(&self.content)?;
191
192        // Write file index
193        for entry in &self.files {
194            let path_str = entry.path.to_string_lossy();
195            let path_bytes = path_str.as_bytes();
196
197            writer.write_all(&(path_bytes.len() as u32).to_le_bytes())?;
198            writer.write_all(path_bytes)?;
199            writer.write_all(&entry.offset.to_le_bytes())?;
200            writer.write_all(&entry.length.to_le_bytes())?;
201        }
202
203        writer.flush()?;
204        Ok(())
205    }
206
207    /// Finalize the content.bin file by writing the file index and updating the header
208    fn finalize(&mut self) -> Result<()> {
209        let mut writer = self.writer.take()
210            .ok_or_else(|| anyhow::anyhow!("ContentWriter not initialized"))?;
211
212        // Write file index at current position
213        let index_offset = HEADER_SIZE as u64 + self.current_offset;
214
215        for entry in &self.files {
216            let path_str = entry.path.to_string_lossy();
217            let path_bytes = path_str.as_bytes();
218
219            writer.write_all(&(path_bytes.len() as u32).to_le_bytes())?;
220            writer.write_all(path_bytes)?;
221            writer.write_all(&entry.offset.to_le_bytes())?;
222            writer.write_all(&entry.length.to_le_bytes())?;
223        }
224
225        // Consume BufWriter and get the underlying File
226        let mut file = writer.into_inner()
227            .map_err(|e| anyhow::anyhow!("Failed to flush BufWriter: {}", e.error()))?;
228
229        // Rewind to header and update with correct values
230        use std::io::Seek;
231        file.seek(std::io::SeekFrom::Start(0))?;
232
233        // Write correct header
234        file.write_all(MAGIC)?;
235        file.write_all(&VERSION.to_le_bytes())?;
236        file.write_all(&(self.files.len() as u64).to_le_bytes())?;
237        file.write_all(&index_offset.to_le_bytes())?;
238        file.write_all(&[0u8; 8])?; // reserved
239
240        // Final sync to disk
241        file.sync_all()?;
242
243        log::debug!(
244            "Finalized content.bin: {} files, {} bytes of content",
245            self.files.len(),
246            self.current_offset
247        );
248
249        Ok(())
250    }
251
252    /// Get the number of files
253    pub fn file_count(&self) -> usize {
254        self.files.len()
255    }
256
257    /// Get total content size
258    pub fn content_size(&self) -> usize {
259        if self.writer.is_some() || self.file_path.is_some() {
260            // Streaming mode
261            self.current_offset as usize
262        } else {
263            // In-memory mode
264            self.content.len()
265        }
266    }
267
268    /// Finalize content store if it hasn't been finalized yet
269    ///
270    /// This is safe to call multiple times - subsequent calls are no-ops.
271    pub fn finalize_if_needed(&mut self) -> Result<()> {
272        if self.writer.is_some() {
273            self.finalize()?;
274            // Clear writer to mark as finalized
275            self.writer = None;
276        }
277        Ok(())
278    }
279}
280
281impl Default for ContentWriter {
282    fn default() -> Self {
283        Self::new()
284    }
285}
286
287/// Reader for memory-mapped content.bin
288///
289/// Provides zero-copy access to file contents.
290pub struct ContentReader {
291    _file: File,
292    mmap: Mmap,
293    files: Vec<FileEntry>,
294}
295
296impl ContentReader {
297    /// Open and memory-map content.bin
298    pub fn open(path: impl AsRef<Path>) -> Result<Self> {
299        let path = path.as_ref();
300
301        let file = File::open(path)
302            .with_context(|| format!("Failed to open {}", path.display()))?;
303
304        let mmap = unsafe {
305            Mmap::map(&file)
306                .with_context(|| format!("Failed to mmap {}", path.display()))?
307        };
308
309        // Validate header
310        if mmap.len() < HEADER_SIZE {
311            anyhow::bail!("content.bin too small (expected at least {} bytes)", HEADER_SIZE);
312        }
313
314        if &mmap[0..4] != MAGIC {
315            anyhow::bail!("Invalid content.bin (wrong magic bytes)");
316        }
317
318        let version = u32::from_le_bytes([mmap[4], mmap[5], mmap[6], mmap[7]]);
319        if version != VERSION {
320            anyhow::bail!("Unsupported content.bin version: {}", version);
321        }
322
323        let num_files = u64::from_le_bytes([
324            mmap[8], mmap[9], mmap[10], mmap[11],
325            mmap[12], mmap[13], mmap[14], mmap[15],
326        ]);
327
328        let index_offset = u64::from_le_bytes([
329            mmap[16], mmap[17], mmap[18], mmap[19],
330            mmap[20], mmap[21], mmap[22], mmap[23],
331        ]) as usize;
332
333        // Read file index
334        let mut files = Vec::new();
335        let mut pos = index_offset;
336
337        for i in 0..num_files {
338            if pos + 4 > mmap.len() {
339                anyhow::bail!("Truncated file index at file {} (pos={}, mmap.len()={})", i, pos, mmap.len());
340            }
341
342            let path_len = u32::from_le_bytes([
343                mmap[pos],
344                mmap[pos + 1],
345                mmap[pos + 2],
346                mmap[pos + 3],
347            ]) as usize;
348            pos += 4;
349
350            if pos + path_len + 16 > mmap.len() {
351                anyhow::bail!("Truncated file entry at file {} (pos={}, path_len={}, need={}, mmap.len()={})",
352                    i, pos, path_len, pos + path_len + 16, mmap.len());
353            }
354
355            let path_bytes = &mmap[pos..pos + path_len];
356            let path_str = std::str::from_utf8(path_bytes)
357                .context("Invalid UTF-8 in file path")?;
358            let path = PathBuf::from(path_str);
359            pos += path_len;
360
361            let offset = u64::from_le_bytes([
362                mmap[pos],
363                mmap[pos + 1],
364                mmap[pos + 2],
365                mmap[pos + 3],
366                mmap[pos + 4],
367                mmap[pos + 5],
368                mmap[pos + 6],
369                mmap[pos + 7],
370            ]);
371            pos += 8;
372
373            let length = u64::from_le_bytes([
374                mmap[pos],
375                mmap[pos + 1],
376                mmap[pos + 2],
377                mmap[pos + 3],
378                mmap[pos + 4],
379                mmap[pos + 5],
380                mmap[pos + 6],
381                mmap[pos + 7],
382            ]);
383            pos += 8;
384
385            files.push(FileEntry {
386                path,
387                offset,
388                length,
389            });
390        }
391
392        Ok(Self {
393            _file: file,
394            mmap,
395            files,
396        })
397    }
398
399    /// Get file content by file_id
400    pub fn get_file_content(&self, file_id: u32) -> Result<&str> {
401        let entry = self.files
402            .get(file_id as usize)
403            .ok_or_else(|| anyhow::anyhow!("Invalid file_id: {}", file_id))?;
404
405        let start = HEADER_SIZE + entry.offset as usize;
406        let end = start + entry.length as usize;
407
408        if end > self.mmap.len() {
409            anyhow::bail!("File content out of bounds");
410        }
411
412        let bytes = &self.mmap[start..end];
413        std::str::from_utf8(bytes).context("Invalid UTF-8 in file content")
414    }
415
416    /// Get file path by file_id
417    pub fn get_file_path(&self, file_id: u32) -> Option<&Path> {
418        self.files.get(file_id as usize).map(|e| e.path.as_path())
419    }
420
421    /// Get number of files
422    pub fn file_count(&self) -> usize {
423        self.files.len()
424    }
425
426    /// Get file_id (array index) by path
427    ///
428    /// This looks up a file by its path and returns the array index, which is the
429    /// correct file_id to use with get_file_content() and other methods.
430    ///
431    /// Note: This is different from database file_ids, which are AUTO INCREMENT values.
432    pub fn get_file_id_by_path(&self, path: &str) -> Option<u32> {
433        // Normalize the input path (strip ./ prefix if present)
434        let normalized_input = path.strip_prefix("./").unwrap_or(path);
435
436        self.files.iter().position(|entry| {
437            // Normalize the stored path (strip ./ prefix if present)
438            let stored_path = entry.path.to_string_lossy();
439            let normalized_stored = stored_path.strip_prefix("./").unwrap_or(&stored_path);
440            normalized_stored == normalized_input
441        }).map(|idx| idx as u32)
442    }
443
444    /// Get content at a specific byte offset
445    pub fn get_content_at_offset(&self, file_id: u32, byte_offset: u32, length: usize) -> Result<&str> {
446        let entry = self.files
447            .get(file_id as usize)
448            .ok_or_else(|| anyhow::anyhow!("Invalid file_id: {}", file_id))?;
449
450        let start = HEADER_SIZE + entry.offset as usize + byte_offset as usize;
451        let end = start + length;
452
453        if end > self.mmap.len() {
454            anyhow::bail!("Content out of bounds");
455        }
456
457        let bytes = &self.mmap[start..end];
458        std::str::from_utf8(bytes).context("Invalid UTF-8 in content")
459    }
460
461    /// Get context around a byte offset (for showing match results)
462    ///
463    /// Returns (lines_before, matching_line, lines_after)
464    pub fn get_context(&self, file_id: u32, byte_offset: u32, context_lines: usize) -> Result<(Vec<String>, String, Vec<String>)> {
465        let content = self.get_file_content(file_id)?;
466        let lines: Vec<&str> = content.lines().collect();
467
468        // Find which line contains this byte offset
469        let mut current_offset = 0;
470        let mut line_idx = 0;
471
472        for (idx, line) in lines.iter().enumerate() {
473            let line_end = current_offset + line.len() + 1; // +1 for newline
474            if byte_offset as usize >= current_offset && (byte_offset as usize) < line_end {
475                line_idx = idx;
476                break;
477            }
478            current_offset = line_end;
479        }
480
481        // Extract context
482        let start = line_idx.saturating_sub(context_lines);
483        let end = (line_idx + context_lines + 1).min(lines.len());
484
485        let before: Vec<String> = lines[start..line_idx]
486            .iter()
487            .map(|s| s.to_string())
488            .collect();
489
490        let matching = lines.get(line_idx)
491            .map(|s| s.to_string())
492            .unwrap_or_default();
493
494        let after: Vec<String> = lines[line_idx + 1..end]
495            .iter()
496            .map(|s| s.to_string())
497            .collect();
498
499        Ok((before, matching, after))
500    }
501
502    /// Get context around a specific line number (1-indexed)
503    ///
504    /// Returns (lines_before, lines_after)
505    pub fn get_context_by_line(&self, file_id: u32, line_number: usize, context_lines: usize) -> Result<(Vec<String>, Vec<String>)> {
506        let content = self.get_file_content(file_id)?;
507        let lines: Vec<&str> = content.lines().collect();
508
509        // Convert from 1-indexed to 0-indexed
510        let line_idx = line_number.saturating_sub(1);
511
512        // Extract context
513        let start = line_idx.saturating_sub(context_lines);
514        let end = (line_idx + context_lines + 1).min(lines.len());
515
516        let before: Vec<String> = lines[start..line_idx]
517            .iter()
518            .map(|s| s.to_string())
519            .collect();
520
521        let after: Vec<String> = lines[line_idx + 1..end]
522            .iter()
523            .map(|s| s.to_string())
524            .collect();
525
526        Ok((before, after))
527    }
528}
529
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Create a scratch directory plus the path of a content.bin inside it.
    /// The directory guard is returned so it outlives the test body.
    fn scratch_store() -> (TempDir, PathBuf) {
        let dir = TempDir::new().unwrap();
        let store = dir.path().join("content.bin");
        (dir, store)
    }

    /// File ids are handed out sequentially, starting at 0.
    #[test]
    fn test_content_writer_basic() {
        let mut store = ContentWriter::new();

        let first = store.add_file(PathBuf::from("test1.txt"), "Hello, world!");
        let second = store.add_file(PathBuf::from("test2.txt"), "Goodbye, world!");

        assert_eq!(first, 0);
        assert_eq!(second, 1);
        assert_eq!(store.file_count(), 2);
    }

    /// In-memory write followed by a read returns the same paths and contents.
    #[test]
    fn test_content_roundtrip() {
        let (_dir, content_path) = scratch_store();

        // Write
        let mut store = ContentWriter::new();
        store.add_file(PathBuf::from("file1.txt"), "First file content");
        store.add_file(PathBuf::from("file2.txt"), "Second file content");
        store.write(&content_path).unwrap();

        // Read
        let loaded = ContentReader::open(&content_path).unwrap();

        assert_eq!(loaded.file_count(), 2);
        assert_eq!(loaded.get_file_content(0).unwrap(), "First file content");
        assert_eq!(loaded.get_file_content(1).unwrap(), "Second file content");
        assert_eq!(loaded.get_file_path(0).unwrap(), Path::new("file1.txt"));
        assert_eq!(loaded.get_file_path(1).unwrap(), Path::new("file2.txt"));
    }

    /// A byte offset inside line 3 yields that line plus one neighbour on each side.
    #[test]
    fn test_get_context() {
        let (_dir, content_path) = scratch_store();

        let mut store = ContentWriter::new();
        store.add_file(
            PathBuf::from("test.txt"),
            "Line 1\nLine 2\nLine 3 with match\nLine 4\nLine 5",
        );
        store.write(&content_path).unwrap();

        let loaded = ContentReader::open(&content_path).unwrap();

        // Byte offset of "Line 3" (14 = "Line 1\n" + "Line 2\n")
        let (before, matching, after) = loaded.get_context(0, 14, 1).unwrap();

        assert_eq!(before.len(), 1);
        assert_eq!(before[0], "Line 2");
        assert_eq!(matching, "Line 3 with match");
        assert_eq!(after.len(), 1);
        assert_eq!(after[0], "Line 4");
    }

    /// Streaming path: init() -> add_file() -> finalize_if_needed() round-trips.
    #[test]
    fn test_streaming_roundtrip() {
        let (_dir, content_path) = scratch_store();

        let main_src = "fn main() {}\n";
        let lib_src = "pub fn hello() -> &'static str { \"hi\" }\n";

        let mut store = ContentWriter::new();
        store.init(content_path.clone()).unwrap();
        store.add_file(PathBuf::from("src/main.rs"), main_src);
        store.add_file(PathBuf::from("src/lib.rs"), lib_src);
        store.finalize_if_needed().unwrap();

        // Verify the file can be read back correctly
        let loaded = ContentReader::open(&content_path).unwrap();
        assert_eq!(loaded.file_count(), 2);
        assert_eq!(loaded.get_file_content(0).unwrap(), main_src);
        assert_eq!(loaded.get_file_content(1).unwrap(), lib_src);
        assert_eq!(loaded.get_file_path(0).unwrap(), Path::new("src/main.rs"));
        assert_eq!(loaded.get_file_path(1).unwrap(), Path::new("src/lib.rs"));
    }

    /// Multi-line content, including its trailing newline, is returned unchanged.
    #[test]
    fn test_multiline_file() {
        let (_dir, content_path) = scratch_store();

        let source = "fn main() {\n    println!(\"Hello\");\n}\n";

        let mut store = ContentWriter::new();
        store.add_file(PathBuf::from("main.rs"), source);
        store.write(&content_path).unwrap();

        let loaded = ContentReader::open(&content_path).unwrap();
        assert_eq!(loaded.get_file_content(0).unwrap(), source);
    }
}