Skip to main content

reflex/
content_store.rs

1//! Content store for memory-mapped file access
2//!
3//! This module stores the full contents of all indexed files in a single
4//! memory-mapped file. This enables zero-copy access to file contents for:
5//! - Verifying trigram matches
6//! - Extracting context around matches
7//! - Fast content retrieval without disk I/O
8//!
9//! # Binary Format (content.bin)
10//!
11//! ```text
12//! Header (32 bytes):
13//!   magic: "RFCT" (4 bytes)
14//!   version: 1 (u32)
15//!   num_files: N (u64)
16//!   index_offset: offset to file index (u64)
17//!   reserved: 8 bytes
18//!
19//! File Contents (variable):
20//!   [Concatenated file contents]
21//!
22//! File Index (at index_offset):
23//!   For each file:
24//!     path_len: u32
25//!     path: UTF-8 string
//!     offset: u64 (byte offset of the file's content, relative to the end of the 32-byte header)
27//!     length: u64 (file size in bytes)
28//! ```
29
30use anyhow::{Context, Result};
31use memmap2::Mmap;
32use std::fs::{File, OpenOptions};
33use std::io::Write;
34use std::path::{Path, PathBuf};
35
/// Magic bytes written at the very start of content.bin and checked by the reader.
const MAGIC: &[u8; 4] = b"RFCT";
/// Format version stored in the header; the reader rejects any other value.
const VERSION: u32 = 1;
/// Size in bytes of the fixed header that precedes the concatenated file contents.
const HEADER_SIZE: usize = 32; // 4 (magic) + 4 (version) + 8 (num_files) + 8 (index_offset) + 8 (reserved)
39
/// Metadata for a single file in the content store.
///
/// One entry is recorded per added file; its position in the index is the
/// file's `file_id`.
#[derive(Debug, Clone)]
pub struct FileEntry {
    /// File path (serialized to the index as a lossy UTF-8 string)
    pub path: PathBuf,
    /// Byte offset of this file's content, relative to the start of the content
    /// region (the first byte after the header) — not an absolute position in
    /// content.bin. The reader adds `HEADER_SIZE` when slicing the mapping.
    pub offset: u64,
    /// Length of this file's content in bytes
    pub length: u64,
}
50
/// Writer for building content.bin
///
/// Supports two modes:
/// 1. **Streaming mode** (init() called): Writes file contents to disk incrementally to avoid RAM buildup
/// 2. **In-memory mode** (default): Accumulates content in RAM for backward compatibility with tests
pub struct ContentWriter {
    // Index entries in insertion order; an entry's position is its file_id.
    files: Vec<FileEntry>,
    // Buffered output file. `Some` between init() and finalize(), `None` otherwise.
    writer: Option<std::io::BufWriter<File>>,
    // Number of content bytes streamed so far (the header is not counted).
    current_offset: u64,
    // Path passed to init(); `Some` marks that streaming mode was started.
    file_path: Option<PathBuf>,
    // In-memory content buffer (only used if streaming mode not enabled)
    content: Vec<u8>,
}
64
65impl ContentWriter {
66    /// Create a new content writer (in-memory mode by default)
67    ///
68    /// Call init() to enable streaming mode before adding files.
69    pub fn new() -> Self {
70        Self {
71            files: Vec::new(),
72            writer: None,
73            current_offset: 0,
74            file_path: None,
75            content: Vec::new(),
76        }
77    }
78
79    /// Initialize the writer by creating the output file and writing header placeholder
80    pub fn init(&mut self, path: PathBuf) -> Result<()> {
81        let file = OpenOptions::new()
82            .create(true)
83            .write(true)
84            .truncate(true)
85            .open(&path)
86            .with_context(|| format!("Failed to create {}", path.display()))?;
87
88        // Use a large buffer (16MB) for better write performance
89        let mut writer = std::io::BufWriter::with_capacity(16 * 1024 * 1024, file);
90
91        // Write placeholder header (will be overwritten in finalize())
92        writer.write_all(MAGIC)?;
93        writer.write_all(&VERSION.to_le_bytes())?;
94        writer.write_all(&0u64.to_le_bytes())?; // num_files (placeholder)
95        writer.write_all(&0u64.to_le_bytes())?; // index_offset (placeholder)
96        writer.write_all(&[0u8; 8])?; // reserved
97
98        self.writer = Some(writer);
99        self.current_offset = 0; // Content starts after header
100        self.file_path = Some(path);
101
102        Ok(())
103    }
104
105    /// Add a file to the content store
106    ///
107    /// **Streaming mode** (if init() was called): Writes content to disk immediately.
108    /// **In-memory mode** (default): Accumulates content in RAM.
109    ///
110    /// Returns the file_id (index into files array)
111    pub fn add_file(&mut self, path: PathBuf, content: &str) -> u32 {
112        let file_id = self.files.len() as u32;
113        let content_bytes = content.as_bytes();
114        let length = content_bytes.len() as u64;
115
116        if let Some(ref mut w) = self.writer {
117            // Streaming mode: write content immediately to disk
118            let offset = self.current_offset;
119            w.write_all(content_bytes)
120                .expect("Failed to write file content to content.bin");
121            self.current_offset += length;
122
123            self.files.push(FileEntry {
124                path,
125                offset,
126                length,
127            });
128        } else {
129            // In-memory mode: accumulate in RAM (for backward compatibility)
130            let offset = self.content.len() as u64;
131            self.content.extend_from_slice(content_bytes);
132
133            self.files.push(FileEntry {
134                path,
135                offset,
136                length,
137            });
138        }
139
140        file_id
141    }
142
143    /// Write the content store to disk
144    ///
145    /// This is the main entry point for the old API. It initializes the writer (if needed),
146    /// and finalizes the file.
147    pub fn write(&mut self, path: impl AsRef<Path>) -> Result<()> {
148        let path = path.as_ref();
149
150        // Initialize writer if not already done
151        if self.writer.is_none() && self.file_path.is_none() {
152            // Old API: no files written yet, need to write them now in-memory
153            // This is a fallback for tests that don't call init()
154            return self.write_legacy(path);
155        }
156
157        // New streaming API: already been writing, just finalize
158        self.finalize_if_needed()?;
159
160        Ok(())
161    }
162
163    /// Legacy write path for in-memory mode (backward compatibility)
164    ///
165    /// This is only used when write() is called without init() first.
166    /// Content is accumulated in RAM and written all at once.
167    fn write_legacy(&self, path: impl AsRef<Path>) -> Result<()> {
168        let path = path.as_ref();
169        let file = OpenOptions::new()
170            .create(true)
171            .write(true)
172            .truncate(true)
173            .open(path)
174            .with_context(|| format!("Failed to create {}", path.display()))?;
175
176        // Use a large buffer (8MB) for better write performance
177        let mut writer = std::io::BufWriter::with_capacity(8 * 1024 * 1024, file);
178
179        // Calculate index offset (after header + content)
180        let index_offset = HEADER_SIZE as u64 + self.content.len() as u64;
181
182        // Write header
183        writer.write_all(MAGIC)?;
184        writer.write_all(&VERSION.to_le_bytes())?;
185        writer.write_all(&(self.files.len() as u64).to_le_bytes())?;
186        writer.write_all(&index_offset.to_le_bytes())?;
187        writer.write_all(&[0u8; 8])?; // reserved
188
189        // Write all accumulated file contents
190        writer.write_all(&self.content)?;
191
192        // Write file index
193        for entry in &self.files {
194            let path_str = entry.path.to_string_lossy();
195            let path_bytes = path_str.as_bytes();
196
197            writer.write_all(&(path_bytes.len() as u32).to_le_bytes())?;
198            writer.write_all(path_bytes)?;
199            writer.write_all(&entry.offset.to_le_bytes())?;
200            writer.write_all(&entry.length.to_le_bytes())?;
201        }
202
203        writer.flush()?;
204        Ok(())
205    }
206
207    /// Finalize the content.bin file by writing the file index and updating the header
208    fn finalize(&mut self) -> Result<()> {
209        let mut writer = self
210            .writer
211            .take()
212            .ok_or_else(|| anyhow::anyhow!("ContentWriter not initialized"))?;
213
214        // Write file index at current position
215        let index_offset = HEADER_SIZE as u64 + self.current_offset;
216
217        for entry in &self.files {
218            let path_str = entry.path.to_string_lossy();
219            let path_bytes = path_str.as_bytes();
220
221            writer.write_all(&(path_bytes.len() as u32).to_le_bytes())?;
222            writer.write_all(path_bytes)?;
223            writer.write_all(&entry.offset.to_le_bytes())?;
224            writer.write_all(&entry.length.to_le_bytes())?;
225        }
226
227        // Consume BufWriter and get the underlying File
228        let mut file = writer
229            .into_inner()
230            .map_err(|e| anyhow::anyhow!("Failed to flush BufWriter: {}", e.error()))?;
231
232        // Rewind to header and update with correct values
233        use std::io::Seek;
234        file.seek(std::io::SeekFrom::Start(0))?;
235
236        // Write correct header
237        file.write_all(MAGIC)?;
238        file.write_all(&VERSION.to_le_bytes())?;
239        file.write_all(&(self.files.len() as u64).to_le_bytes())?;
240        file.write_all(&index_offset.to_le_bytes())?;
241        file.write_all(&[0u8; 8])?; // reserved
242
243        // Final sync to disk
244        file.sync_all()?;
245
246        log::debug!(
247            "Finalized content.bin: {} files, {} bytes of content",
248            self.files.len(),
249            self.current_offset
250        );
251
252        Ok(())
253    }
254
255    /// Get the number of files
256    pub fn file_count(&self) -> usize {
257        self.files.len()
258    }
259
260    /// Get total content size
261    pub fn content_size(&self) -> usize {
262        if self.writer.is_some() || self.file_path.is_some() {
263            // Streaming mode
264            self.current_offset as usize
265        } else {
266            // In-memory mode
267            self.content.len()
268        }
269    }
270
271    /// Finalize content store if it hasn't been finalized yet
272    ///
273    /// This is safe to call multiple times - subsequent calls are no-ops.
274    pub fn finalize_if_needed(&mut self) -> Result<()> {
275        if self.writer.is_some() {
276            self.finalize()?;
277            // Clear writer to mark as finalized
278            self.writer = None;
279        }
280        Ok(())
281    }
282}
283
284impl Default for ContentWriter {
285    fn default() -> Self {
286        Self::new()
287    }
288}
289
/// Reader for memory-mapped content.bin
///
/// Provides zero-copy access to file contents: the string slices handed out
/// by the accessors borrow directly from the mapping.
pub struct ContentReader {
    // Handle of the opened content.bin; stored next to the mapping and not read again.
    _file: File,
    // Read-only mapping of the whole content.bin (header, contents and index).
    mmap: Mmap,
    // File index parsed by open(); an entry's position is its file_id.
    files: Vec<FileEntry>,
}
298
299impl ContentReader {
300    /// Open and memory-map content.bin
301    pub fn open(path: impl AsRef<Path>) -> Result<Self> {
302        let path = path.as_ref();
303
304        let file =
305            File::open(path).with_context(|| format!("Failed to open {}", path.display()))?;
306
307        let mmap = unsafe {
308            Mmap::map(&file).with_context(|| format!("Failed to mmap {}", path.display()))?
309        };
310
311        // Validate header
312        if mmap.len() < HEADER_SIZE {
313            anyhow::bail!(
314                "content.bin too small (expected at least {} bytes)",
315                HEADER_SIZE
316            );
317        }
318
319        if &mmap[0..4] != MAGIC {
320            anyhow::bail!("Invalid content.bin (wrong magic bytes)");
321        }
322
323        let version = u32::from_le_bytes([mmap[4], mmap[5], mmap[6], mmap[7]]);
324        if version != VERSION {
325            anyhow::bail!("Unsupported content.bin version: {}", version);
326        }
327
328        let num_files = u64::from_le_bytes([
329            mmap[8], mmap[9], mmap[10], mmap[11], mmap[12], mmap[13], mmap[14], mmap[15],
330        ]);
331
332        let index_offset = u64::from_le_bytes([
333            mmap[16], mmap[17], mmap[18], mmap[19], mmap[20], mmap[21], mmap[22], mmap[23],
334        ]) as usize;
335
336        // Read file index
337        let mut files = Vec::new();
338        let mut pos = index_offset;
339
340        for i in 0..num_files {
341            if pos + 4 > mmap.len() {
342                anyhow::bail!(
343                    "Truncated file index at file {} (pos={}, mmap.len()={})",
344                    i,
345                    pos,
346                    mmap.len()
347                );
348            }
349
350            let path_len =
351                u32::from_le_bytes([mmap[pos], mmap[pos + 1], mmap[pos + 2], mmap[pos + 3]])
352                    as usize;
353            pos += 4;
354
355            if pos + path_len + 16 > mmap.len() {
356                anyhow::bail!(
357                    "Truncated file entry at file {} (pos={}, path_len={}, need={}, mmap.len()={})",
358                    i,
359                    pos,
360                    path_len,
361                    pos + path_len + 16,
362                    mmap.len()
363                );
364            }
365
366            let path_bytes = &mmap[pos..pos + path_len];
367            let path_str = std::str::from_utf8(path_bytes).context("Invalid UTF-8 in file path")?;
368            let path = PathBuf::from(path_str);
369            pos += path_len;
370
371            let offset = u64::from_le_bytes([
372                mmap[pos],
373                mmap[pos + 1],
374                mmap[pos + 2],
375                mmap[pos + 3],
376                mmap[pos + 4],
377                mmap[pos + 5],
378                mmap[pos + 6],
379                mmap[pos + 7],
380            ]);
381            pos += 8;
382
383            let length = u64::from_le_bytes([
384                mmap[pos],
385                mmap[pos + 1],
386                mmap[pos + 2],
387                mmap[pos + 3],
388                mmap[pos + 4],
389                mmap[pos + 5],
390                mmap[pos + 6],
391                mmap[pos + 7],
392            ]);
393            pos += 8;
394
395            files.push(FileEntry {
396                path,
397                offset,
398                length,
399            });
400        }
401
402        Ok(Self {
403            _file: file,
404            mmap,
405            files,
406        })
407    }
408
409    /// Get file content by file_id
410    pub fn get_file_content(&self, file_id: u32) -> Result<&str> {
411        let entry = self
412            .files
413            .get(file_id as usize)
414            .ok_or_else(|| anyhow::anyhow!("Invalid file_id: {}", file_id))?;
415
416        let start = HEADER_SIZE + entry.offset as usize;
417        let end = start + entry.length as usize;
418
419        if end > self.mmap.len() {
420            anyhow::bail!("File content out of bounds");
421        }
422
423        let bytes = &self.mmap[start..end];
424        std::str::from_utf8(bytes).context("Invalid UTF-8 in file content")
425    }
426
427    /// Get file path by file_id
428    pub fn get_file_path(&self, file_id: u32) -> Option<&Path> {
429        self.files.get(file_id as usize).map(|e| e.path.as_path())
430    }
431
432    /// Get number of files
433    pub fn file_count(&self) -> usize {
434        self.files.len()
435    }
436
437    /// Get file_id (array index) by path
438    ///
439    /// This looks up a file by its path and returns the array index, which is the
440    /// correct file_id to use with get_file_content() and other methods.
441    ///
442    /// Note: This is different from database file_ids, which are AUTO INCREMENT values.
443    pub fn get_file_id_by_path(&self, path: &str) -> Option<u32> {
444        // Normalize the input path (strip ./ prefix if present)
445        let normalized_input = path.strip_prefix("./").unwrap_or(path);
446
447        self.files
448            .iter()
449            .position(|entry| {
450                // Normalize the stored path (strip ./ prefix if present)
451                let stored_path = entry.path.to_string_lossy();
452                let normalized_stored = stored_path.strip_prefix("./").unwrap_or(&stored_path);
453                normalized_stored == normalized_input
454            })
455            .map(|idx| idx as u32)
456    }
457
458    /// Get content at a specific byte offset
459    pub fn get_content_at_offset(
460        &self,
461        file_id: u32,
462        byte_offset: u32,
463        length: usize,
464    ) -> Result<&str> {
465        let entry = self
466            .files
467            .get(file_id as usize)
468            .ok_or_else(|| anyhow::anyhow!("Invalid file_id: {}", file_id))?;
469
470        let start = HEADER_SIZE + entry.offset as usize + byte_offset as usize;
471        let end = start + length;
472
473        if end > self.mmap.len() {
474            anyhow::bail!("Content out of bounds");
475        }
476
477        let bytes = &self.mmap[start..end];
478        std::str::from_utf8(bytes).context("Invalid UTF-8 in content")
479    }
480
481    /// Get context around a byte offset (for showing match results)
482    ///
483    /// Returns (lines_before, matching_line, lines_after)
484    pub fn get_context(
485        &self,
486        file_id: u32,
487        byte_offset: u32,
488        context_lines: usize,
489    ) -> Result<(Vec<String>, String, Vec<String>)> {
490        let content = self.get_file_content(file_id)?;
491        let lines: Vec<&str> = content.lines().collect();
492
493        // Find which line contains this byte offset
494        let mut current_offset = 0;
495        let mut line_idx = 0;
496
497        for (idx, line) in lines.iter().enumerate() {
498            let line_end = current_offset + line.len() + 1; // +1 for newline
499            if byte_offset as usize >= current_offset && (byte_offset as usize) < line_end {
500                line_idx = idx;
501                break;
502            }
503            current_offset = line_end;
504        }
505
506        // Extract context
507        let start = line_idx.saturating_sub(context_lines);
508        let end = (line_idx + context_lines + 1).min(lines.len());
509
510        let before: Vec<String> = lines[start..line_idx]
511            .iter()
512            .map(|s| s.to_string())
513            .collect();
514
515        let matching = lines
516            .get(line_idx)
517            .map(|s| s.to_string())
518            .unwrap_or_default();
519
520        let after: Vec<String> = lines[line_idx + 1..end]
521            .iter()
522            .map(|s| s.to_string())
523            .collect();
524
525        Ok((before, matching, after))
526    }
527
528    /// Get context around a specific line number (1-indexed)
529    ///
530    /// Returns (lines_before, lines_after)
531    pub fn get_context_by_line(
532        &self,
533        file_id: u32,
534        line_number: usize,
535        context_lines: usize,
536    ) -> Result<(Vec<String>, Vec<String>)> {
537        let content = self.get_file_content(file_id)?;
538        let lines: Vec<&str> = content.lines().collect();
539
540        // Convert from 1-indexed to 0-indexed
541        let line_idx = line_number.saturating_sub(1);
542
543        // Extract context
544        let start = line_idx.saturating_sub(context_lines);
545        let end = (line_idx + context_lines + 1).min(lines.len());
546
547        let before: Vec<String> = lines[start..line_idx]
548            .iter()
549            .map(|s| s.to_string())
550            .collect();
551
552        let after: Vec<String> = lines[line_idx + 1..end]
553            .iter()
554            .map(|s| s.to_string())
555            .collect();
556
557        Ok((before, after))
558    }
559}
560
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Create a scratch directory plus the path of a content.bin inside it.
    ///
    /// The `TempDir` guard is returned alongside the path so it is not dropped
    /// before the test finishes.
    fn scratch_store() -> (TempDir, PathBuf) {
        let dir = TempDir::new().unwrap();
        let store_path = dir.path().join("content.bin");
        (dir, store_path)
    }

    /// add_file() hands out consecutive ids and file_count() tracks them.
    #[test]
    fn test_content_writer_basic() {
        let mut store = ContentWriter::new();

        let first_id = store.add_file(PathBuf::from("test1.txt"), "Hello, world!");
        let second_id = store.add_file(PathBuf::from("test2.txt"), "Goodbye, world!");

        assert_eq!(first_id, 0);
        assert_eq!(second_id, 1);
        assert_eq!(store.file_count(), 2);
    }

    /// In-memory write followed by a read returns the same contents and paths.
    #[test]
    fn test_content_roundtrip() {
        let (_dir, store_path) = scratch_store();

        // Write
        let mut store = ContentWriter::new();
        store.add_file(PathBuf::from("file1.txt"), "First file content");
        store.add_file(PathBuf::from("file2.txt"), "Second file content");
        store.write(&store_path).unwrap();

        // Read
        let loaded = ContentReader::open(&store_path).unwrap();

        assert_eq!(loaded.file_count(), 2);
        assert_eq!(loaded.get_file_content(0).unwrap(), "First file content");
        assert_eq!(loaded.get_file_content(1).unwrap(), "Second file content");
        assert_eq!(loaded.get_file_path(0).unwrap(), Path::new("file1.txt"));
        assert_eq!(loaded.get_file_path(1).unwrap(), Path::new("file2.txt"));
    }

    /// get_context() returns the matching line with one line on either side.
    #[test]
    fn test_get_context() {
        let (_dir, store_path) = scratch_store();

        let mut store = ContentWriter::new();
        store.add_file(
            PathBuf::from("test.txt"),
            "Line 1\nLine 2\nLine 3 with match\nLine 4\nLine 5",
        );
        store.write(&store_path).unwrap();

        let loaded = ContentReader::open(&store_path).unwrap();

        // Byte offset of "Line 3" (14 = "Line 1\n" + "Line 2\n")
        let (before, matching, after) = loaded.get_context(0, 14, 1).unwrap();

        assert_eq!(before.len(), 1);
        assert_eq!(before[0], "Line 2");
        assert_eq!(matching, "Line 3 with match");
        assert_eq!(after.len(), 1);
        assert_eq!(after[0], "Line 4");
    }

    /// Streaming path: init() -> add_file() -> finalize_if_needed() round-trips.
    #[test]
    fn test_streaming_roundtrip() {
        let (_dir, store_path) = scratch_store();

        let mut store = ContentWriter::new();
        store.init(store_path.clone()).unwrap();
        store.add_file(PathBuf::from("src/main.rs"), "fn main() {}\n");
        store.add_file(
            PathBuf::from("src/lib.rs"),
            "pub fn hello() -> &'static str { \"hi\" }\n",
        );
        store.finalize_if_needed().unwrap();

        // Verify the file can be read back correctly
        let loaded = ContentReader::open(&store_path).unwrap();
        assert_eq!(loaded.file_count(), 2);
        assert_eq!(loaded.get_file_content(0).unwrap(), "fn main() {}\n");
        assert_eq!(
            loaded.get_file_content(1).unwrap(),
            "pub fn hello() -> &'static str { \"hi\" }\n"
        );
        assert_eq!(loaded.get_file_path(0).unwrap(), Path::new("src/main.rs"));
        assert_eq!(loaded.get_file_path(1).unwrap(), Path::new("src/lib.rs"));
    }

    /// Multi-line content, including the trailing newline, survives a round trip.
    #[test]
    fn test_multiline_file() {
        let (_dir, store_path) = scratch_store();

        let source_text = "fn main() {\n    println!(\"Hello\");\n}\n";

        let mut store = ContentWriter::new();
        store.add_file(PathBuf::from("main.rs"), source_text);
        store.write(&store_path).unwrap();

        let loaded = ContentReader::open(&store_path).unwrap();
        assert_eq!(loaded.get_file_content(0).unwrap(), source_text);
    }
}