greppy/trace/
storage.rs

1//! Binary Storage with Memory Mapping
2//!
3//! Provides fast serialization and deserialization of the SemanticIndex
4//! using memory-mapped files for instant loading.
5//!
6//! File format:
//! - Header (64 bytes): magic, version, counts, offsets
8//! - Symbols section
9//! - Tokens section
10//! - References section
11//! - Scopes section
12//! - Edges section
13//! - Files section (length-prefixed paths)
14//! - Strings section (null-terminated)
15//!
16//! @module trace/storage
17
18use std::fs::File;
19use std::io::{self, BufReader, BufWriter, Read, Write};
20use std::path::Path;
21
22use memmap2::Mmap;
23
24use super::index::{SemanticIndex, StringTable};
25use super::types::{Edge, Reference, Scope, Symbol, Token};
26use crate::core::error::{Error, Result};
27
28// =============================================================================
29// CONSTANTS
30// =============================================================================
31
/// Magic bytes that open every greppy trace index file.
///
/// `Header::validate` rejects any header whose first eight bytes differ.
const MAGIC: [u8; 8] = *b"GRPTRACE";

/// Current file format version.
///
/// `Header::validate` rejects files carrying any other version number.
const VERSION: u32 = 1;

/// Header size in bytes.
///
/// A compile-time assertion further down keeps this equal to
/// `size_of::<Header>()`; the loaders start reading sections at this offset.
const HEADER_SIZE: usize = 64;
40
41// =============================================================================
42// FILE HEADER
43// =============================================================================
44
/// File header for the binary index format
///
/// The struct is written to and read from disk as raw bytes, so the field
/// order below *is* the file layout and must not be reordered.
///
/// Layout (64 bytes):
/// - magic: [u8; 8] = 8 bytes
/// - version: u32 = 4 bytes
/// - _reserved: u32 = 4 bytes
/// - symbol_count: u32 = 4 bytes
/// - token_count: u32 = 4 bytes
/// - reference_count: u32 = 4 bytes
/// - scope_count: u32 = 4 bytes
/// - edge_count: u32 = 4 bytes
/// - file_count: u32 = 4 bytes
/// - string_size: u32 = 4 bytes
/// - _padding: [u8; 20] = 20 bytes
///
/// Total: 8 + 36 + 20 = 64 bytes
///
/// With `repr(C)` every `u32` lands on a 4-byte-aligned offset (8, 12, …, 40)
/// and the explicit `_padding` array fills the rest, so the compiler inserts
/// no implicit padding bytes.
#[derive(Debug, Clone, Copy)]
#[repr(C)]
struct Header {
    /// Magic bytes (compared against `MAGIC` on load)
    magic: [u8; 8],
    /// Format version (compared against `VERSION` on load)
    version: u32,
    /// Reserved for future use; written as 0
    _reserved: u32,
    /// Number of symbols in the symbols section
    symbol_count: u32,
    /// Number of tokens in the tokens section
    token_count: u32,
    /// Number of references in the references section
    reference_count: u32,
    /// Number of scopes in the scopes section
    scope_count: u32,
    /// Number of edges in the edges section
    edge_count: u32,
    /// Number of length-prefixed paths in the files section
    file_count: u32,
    /// Size of string table in bytes
    string_size: u32,
    /// Explicit padding to 64 bytes; written as zeros
    _padding: [u8; 20],
}
86
87impl Header {
88    fn new(index: &SemanticIndex) -> Self {
89        Self {
90            magic: MAGIC,
91            version: VERSION,
92            _reserved: 0,
93            symbol_count: index.symbols.len() as u32,
94            token_count: index.tokens.len() as u32,
95            reference_count: index.references.len() as u32,
96            scope_count: index.scopes.len() as u32,
97            edge_count: index.edges.len() as u32,
98            file_count: index.files.len() as u32,
99            string_size: index.strings.byte_size() as u32,
100            _padding: [0; 20],
101        }
102    }
103
104    fn validate(&self) -> Result<()> {
105        if self.magic != MAGIC {
106            return Err(Error::IndexError {
107                message: "Invalid trace index file (bad magic)".into(),
108            });
109        }
110        let version = self.version;
111        if version != VERSION {
112            return Err(Error::IndexError {
113                message: format!(
114                    "Unsupported trace index version {} (expected {})",
115                    version, VERSION
116                ),
117            });
118        }
119        Ok(())
120    }
121
122    fn as_bytes(&self) -> &[u8] {
123        unsafe {
124            std::slice::from_raw_parts(
125                self as *const Self as *const u8,
126                std::mem::size_of::<Self>(),
127            )
128        }
129    }
130
131    fn from_bytes(bytes: &[u8]) -> Result<Self> {
132        if bytes.len() < std::mem::size_of::<Self>() {
133            return Err(Error::IndexError {
134                message: "Invalid trace index file (header too small)".into(),
135            });
136        }
137
138        // Safety: Header is repr(C, packed) with fixed layout
139        let header = unsafe { std::ptr::read_unaligned(bytes.as_ptr() as *const Self) };
140
141        header.validate()?;
142        Ok(header)
143    }
144}
145
146// Compile-time size check
147const _: () = {
148    assert!(std::mem::size_of::<Header>() == HEADER_SIZE);
149};
150
151// =============================================================================
152// SAVE INDEX
153// =============================================================================
154
155/// Save a SemanticIndex to a binary file
156pub fn save_index(index: &SemanticIndex, path: impl AsRef<Path>) -> Result<()> {
157    let path = path.as_ref();
158    let file = File::create(path)?;
159    let mut writer = BufWriter::with_capacity(64 * 1024, file);
160
161    // Write header
162    let header = Header::new(index);
163    writer.write_all(header.as_bytes())?;
164
165    // Write symbols
166    write_slice(&mut writer, &index.symbols)?;
167
168    // Write tokens
169    write_slice(&mut writer, &index.tokens)?;
170
171    // Write references
172    write_slice(&mut writer, &index.references)?;
173
174    // Write scopes
175    write_slice(&mut writer, &index.scopes)?;
176
177    // Write edges
178    write_slice(&mut writer, &index.edges)?;
179
180    // Write files (length-prefixed UTF-8 paths)
181    for path in &index.files {
182        let path_bytes = path.to_string_lossy().as_bytes().to_vec();
183        let len = path_bytes.len() as u32;
184        writer.write_all(&len.to_le_bytes())?;
185        writer.write_all(&path_bytes)?;
186    }
187
188    // Write string table
189    writer.write_all(index.strings.as_bytes())?;
190
191    writer.flush()?;
192    Ok(())
193}
194
/// Write a slice of repr(C) types to the writer
///
/// The elements are emitted as their in-memory bytes, back to back, in the
/// host's native byte order. Nothing is written for an empty slice.
fn write_slice<T, W: Write>(writer: &mut W, slice: &[T]) -> io::Result<()> {
    let byte_len = std::mem::size_of_val(slice);
    let start = slice.as_ptr().cast::<u8>();
    // SAFETY: `start` and `byte_len` describe exactly the memory occupied by
    // `slice`, which stays borrowed while `raw` is alive. Callers must only
    // pass element types without padding bytes, since padding would be read
    // here as (uninitialised) `u8`.
    let raw = unsafe { std::slice::from_raw_parts(start, byte_len) };
    writer.write_all(raw)
}
205
206// =============================================================================
207// LOAD INDEX
208// =============================================================================
209
210/// Load a SemanticIndex from a binary file
211///
212/// This uses memory mapping for fast access to large indices.
213pub fn load_index(path: impl AsRef<Path>) -> Result<SemanticIndex> {
214    let path = path.as_ref();
215    let file = File::open(path)?;
216    let mmap = unsafe { Mmap::map(&file)? };
217
218    // Parse header
219    let header = Header::from_bytes(&mmap)?;
220    let mut offset = HEADER_SIZE;
221
222    // Read symbols
223    let symbols: Vec<Symbol> = read_vec(&mmap, &mut offset, header.symbol_count as usize)?;
224
225    // Read tokens
226    let tokens: Vec<Token> = read_vec(&mmap, &mut offset, header.token_count as usize)?;
227
228    // Read references
229    let references: Vec<Reference> = read_vec(&mmap, &mut offset, header.reference_count as usize)?;
230
231    // Read scopes
232    let scopes: Vec<Scope> = read_vec(&mmap, &mut offset, header.scope_count as usize)?;
233
234    // Read edges
235    let edges: Vec<Edge> = read_vec(&mmap, &mut offset, header.edge_count as usize)?;
236
237    // Read files
238    let mut files = Vec::with_capacity(header.file_count as usize);
239    for _ in 0..header.file_count {
240        if offset + 4 > mmap.len() {
241            return Err(Error::IndexError {
242                message: "Truncated trace index file (files section)".into(),
243            });
244        }
245        let len = u32::from_le_bytes([
246            mmap[offset],
247            mmap[offset + 1],
248            mmap[offset + 2],
249            mmap[offset + 3],
250        ]) as usize;
251        offset += 4;
252
253        if offset + len > mmap.len() {
254            return Err(Error::IndexError {
255                message: "Truncated trace index file (file path)".into(),
256            });
257        }
258        let path_str =
259            std::str::from_utf8(&mmap[offset..offset + len]).map_err(|e| Error::IndexError {
260                message: format!("Invalid UTF-8 in file path: {}", e),
261            })?;
262        files.push(path_str.into());
263        offset += len;
264    }
265
266    // Read string table
267    let string_bytes = if offset < mmap.len() {
268        mmap[offset..].to_vec()
269    } else {
270        Vec::new()
271    };
272    let strings = StringTable::from_bytes(string_bytes);
273
274    // Build the index
275    let mut index = SemanticIndex {
276        symbols,
277        tokens,
278        references,
279        scopes,
280        edges,
281        symbol_by_name: Default::default(),
282        token_by_name: Default::default(),
283        incoming_edges: Default::default(),
284        outgoing_edges: Default::default(),
285        refs_to_symbol: Default::default(),
286        files,
287        strings,
288        entry_points: Default::default(),
289    };
290
291    // Rebuild lookup structures
292    index.rebuild_lookups();
293
294    Ok(index)
295}
296
297/// Read a vector of repr(C) types from memory
298fn read_vec<T: Clone>(mmap: &Mmap, offset: &mut usize, count: usize) -> Result<Vec<T>> {
299    let size = count * std::mem::size_of::<T>();
300    if *offset + size > mmap.len() {
301        return Err(Error::IndexError {
302            message: format!(
303                "Truncated trace index file at offset {} (need {} bytes, have {})",
304                offset,
305                size,
306                mmap.len() - *offset
307            ),
308        });
309    }
310
311    let slice = &mmap[*offset..*offset + size];
312    *offset += size;
313
314    // Safety: We're reading repr(C) packed structs with known layout
315    let result = unsafe {
316        let ptr = slice.as_ptr() as *const T;
317        std::slice::from_raw_parts(ptr, count).to_vec()
318    };
319
320    Ok(result)
321}
322
323// =============================================================================
324// LOAD INDEX (STREAMING)
325// =============================================================================
326
327/// Load a SemanticIndex from a file using streaming (no mmap)
328///
329/// Use this for smaller files or when mmap is not available.
330pub fn load_index_streaming(path: impl AsRef<Path>) -> Result<SemanticIndex> {
331    let path = path.as_ref();
332    let file = File::open(path)?;
333    let mut reader = BufReader::with_capacity(64 * 1024, file);
334
335    // Read header
336    let mut header_bytes = [0u8; HEADER_SIZE];
337    reader.read_exact(&mut header_bytes)?;
338    let header = Header::from_bytes(&header_bytes)?;
339
340    // Read symbols
341    let symbols: Vec<Symbol> = read_vec_streaming(&mut reader, header.symbol_count as usize)?;
342
343    // Read tokens
344    let tokens: Vec<Token> = read_vec_streaming(&mut reader, header.token_count as usize)?;
345
346    // Read references
347    let references: Vec<Reference> =
348        read_vec_streaming(&mut reader, header.reference_count as usize)?;
349
350    // Read scopes
351    let scopes: Vec<Scope> = read_vec_streaming(&mut reader, header.scope_count as usize)?;
352
353    // Read edges
354    let edges: Vec<Edge> = read_vec_streaming(&mut reader, header.edge_count as usize)?;
355
356    // Read files
357    let mut files = Vec::with_capacity(header.file_count as usize);
358    for _ in 0..header.file_count {
359        let mut len_bytes = [0u8; 4];
360        reader.read_exact(&mut len_bytes)?;
361        let len = u32::from_le_bytes(len_bytes) as usize;
362
363        let mut path_bytes = vec![0u8; len];
364        reader.read_exact(&mut path_bytes)?;
365        let path_str = std::str::from_utf8(&path_bytes).map_err(|e| Error::IndexError {
366            message: format!("Invalid UTF-8 in file path: {}", e),
367        })?;
368        files.push(path_str.into());
369    }
370
371    // Read string table
372    let mut string_bytes = Vec::new();
373    reader.read_to_end(&mut string_bytes)?;
374    let strings = StringTable::from_bytes(string_bytes);
375
376    // Build the index
377    let mut index = SemanticIndex {
378        symbols,
379        tokens,
380        references,
381        scopes,
382        edges,
383        symbol_by_name: Default::default(),
384        token_by_name: Default::default(),
385        incoming_edges: Default::default(),
386        outgoing_edges: Default::default(),
387        refs_to_symbol: Default::default(),
388        files,
389        strings,
390        entry_points: Default::default(),
391    };
392
393    // Rebuild lookup structures
394    index.rebuild_lookups();
395
396    Ok(index)
397}
398
/// Read a vector of repr(C) types from a reader
///
/// Consumes exactly `count * size_of::<T>()` bytes from `reader` and returns
/// them as `count` elements of `T`. The bytes are copied into a freshly
/// allocated, properly aligned `Vec<T>`; they are never reinterpreted in
/// place, so the byte buffer's alignment does not matter and `count == 0`
/// yields an empty vector.
///
/// `T` must be a plain-data type for which the bytes read form a valid
/// value; elements are duplicated bit-for-bit, `Clone` is not called.
///
/// # Errors
///
/// - `InvalidData` when `count * size_of::<T>()` overflows `usize`.
/// - `UnexpectedEof` when the reader ends before all bytes were read.
/// - Any other error reported by the reader.
fn read_vec_streaming<T: Clone, R: Read>(reader: &mut R, count: usize) -> io::Result<Vec<T>> {
    let size = count
        .checked_mul(std::mem::size_of::<T>())
        .ok_or_else(|| {
            io::Error::new(
                io::ErrorKind::InvalidData,
                "Trace index section size overflows usize",
            )
        })?;

    // Pull the bytes through `take` so the buffer grows with the data that
    // actually exists, rather than allocating `size` bytes on the say-so of
    // an unvalidated header count.
    let mut bytes: Vec<u8> = Vec::new();
    let got = reader.by_ref().take(size as u64).read_to_end(&mut bytes)?;
    if got != size {
        return Err(io::Error::new(
            io::ErrorKind::UnexpectedEof,
            "Truncated trace index file (section shorter than header count)",
        ));
    }

    let mut result: Vec<T> = Vec::with_capacity(count);
    // SAFETY: `bytes` holds exactly `size` initialised bytes, the destination
    // has capacity for `count` elements (= `size` bytes), and the two buffers
    // are distinct allocations. Copying as `u8` imposes no alignment
    // requirement on the source. After the copy the first `count` elements
    // are initialised, so `set_len(count)` is sound provided the bytes are
    // valid `T` values (the caller's contract for this file format).
    unsafe {
        std::ptr::copy_nonoverlapping(bytes.as_ptr(), result.as_mut_ptr().cast::<u8>(), size);
        result.set_len(count);
    }

    Ok(result)
}
413
414// =============================================================================
415// UTILITIES
416// =============================================================================
417
/// Get the default trace index path for a project
///
/// The result is `<project_root>/.greppy/trace.idx`; nothing is created on
/// disk.
pub fn trace_index_path(project_root: impl AsRef<Path>) -> std::path::PathBuf {
    let mut index_path = project_root.as_ref().to_path_buf();
    index_path.push(".greppy");
    index_path.push("trace.idx");
    index_path
}
422
423/// Check if a trace index exists for a project
424pub fn trace_index_exists(project_root: impl AsRef<Path>) -> bool {
425    trace_index_path(project_root).exists()
426}
427
428// =============================================================================
429// TESTS
430// =============================================================================
431
#[cfg(test)]
mod tests {
    use super::*;
    use crate::trace::types::{RefKind, ScopeKind, SymbolFlags, SymbolKind, TokenKind};
    use tempfile::tempdir;

    /// Builds a small index with one file, two function symbols (`main`
    /// flagged as an entry point, and `helper`), two tokens, one reference,
    /// two scopes and one edge.
    fn create_test_index() -> SemanticIndex {
        let mut index = SemanticIndex::new();

        // One source file; both names are interned once and reused below.
        let file_id = index.add_file("src/main.rs".into());
        let main_name = index.strings.intern("main");
        let helper_name = index.strings.intern("helper");

        // Symbols
        let main_symbol = Symbol::new(
            0,
            main_name,
            file_id,
            SymbolKind::Function,
            SymbolFlags::IS_ENTRY_POINT,
            1,
            10,
        );
        index.add_symbol(main_symbol, "main");

        let helper_symbol = Symbol::new(
            1,
            helper_name,
            file_id,
            SymbolKind::Function,
            SymbolFlags::empty(),
            12,
            20,
        );
        index.add_symbol(helper_symbol, "helper");

        // Tokens
        let main_token = Token::new(0, main_name, file_id, 1, 4, TokenKind::Identifier, 0);
        index.add_token(main_token, "main");
        let helper_token = Token::new(1, helper_name, file_id, 5, 4, TokenKind::Call, 0);
        index.add_token(helper_token, "helper");

        // References
        index.add_reference(Reference::new(1, 1, RefKind::Call));

        // Scopes
        index.add_scope(Scope::file_scope(0, file_id, 25));
        index.add_scope(Scope::new(
            1,
            ScopeKind::Function,
            file_id,
            0,
            1,
            10,
            main_name,
        ));

        // Edges
        index.add_edge(Edge::new(0, 1, 5));

        index
    }

    #[test]
    fn test_save_load_roundtrip() {
        let tmp = tempdir().unwrap();
        let index_file = tmp.path().join("trace.idx");

        let original = create_test_index();
        save_index(&original, &index_file).unwrap();
        let loaded = load_index(&index_file).unwrap();

        // Every section must come back with the same number of entries.
        let original_counts = [
            original.symbols.len(),
            original.tokens.len(),
            original.references.len(),
            original.scopes.len(),
            original.edges.len(),
            original.files.len(),
        ];
        let loaded_counts = [
            loaded.symbols.len(),
            loaded.tokens.len(),
            loaded.references.len(),
            loaded.scopes.len(),
            loaded.edges.len(),
            loaded.files.len(),
        ];
        assert_eq!(loaded_counts, original_counts);

        // Lookup structures are rebuilt on load, not stored in the file.
        for name in ["main", "helper"].iter() {
            assert!(loaded.symbols_by_name(name).is_some());
        }
        assert_eq!(loaded.entry_points.len(), 1);

        // Call graph: symbol 0 calls symbol 1.
        assert_eq!(loaded.callers(1), &[0]);
        assert_eq!(loaded.callees(0), &[1]);
    }

    #[test]
    fn test_streaming_load() {
        let tmp = tempdir().unwrap();
        let index_file = tmp.path().join("trace.idx");

        let original = create_test_index();
        save_index(&original, &index_file).unwrap();
        let loaded = load_index_streaming(&index_file).unwrap();

        assert_eq!(loaded.symbols.len(), original.symbols.len());
        assert_eq!(loaded.tokens.len(), original.tokens.len());
    }

    #[test]
    fn test_header_validation() {
        // All zeros: the magic does not match, so decoding must fail.
        let zeroed = [0u8; HEADER_SIZE];
        assert!(Header::from_bytes(&zeroed).is_err());

        // Correct magic and version, everything else zero: must decode.
        // NOTE(review): the header is decoded in native byte order, so this
        // little-endian version encoding only matches on little-endian targets.
        let mut well_formed = [0u8; HEADER_SIZE];
        well_formed[..8].copy_from_slice(&MAGIC);
        well_formed[8..12].copy_from_slice(&VERSION.to_le_bytes());
        assert!(Header::from_bytes(&well_formed).is_ok());
    }
}