// memscope_rs/export/binary/index.rs

//! Binary file indexing system for fast allocation record lookup and filtering
//!
//! This module provides indexing capabilities for binary files to enable:
//! - O(1) record location by index
//! - Fast pre-filtering using bloom filters and range queries
//! - Compressed index storage to minimize memory usage
//! - Cache-friendly index structures

use crate::export::binary::error::BinaryExportError;
use crate::export::binary::format::FileHeader;
use bincode::{Decode, Encode};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::path::PathBuf;
use std::time::SystemTime;
/// Main binary file index containing all metadata needed for fast access.
///
/// NOTE(review): the `bincode` `Encode`/`Decode` derives serialize fields in
/// declaration order — do not reorder fields without bumping `INDEX_VERSION`.
#[derive(Debug, Clone, Serialize, Deserialize, Encode, Decode)]
pub struct BinaryIndex {
    /// Version of the index format for compatibility (see `INDEX_VERSION`)
    pub version: u32,

    /// Hash of the binary file content for cache validation
    pub file_hash: u64,

    /// Path to the original binary file
    pub file_path: PathBuf,

    /// File header information copied from the indexed file
    pub header: FileHeader,

    /// String table index information
    pub string_table: StringTableIndex,

    /// Compressed allocation records index
    pub allocations: CompactAllocationIndex,

    /// Advanced metrics segment information (None when the segment is absent)
    pub advanced_metrics: Option<AdvancedMetricsIndex>,

    /// When this index was created
    pub created_at: SystemTime,

    /// Size of the original binary file in bytes
    pub file_size: u64,
}
47
48impl BinaryIndex {
49    /// Current index format version
50    pub const INDEX_VERSION: u32 = 1;
51
52    /// Create a new binary index
53    pub fn new(file_path: PathBuf, file_hash: u64, file_size: u64, header: FileHeader) -> Self {
54        Self {
55            version: Self::INDEX_VERSION,
56            file_hash,
57            file_path,
58            header,
59            string_table: StringTableIndex::default(),
60            allocations: CompactAllocationIndex::default(),
61            advanced_metrics: None,
62            created_at: SystemTime::now(),
63            file_size,
64        }
65    }
66
67    /// Check if this index is valid for the given file
68    pub fn is_valid_for_file(&self, file_path: &std::path::Path, file_hash: u64) -> bool {
69        self.file_path == file_path && self.file_hash == file_hash
70    }
71
72    /// Get the number of allocation records
73    pub fn record_count(&self) -> u32 {
74        self.allocations.count
75    }
76
77    /// Get the file offset for a specific record index
78    pub fn get_record_offset(&self, record_index: usize) -> Option<u64> {
79        if record_index >= self.allocations.count as usize {
80            return None;
81        }
82
83        let relative_offset = self.allocations.relative_offsets.get(record_index)?;
84        Some(self.allocations.records_start_offset + *relative_offset as u64)
85    }
86
87    /// Get the record size for a specific record index
88    pub fn get_record_size(&self, record_index: usize) -> Option<u16> {
89        self.allocations.record_sizes.get(record_index).copied()
90    }
91
92    /// Check if quick filtering data is available
93    pub fn has_quick_filter_data(&self) -> bool {
94        self.allocations.quick_filter_data.is_some()
95    }
96}
97
/// String table index information.
///
/// NOTE(review): serialized via `bincode` in declaration order — keep field
/// order stable across index versions.
#[derive(Debug, Clone, Serialize, Deserialize, Default, Encode, Decode)]
pub struct StringTableIndex {
    /// Absolute offset of the string table in the file
    pub offset: u64,

    /// Size of the string table in bytes
    pub size: u64,

    /// Number of strings in the table
    pub string_count: u32,

    /// Whether the string table uses compression
    pub uses_compression: bool,
}
113
/// Compact allocation index using relative offsets and compressed data.
///
/// Invariant (maintained by `add_record`): `relative_offsets` and
/// `record_sizes` are parallel vectors and
/// `relative_offsets.len() == record_sizes.len() == count as usize`.
#[derive(Debug, Clone, Serialize, Deserialize, Default, Encode, Decode)]
pub struct CompactAllocationIndex {
    /// Total number of allocation records
    pub count: u32,

    /// Absolute offset where allocation records start
    pub records_start_offset: u64,

    /// Relative offsets from records_start_offset (u32 saves memory vs u64)
    pub relative_offsets: Vec<u32>,

    /// Size of each record in bytes (most records are < 64KB, hence u16)
    pub record_sizes: Vec<u16>,

    /// Quick filtering data for large files (optional)
    pub quick_filter_data: Option<QuickFilterData>,
}
132
133impl CompactAllocationIndex {
134    /// Create a new compact allocation index
135    pub fn new(records_start_offset: u64) -> Self {
136        Self {
137            count: 0,
138            records_start_offset,
139            relative_offsets: Vec::new(),
140            record_sizes: Vec::new(),
141            quick_filter_data: None,
142        }
143    }
144
145    /// Add a record to the index
146    pub fn add_record(&mut self, absolute_offset: u64, size: u16) -> Result<(), BinaryExportError> {
147        if absolute_offset < self.records_start_offset {
148            return Err(BinaryExportError::CorruptedData(
149                "Record offset is before records start".to_string(),
150            ));
151        }
152
153        let relative_offset = absolute_offset - self.records_start_offset;
154        if relative_offset > u32::MAX as u64 {
155            return Err(BinaryExportError::CorruptedData(
156                "Record offset too large for relative addressing".to_string(),
157            ));
158        }
159
160        self.relative_offsets.push(relative_offset as u32);
161        self.record_sizes.push(size);
162        self.count += 1;
163
164        Ok(())
165    }
166
167    /// Reserve capacity for the expected number of records
168    pub fn reserve(&mut self, capacity: usize) {
169        self.relative_offsets.reserve(capacity);
170        self.record_sizes.reserve(capacity);
171    }
172
173    /// Get memory usage of this index in bytes
174    pub fn memory_usage(&self) -> usize {
175        std::mem::size_of::<Self>()
176            + self.relative_offsets.capacity() * std::mem::size_of::<u32>()
177            + self.record_sizes.capacity() * std::mem::size_of::<u16>()
178            + self
179                .quick_filter_data
180                .as_ref()
181                .map_or(0, |qfd| qfd.memory_usage())
182    }
183}
184
/// Quick filtering data for large files to enable fast pre-filtering.
///
/// Records are grouped into batches of `batch_size`; each `*_ranges` vector
/// holds one (min, max) pair per batch so a query can skip whole batches.
/// NOTE(review): presumably the three range vectors are kept the same length
/// (one entry per batch) by the index builder — confirm at the build site.
#[derive(Debug, Clone, Serialize, Deserialize, Encode, Decode)]
pub struct QuickFilterData {
    /// Batch size used for range calculations (e.g., 1000 records per batch)
    pub batch_size: usize,

    /// Pointer value (min, max) ranges for each batch of records
    pub ptr_ranges: Vec<(usize, usize)>,

    /// Size (min, max) ranges for each batch of records
    pub size_ranges: Vec<(usize, usize)>,

    /// Timestamp (min, max) ranges for each batch of records
    pub timestamp_ranges: Vec<(u64, u64)>,

    /// Bloom filter for thread IDs (serialized as bytes)
    pub thread_bloom_filter: Vec<u8>,

    /// Bloom filter for type names (serialized as bytes)
    pub type_bloom_filter: Vec<u8>,

    /// Bloom filter parameters shared by both filters above
    pub bloom_filter_params: BloomFilterParams,
}
209
210impl QuickFilterData {
211    /// Create new quick filter data with the specified batch size
212    pub fn new(batch_size: usize) -> Self {
213        Self {
214            batch_size,
215            ptr_ranges: Vec::new(),
216            size_ranges: Vec::new(),
217            timestamp_ranges: Vec::new(),
218            thread_bloom_filter: Vec::new(),
219            type_bloom_filter: Vec::new(),
220            bloom_filter_params: BloomFilterParams::default(),
221        }
222    }
223
224    /// Get memory usage of this quick filter data in bytes
225    pub fn memory_usage(&self) -> usize {
226        std::mem::size_of::<Self>()
227            + self.ptr_ranges.capacity() * std::mem::size_of::<(usize, usize)>()
228            + self.size_ranges.capacity() * std::mem::size_of::<(usize, usize)>()
229            + self.timestamp_ranges.capacity() * std::mem::size_of::<(u64, u64)>()
230            + self.thread_bloom_filter.capacity()
231            + self.type_bloom_filter.capacity()
232    }
233
234    /// Check if a pointer value might be in the specified batch
235    pub fn ptr_might_be_in_batch(&self, batch_index: usize, ptr: usize) -> bool {
236        if let Some(&(min, max)) = self.ptr_ranges.get(batch_index) {
237            ptr >= min && ptr <= max
238        } else {
239            false
240        }
241    }
242
243    /// Check if a size value might be in the specified batch
244    pub fn size_might_be_in_batch(&self, batch_index: usize, size: usize) -> bool {
245        if let Some(&(min, max)) = self.size_ranges.get(batch_index) {
246            size >= min && size <= max
247        } else {
248            false
249        }
250    }
251
252    /// Check if a timestamp might be in the specified batch
253    pub fn timestamp_might_be_in_batch(&self, batch_index: usize, timestamp: u64) -> bool {
254        if let Some(&(min, max)) = self.timestamp_ranges.get(batch_index) {
255            timestamp >= min && timestamp <= max
256        } else {
257            false
258        }
259    }
260}
261
/// Bloom filter parameters for consistent configuration.
///
/// Stored alongside the serialized filters so readers can interpret the
/// filter bytes with the same settings the writer used.
#[derive(Debug, Clone, Serialize, Deserialize, Encode, Decode)]
pub struct BloomFilterParams {
    /// Number of hash functions used
    pub hash_functions: u32,

    /// Size of the bloom filter in bits
    pub filter_size_bits: u32,

    /// Expected number of elements the filter was sized for
    pub expected_elements: u32,

    /// Target false positive rate (e.g. 0.01 for 1%)
    pub false_positive_rate: f64,
}
277
278impl Default for BloomFilterParams {
279    fn default() -> Self {
280        Self {
281            hash_functions: 3,
282            filter_size_bits: 8192, // 1KB
283            expected_elements: 1000,
284            false_positive_rate: 0.01, // 1%
285        }
286    }
287}
288
/// Advanced metrics segment index information.
#[derive(Debug, Clone, Serialize, Deserialize, Encode, Decode)]
pub struct AdvancedMetricsIndex {
    /// Absolute offset of the advanced metrics segment in the file
    pub offset: u64,

    /// Size of the advanced metrics segment in bytes
    pub size: u32,

    /// Bitmap indicating which metrics are present
    /// (NOTE(review): bit assignments are defined by the segment writer —
    /// not visible from this module)
    pub metrics_bitmap: u32,

    /// Index of individual metric sections within the segment, keyed by name
    pub metric_sections: HashMap<String, MetricSectionIndex>,
}
304
/// Index information for a specific metric section.
#[derive(Debug, Clone, Serialize, Deserialize, Encode, Decode)]
pub struct MetricSectionIndex {
    /// Offset relative to the start of the advanced metrics segment
    pub relative_offset: u32,

    /// Size of this metric section in bytes
    pub size: u32,

    /// Number of entries in this section
    pub entry_count: u32,
}
317
/// Record metadata used during index building.
///
/// Not serialized into the index itself; feeds range and bloom-filter
/// construction for `QuickFilterData`.
#[derive(Debug, Clone)]
pub struct RecordMetadata {
    /// Pointer value of the allocation
    pub ptr: usize,

    /// Allocation size in bytes
    pub size: usize,

    /// Allocation timestamp
    pub timestamp: u64,

    /// Thread ID (if available)
    pub thread_id: Option<String>,

    /// Type name (if available)
    pub type_name: Option<String>,
}
336
337impl RecordMetadata {
338    /// Create new record metadata
339    pub fn new(ptr: usize, size: usize, timestamp: u64) -> Self {
340        Self {
341            ptr,
342            size,
343            timestamp,
344            thread_id: None,
345            type_name: None,
346        }
347    }
348
349    /// Set thread ID
350    pub fn with_thread_id(mut self, thread_id: String) -> Self {
351        self.thread_id = Some(thread_id);
352        self
353    }
354
355    /// Set type name
356    pub fn with_type_name(mut self, type_name: String) -> Self {
357        self.type_name = Some(type_name);
358        self
359    }
360}
361
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::Path;

    /// A fresh index carries the constructor arguments through unchanged
    /// and starts with zero allocation records.
    #[test]
    fn test_binary_index_creation() {
        let file_path = PathBuf::from("test.memscope");
        let file_hash = 0x123456789ABCDEF0;
        let file_size = 1024;
        let header = FileHeader::new_legacy(100);

        let index = BinaryIndex::new(file_path.clone(), file_hash, file_size, header.clone());

        assert_eq!(index.version, BinaryIndex::INDEX_VERSION);
        assert_eq!(index.file_hash, file_hash);
        assert_eq!(index.file_path, file_path);
        assert_eq!(index.header, header);
        assert_eq!(index.file_size, file_size);
        assert_eq!(index.record_count(), 0);
    }

    /// Cache validation requires BOTH the path and the content hash to match.
    #[test]
    fn test_index_validation() {
        let file_path = PathBuf::from("test.memscope");
        let file_hash = 0x123456789ABCDEF0;
        let header = FileHeader::new_legacy(100);

        let index = BinaryIndex::new(file_path.clone(), file_hash, 1024, header);

        // Valid file
        assert!(index.is_valid_for_file(&file_path, file_hash));

        // Invalid hash
        assert!(!index.is_valid_for_file(&file_path, 0xDEADBEEF));

        // Invalid path
        assert!(!index.is_valid_for_file(Path::new("other.memscope"), file_hash));
    }

    /// add_record stores offsets relative to records_start_offset and keeps
    /// count in sync with both vectors; offsets before the start are rejected.
    #[test]
    fn test_compact_allocation_index() {
        let mut index = CompactAllocationIndex::new(1000);

        // Add some records
        assert!(index.add_record(1000, 100).is_ok());
        assert!(index.add_record(1200, 200).is_ok());
        assert!(index.add_record(1500, 150).is_ok());

        assert_eq!(index.count, 3);
        assert_eq!(index.relative_offsets, vec![0, 200, 500]);
        assert_eq!(index.record_sizes, vec![100, 200, 150]);

        // Test error conditions
        assert!(index.add_record(500, 100).is_err()); // Before start offset
    }

    /// get_record_offset reconstructs absolute offsets from the stored
    /// relative ones; both lookups return None past the last record.
    #[test]
    fn test_record_offset_calculation() {
        let file_path = PathBuf::from("test.memscope");
        let header = FileHeader::new_legacy(3);
        let mut index = BinaryIndex::new(file_path, 0x123, 1024, header);

        // Add some records to the allocation index
        index.allocations.records_start_offset = 1000;
        index
            .allocations
            .add_record(1000, 100)
            .expect("Test operation failed");
        index
            .allocations
            .add_record(1200, 200)
            .expect("Test operation failed");
        index
            .allocations
            .add_record(1500, 150)
            .expect("Test operation failed");

        // Test offset calculation
        assert_eq!(index.get_record_offset(0), Some(1000));
        assert_eq!(index.get_record_offset(1), Some(1200));
        assert_eq!(index.get_record_offset(2), Some(1500));
        assert_eq!(index.get_record_offset(3), None); // Out of bounds

        // Test size retrieval
        assert_eq!(index.get_record_size(0), Some(100));
        assert_eq!(index.get_record_size(1), Some(200));
        assert_eq!(index.get_record_size(2), Some(150));
        assert_eq!(index.get_record_size(3), None); // Out of bounds
    }

    /// Batch range checks are inclusive on both ends and return false for a
    /// batch index with no recorded range.
    #[test]
    fn test_quick_filter_data() {
        let mut qfd = QuickFilterData::new(1000);

        // Add some range data
        qfd.ptr_ranges.push((0x1000, 0x2000));
        qfd.ptr_ranges.push((0x3000, 0x4000));
        qfd.size_ranges.push((100, 500));
        qfd.size_ranges.push((600, 1000));
        qfd.timestamp_ranges.push((1000, 2000));
        qfd.timestamp_ranges.push((3000, 4000));

        // Test range checks
        assert!(qfd.ptr_might_be_in_batch(0, 0x1500));
        assert!(!qfd.ptr_might_be_in_batch(0, 0x2500));
        assert!(qfd.ptr_might_be_in_batch(1, 0x3500));

        assert!(qfd.size_might_be_in_batch(0, 300));
        assert!(!qfd.size_might_be_in_batch(0, 550));
        assert!(qfd.size_might_be_in_batch(1, 800));

        assert!(qfd.timestamp_might_be_in_batch(0, 1500));
        assert!(!qfd.timestamp_might_be_in_batch(0, 2500));
        assert!(qfd.timestamp_might_be_in_batch(1, 3500));

        // Test out of bounds
        assert!(!qfd.ptr_might_be_in_batch(2, 0x1500));
    }

    /// Builder methods populate the optional fields without disturbing the
    /// mandatory ones.
    #[test]
    fn test_record_metadata() {
        let metadata = RecordMetadata::new(0x1000, 1024, 1234567890)
            .with_thread_id("main".to_string())
            .with_type_name("Vec<u8>".to_string());

        assert_eq!(metadata.ptr, 0x1000);
        assert_eq!(metadata.size, 1024);
        assert_eq!(metadata.timestamp, 1234567890);
        assert_eq!(metadata.thread_id, Some("main".to_string()));
        assert_eq!(metadata.type_name, Some("Vec<u8>".to_string()));
    }

    /// memory_usage must account for at least the struct plus the live
    /// elements of both parallel vectors (capacity may exceed length).
    #[test]
    fn test_memory_usage_calculation() {
        let mut index = CompactAllocationIndex::new(1000);

        // Add some records
        for i in 0..100 {
            index
                .add_record(1000 + i * 100, 100)
                .expect("Test operation failed");
        }

        let memory_usage = index.memory_usage();

        // Should include base struct size plus vector capacities
        let expected_min = std::mem::size_of::<CompactAllocationIndex>()
            + 100 * std::mem::size_of::<u32>()
            + 100 * std::mem::size_of::<u16>();

        assert!(memory_usage >= expected_min);
    }
}