// memscope_rs/export/binary/index.rs

1//! Binary file indexing system for fast allocation record lookup and filtering
2//!
3//! This module provides indexing capabilities for binary files to enable:
4//! - O(1) record location by index
5//! - Fast pre-filtering using bloom filters and range queries
6//! - Compressed index storage to minimize memory usage
7//! - Cache-friendly index structures
8
9use crate::export::binary::error::BinaryExportError;
10use crate::export::binary::format::FileHeader;
11use serde::{Deserialize, Serialize};
12use std::collections::HashMap;
13use std::path::PathBuf;
14use std::time::SystemTime;
15
/// Main binary file index containing all metadata needed for fast access.
///
/// Built once per binary file and cacheable; `is_valid_for_file` compares
/// path and content hash so a cached index is only reused for an unchanged
/// file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BinaryIndex {
    /// Version of the index format for compatibility (see `INDEX_VERSION`)
    pub version: u32,

    /// Hash of the binary file content for cache validation
    pub file_hash: u64,

    /// Path to the original binary file
    pub file_path: PathBuf,

    /// File header information parsed from the binary file
    pub header: FileHeader,

    /// String table index information (location/size/compression flag)
    pub string_table: StringTableIndex,

    /// Compressed allocation records index (relative offsets + sizes)
    pub allocations: CompactAllocationIndex,

    /// Advanced metrics segment information (`None` when the segment is absent)
    pub advanced_metrics: Option<AdvancedMetricsIndex>,

    /// When this index was created
    pub created_at: SystemTime,

    /// Size of the original binary file in bytes
    pub file_size: u64,
}
46
47impl BinaryIndex {
48    /// Current index format version
49    pub const INDEX_VERSION: u32 = 1;
50
51    /// Create a new binary index
52    pub fn new(file_path: PathBuf, file_hash: u64, file_size: u64, header: FileHeader) -> Self {
53        Self {
54            version: Self::INDEX_VERSION,
55            file_hash,
56            file_path,
57            header,
58            string_table: StringTableIndex::default(),
59            allocations: CompactAllocationIndex::default(),
60            advanced_metrics: None,
61            created_at: SystemTime::now(),
62            file_size,
63        }
64    }
65
66    /// Check if this index is valid for the given file
67    pub fn is_valid_for_file(&self, file_path: &std::path::Path, file_hash: u64) -> bool {
68        self.file_path == file_path && self.file_hash == file_hash
69    }
70
71    /// Get the number of allocation records
72    pub fn record_count(&self) -> u32 {
73        self.allocations.count
74    }
75
76    /// Get the file offset for a specific record index
77    pub fn get_record_offset(&self, record_index: usize) -> Option<u64> {
78        if record_index >= self.allocations.count as usize {
79            return None;
80        }
81
82        let relative_offset = self.allocations.relative_offsets.get(record_index)?;
83        Some(self.allocations.records_start_offset + *relative_offset as u64)
84    }
85
86    /// Get the record size for a specific record index
87    pub fn get_record_size(&self, record_index: usize) -> Option<u16> {
88        self.allocations.record_sizes.get(record_index).copied()
89    }
90
91    /// Check if quick filtering data is available
92    pub fn has_quick_filter_data(&self) -> bool {
93        self.allocations.quick_filter_data.is_some()
94    }
95}
96
/// String table index information.
///
/// Describes where the string table lives in the binary file so readers can
/// seek to it directly without scanning.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct StringTableIndex {
    /// Absolute offset of the string table in the file
    pub offset: u64,

    /// Size of the string table in bytes
    pub size: u64,

    /// Number of strings in the table
    pub string_count: u32,

    /// Whether the string table uses compression
    pub uses_compression: bool,
}
112
/// Compact allocation index using relative offsets and compressed data.
///
/// Offsets are stored as `u32` deltas from `records_start_offset` and sizes
/// as `u16`, keeping the per-record index cost to 6 bytes.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct CompactAllocationIndex {
    /// Total number of allocation records
    pub count: u32,

    /// Absolute file offset where allocation records start
    pub records_start_offset: u64,

    /// Relative offsets from `records_start_offset` (saves memory vs. absolute u64)
    pub relative_offsets: Vec<u32>,

    /// Size of each record in bytes (records are expected to be < 64KB)
    pub record_sizes: Vec<u16>,

    /// Quick filtering data for large files (optional)
    pub quick_filter_data: Option<QuickFilterData>,
}
131
132impl CompactAllocationIndex {
133    /// Create a new compact allocation index
134    pub fn new(records_start_offset: u64) -> Self {
135        Self {
136            count: 0,
137            records_start_offset,
138            relative_offsets: Vec::new(),
139            record_sizes: Vec::new(),
140            quick_filter_data: None,
141        }
142    }
143
144    /// Add a record to the index
145    pub fn add_record(&mut self, absolute_offset: u64, size: u16) -> Result<(), BinaryExportError> {
146        if absolute_offset < self.records_start_offset {
147            return Err(BinaryExportError::CorruptedData(
148                "Record offset is before records start".to_string(),
149            ));
150        }
151
152        let relative_offset = absolute_offset - self.records_start_offset;
153        if relative_offset > u32::MAX as u64 {
154            return Err(BinaryExportError::CorruptedData(
155                "Record offset too large for relative addressing".to_string(),
156            ));
157        }
158
159        self.relative_offsets.push(relative_offset as u32);
160        self.record_sizes.push(size);
161        self.count += 1;
162
163        Ok(())
164    }
165
166    /// Reserve capacity for the expected number of records
167    pub fn reserve(&mut self, capacity: usize) {
168        self.relative_offsets.reserve(capacity);
169        self.record_sizes.reserve(capacity);
170    }
171
172    /// Get memory usage of this index in bytes
173    pub fn memory_usage(&self) -> usize {
174        std::mem::size_of::<Self>()
175            + self.relative_offsets.capacity() * std::mem::size_of::<u32>()
176            + self.record_sizes.capacity() * std::mem::size_of::<u16>()
177            + self
178                .quick_filter_data
179                .as_ref()
180                .map_or(0, |qfd| qfd.memory_usage())
181    }
182}
183
/// Quick filtering data for large files to enable fast pre-filtering.
///
/// Records are grouped into fixed-size batches; each batch stores (min, max)
/// ranges so whole batches can be skipped during a filtered scan. Bloom
/// filters provide probabilistic membership tests for thread IDs and type
/// names (false positives possible, false negatives not).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QuickFilterData {
    /// Batch size used for range calculations (e.g., 1000 records per batch)
    pub batch_size: usize,

    /// (min, max) pointer value for each batch of records
    pub ptr_ranges: Vec<(usize, usize)>,

    /// (min, max) allocation size for each batch of records
    pub size_ranges: Vec<(usize, usize)>,

    /// (min, max) timestamp for each batch of records
    pub timestamp_ranges: Vec<(u64, u64)>,

    /// Bloom filter for thread IDs (serialized as bytes)
    pub thread_bloom_filter: Vec<u8>,

    /// Bloom filter for type names (serialized as bytes)
    pub type_bloom_filter: Vec<u8>,

    /// Parameters the bloom filters were built with
    pub bloom_filter_params: BloomFilterParams,
}
208
209impl QuickFilterData {
210    /// Create new quick filter data with the specified batch size
211    pub fn new(batch_size: usize) -> Self {
212        Self {
213            batch_size,
214            ptr_ranges: Vec::new(),
215            size_ranges: Vec::new(),
216            timestamp_ranges: Vec::new(),
217            thread_bloom_filter: Vec::new(),
218            type_bloom_filter: Vec::new(),
219            bloom_filter_params: BloomFilterParams::default(),
220        }
221    }
222
223    /// Get memory usage of this quick filter data in bytes
224    pub fn memory_usage(&self) -> usize {
225        std::mem::size_of::<Self>()
226            + self.ptr_ranges.capacity() * std::mem::size_of::<(usize, usize)>()
227            + self.size_ranges.capacity() * std::mem::size_of::<(usize, usize)>()
228            + self.timestamp_ranges.capacity() * std::mem::size_of::<(u64, u64)>()
229            + self.thread_bloom_filter.capacity()
230            + self.type_bloom_filter.capacity()
231    }
232
233    /// Check if a pointer value might be in the specified batch
234    pub fn ptr_might_be_in_batch(&self, batch_index: usize, ptr: usize) -> bool {
235        if let Some(&(min, max)) = self.ptr_ranges.get(batch_index) {
236            ptr >= min && ptr <= max
237        } else {
238            false
239        }
240    }
241
242    /// Check if a size value might be in the specified batch
243    pub fn size_might_be_in_batch(&self, batch_index: usize, size: usize) -> bool {
244        if let Some(&(min, max)) = self.size_ranges.get(batch_index) {
245            size >= min && size <= max
246        } else {
247            false
248        }
249    }
250
251    /// Check if a timestamp might be in the specified batch
252    pub fn timestamp_might_be_in_batch(&self, batch_index: usize, timestamp: u64) -> bool {
253        if let Some(&(min, max)) = self.timestamp_ranges.get(batch_index) {
254            timestamp >= min && timestamp <= max
255        } else {
256            false
257        }
258    }
259}
260
/// Bloom filter parameters for consistent configuration.
///
/// Stored alongside the serialized filters so lookups use exactly the same
/// hash-function count and bit size the filters were built with.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BloomFilterParams {
    /// Number of hash functions used
    pub hash_functions: u32,

    /// Size of the bloom filter in bits
    pub filter_size_bits: u32,

    /// Expected number of elements the filter was sized for
    pub expected_elements: u32,

    /// Target false positive rate (e.g., 0.01 for 1%)
    pub false_positive_rate: f64,
}
276
277impl Default for BloomFilterParams {
278    fn default() -> Self {
279        Self {
280            hash_functions: 3,
281            filter_size_bits: 8192, // 1KB
282            expected_elements: 1000,
283            false_positive_rate: 0.01, // 1%
284        }
285    }
286}
287
/// Advanced metrics segment index information.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AdvancedMetricsIndex {
    /// Absolute offset of the advanced metrics segment in the file
    pub offset: u64,

    /// Size of the advanced metrics segment in bytes
    pub size: u32,

    /// Bitmap indicating which metrics are present in the segment
    pub metrics_bitmap: u32,

    /// Per-metric section index, keyed by metric name
    pub metric_sections: HashMap<String, MetricSectionIndex>,
}
303
/// Index information for a specific metric section.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MetricSectionIndex {
    /// Offset relative to the start of the advanced metrics segment
    pub relative_offset: u32,

    /// Size of this metric section in bytes
    pub size: u32,

    /// Number of entries in this section
    pub entry_count: u32,
}
316
/// Record metadata used during index building.
#[derive(Debug, Clone)]
pub struct RecordMetadata {
    /// Pointer value
    pub ptr: usize,

    /// Allocation size
    pub size: usize,

    /// Allocation timestamp
    pub timestamp: u64,

    /// Thread ID (if available)
    pub thread_id: Option<String>,

    /// Type name (if available)
    pub type_name: Option<String>,
}

impl RecordMetadata {
    /// Create metadata for a record with no thread or type information yet.
    pub fn new(ptr: usize, size: usize, timestamp: u64) -> Self {
        Self {
            ptr,
            size,
            timestamp,
            thread_id: None,
            type_name: None,
        }
    }

    /// Builder-style setter: returns the metadata with the thread ID attached.
    pub fn with_thread_id(self, thread_id: String) -> Self {
        Self {
            thread_id: Some(thread_id),
            ..self
        }
    }

    /// Builder-style setter: returns the metadata with the type name attached.
    pub fn with_type_name(self, type_name: String) -> Self {
        Self {
            type_name: Some(type_name),
            ..self
        }
    }
}
360
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::Path;

    // Constructing a fresh index copies through all identity fields and
    // starts with zero records.
    #[test]
    fn test_binary_index_creation() {
        let file_path = PathBuf::from("test.memscope");
        let file_hash = 0x123456789ABCDEF0;
        let file_size = 1024;
        let header = FileHeader::new_legacy(100);

        let index = BinaryIndex::new(file_path.clone(), file_hash, file_size, header.clone());

        assert_eq!(index.version, BinaryIndex::INDEX_VERSION);
        assert_eq!(index.file_hash, file_hash);
        assert_eq!(index.file_path, file_path);
        assert_eq!(index.header, header);
        assert_eq!(index.file_size, file_size);
        assert_eq!(index.record_count(), 0);
    }

    // is_valid_for_file must require BOTH the path and the hash to match.
    #[test]
    fn test_index_validation() {
        let file_path = PathBuf::from("test.memscope");
        let file_hash = 0x123456789ABCDEF0;
        let header = FileHeader::new_legacy(100);

        let index = BinaryIndex::new(file_path.clone(), file_hash, 1024, header);

        // Valid file
        assert!(index.is_valid_for_file(&file_path, file_hash));

        // Invalid hash
        assert!(!index.is_valid_for_file(&file_path, 0xDEADBEEF));

        // Invalid path
        assert!(!index.is_valid_for_file(Path::new("other.memscope"), file_hash));
    }

    // Offsets are stored relative to records_start_offset (here 1000), so
    // absolute offsets 1000/1200/1500 become 0/200/500.
    #[test]
    fn test_compact_allocation_index() {
        let mut index = CompactAllocationIndex::new(1000);

        // Add some records
        assert!(index.add_record(1000, 100).is_ok());
        assert!(index.add_record(1200, 200).is_ok());
        assert!(index.add_record(1500, 150).is_ok());

        assert_eq!(index.count, 3);
        assert_eq!(index.relative_offsets, vec![0, 200, 500]);
        assert_eq!(index.record_sizes, vec![100, 200, 150]);

        // Test error conditions
        assert!(index.add_record(500, 100).is_err()); // Before start offset
    }

    // Round-trip: offsets added via add_record come back unchanged from
    // get_record_offset, and sizes from get_record_size; out-of-bounds
    // indices yield None.
    #[test]
    fn test_record_offset_calculation() {
        let file_path = PathBuf::from("test.memscope");
        let header = FileHeader::new_legacy(3);
        let mut index = BinaryIndex::new(file_path, 0x123, 1024, header);

        // Add some records to the allocation index
        index.allocations.records_start_offset = 1000;
        index
            .allocations
            .add_record(1000, 100)
            .expect("Test operation failed");
        index
            .allocations
            .add_record(1200, 200)
            .expect("Test operation failed");
        index
            .allocations
            .add_record(1500, 150)
            .expect("Test operation failed");

        // Test offset calculation
        assert_eq!(index.get_record_offset(0), Some(1000));
        assert_eq!(index.get_record_offset(1), Some(1200));
        assert_eq!(index.get_record_offset(2), Some(1500));
        assert_eq!(index.get_record_offset(3), None); // Out of bounds

        // Test size retrieval
        assert_eq!(index.get_record_size(0), Some(100));
        assert_eq!(index.get_record_size(1), Some(200));
        assert_eq!(index.get_record_size(2), Some(150));
        assert_eq!(index.get_record_size(3), None); // Out of bounds
    }

    // Batch range checks are inclusive on both ends, and an unknown batch
    // index is always a miss.
    #[test]
    fn test_quick_filter_data() {
        let mut qfd = QuickFilterData::new(1000);

        // Add some range data
        qfd.ptr_ranges.push((0x1000, 0x2000));
        qfd.ptr_ranges.push((0x3000, 0x4000));
        qfd.size_ranges.push((100, 500));
        qfd.size_ranges.push((600, 1000));
        qfd.timestamp_ranges.push((1000, 2000));
        qfd.timestamp_ranges.push((3000, 4000));

        // Test range checks
        assert!(qfd.ptr_might_be_in_batch(0, 0x1500));
        assert!(!qfd.ptr_might_be_in_batch(0, 0x2500));
        assert!(qfd.ptr_might_be_in_batch(1, 0x3500));

        assert!(qfd.size_might_be_in_batch(0, 300));
        assert!(!qfd.size_might_be_in_batch(0, 550));
        assert!(qfd.size_might_be_in_batch(1, 800));

        assert!(qfd.timestamp_might_be_in_batch(0, 1500));
        assert!(!qfd.timestamp_might_be_in_batch(0, 2500));
        assert!(qfd.timestamp_might_be_in_batch(1, 3500));

        // Test out of bounds
        assert!(!qfd.ptr_might_be_in_batch(2, 0x1500));
    }

    // Builder methods attach optional thread/type info without touching the
    // required fields.
    #[test]
    fn test_record_metadata() {
        let metadata = RecordMetadata::new(0x1000, 1024, 1234567890)
            .with_thread_id("main".to_string())
            .with_type_name("Vec<u8>".to_string());

        assert_eq!(metadata.ptr, 0x1000);
        assert_eq!(metadata.size, 1024);
        assert_eq!(metadata.timestamp, 1234567890);
        assert_eq!(metadata.thread_id, Some("main".to_string()));
        assert_eq!(metadata.type_name, Some("Vec<u8>".to_string()));
    }

    // memory_usage is capacity-based, so it must be at least the struct size
    // plus the bytes needed for the 100 stored offsets and sizes.
    #[test]
    fn test_memory_usage_calculation() {
        let mut index = CompactAllocationIndex::new(1000);

        // Add some records
        for i in 0..100 {
            index
                .add_record(1000 + i * 100, 100)
                .expect("Test operation failed");
        }

        let memory_usage = index.memory_usage();

        // Should include base struct size plus vector capacities
        let expected_min = std::mem::size_of::<CompactAllocationIndex>()
            + 100 * std::mem::size_of::<u32>()
            + 100 * std::mem::size_of::<u16>();

        assert!(memory_usage >= expected_min);
    }
}