memscope_rs/export/binary/index_builder.rs

//! Binary index builder for creating optimized indexes from binary files
//!
//! This module provides functionality to scan binary files and build indexes
//! that enable fast record lookup and filtering operations.
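//!
//! A minimal usage sketch (the import path and file name below are
//! illustrative, so the example is marked `ignore` rather than run as a
//! doctest):
//!
//! ```ignore
//! use memscope_rs::export::binary::index_builder::BinaryIndexBuilder;
//!
//! let index = BinaryIndexBuilder::new()
//!     .with_quick_filter_threshold(500)
//!     .build_index("memory_export.memscope")
//!     .expect("failed to build index");
//! println!("indexed {} records", index.record_count());
//! ```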

use crate::export::binary::error::BinaryExportError;
use crate::export::binary::format::{FileHeader, ALLOCATION_RECORD_TYPE, HEADER_SIZE};
use crate::export::binary::index::{
    BinaryIndex, BloomFilterParams, CompactAllocationIndex, QuickFilterData, RecordMetadata,
    StringTableIndex,
};
use crate::export::binary::serializable::primitives;
use std::collections::hash_map::DefaultHasher;
use std::fs::File;
use std::hash::{Hash, Hasher};
use std::io::{BufReader, Read, Seek, SeekFrom};
use std::path::Path;

/// Builder for creating binary file indexes
pub struct BinaryIndexBuilder {
    /// Threshold for enabling quick filter data (number of records)
    quick_filter_threshold: u32,

    /// Batch size for quick filter data
    quick_filter_batch_size: usize,

    /// Bloom filter parameters
    bloom_filter_params: BloomFilterParams,
}

impl BinaryIndexBuilder {
    /// Create a new index builder with default settings
    pub fn new() -> Self {
        Self {
            quick_filter_threshold: 1000,
            quick_filter_batch_size: 1000,
            bloom_filter_params: BloomFilterParams::default(),
        }
    }

    /// Set the threshold for enabling quick filter data
    pub fn with_quick_filter_threshold(mut self, threshold: u32) -> Self {
        self.quick_filter_threshold = threshold;
        self
    }

    /// Set the batch size for quick filter data
    pub fn with_quick_filter_batch_size(mut self, batch_size: usize) -> Self {
        self.quick_filter_batch_size = batch_size;
        self
    }

    /// Set bloom filter parameters
    pub fn with_bloom_filter_params(mut self, params: BloomFilterParams) -> Self {
        self.bloom_filter_params = params;
        self
    }

    /// Build an index for the specified binary file
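    ///
    /// Reads and validates the file header, records where the string table
    /// lives without loading the strings themselves, then scans every
    /// allocation record to collect offsets, sizes, and (above the configured
    /// threshold) quick-filter data.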
    pub fn build_index<P: AsRef<Path>>(
        &self,
        binary_path: P,
    ) -> Result<BinaryIndex, BinaryExportError> {
        let path = binary_path.as_ref();
        let file = File::open(path)?;
        let file_size = file.metadata()?.len();
        let file_hash = self.compute_file_hash(path)?;

        let mut reader = BufReader::new(file);

        // Read and validate header
        let header = self.read_header(&mut reader)?;

        // Read string table information
        let string_table_info = self.read_string_table_info(&mut reader)?;

        // Scan allocation records and build index
        let allocation_index = self.scan_allocation_records(&mut reader, &header)?;

        // Create the index
        let mut index = BinaryIndex::new(path.to_path_buf(), file_hash, file_size, header);

        index.string_table = string_table_info;
        index.allocations = allocation_index;

        Ok(index)
    }

    /// Compute hash of the file content for cache validation
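    ///
    /// Note: the hash mixes file length, modification time, and the first and
    /// last 1 KiB of content, so it is a fast change detector rather than a
    /// full content digest.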
    fn compute_file_hash<P: AsRef<Path>>(&self, path: P) -> Result<u64, BinaryExportError> {
        let mut file = File::open(path)?;
        let mut hasher = DefaultHasher::new();

        // Hash file metadata
        let metadata = file.metadata()?;
        metadata.len().hash(&mut hasher);
        if let Ok(modified) = metadata.modified() {
            if let Ok(duration) = modified.duration_since(std::time::UNIX_EPOCH) {
                duration.as_secs().hash(&mut hasher);
            }
        }

        // Hash first 1KB and last 1KB of file for quick validation
        let mut buffer = [0u8; 1024];
        let bytes_read = file.read(&mut buffer)?;
        buffer[..bytes_read].hash(&mut hasher);

        // Hash last 1KB if file is large enough
        if metadata.len() > 2048 {
            file.seek(SeekFrom::End(-1024))?;
            let bytes_read = file.read(&mut buffer)?;
            buffer[..bytes_read].hash(&mut hasher);
        }

        Ok(hasher.finish())
    }

    /// Read and validate file header
    fn read_header(&self, reader: &mut BufReader<File>) -> Result<FileHeader, BinaryExportError> {
        let mut header_bytes = [0u8; HEADER_SIZE];
        reader.read_exact(&mut header_bytes)?;

        let header = FileHeader::from_bytes(&header_bytes);

        // Validate magic bytes
        if !header.is_valid_magic() {
            return Err(BinaryExportError::InvalidMagic {
                expected: "MEMSCOPE".to_string(),
                actual: String::from_utf8_lossy(&header.magic).to_string(),
            });
        }

        // Check version compatibility
        if !header.is_compatible_version() {
            return Err(BinaryExportError::UnsupportedVersion(header.version));
        }

        Ok(header)
    }

    /// Read string table information without loading the actual strings
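    ///
    /// Expected layout, inferred from the reads below (a sketch, not a format
    /// specification):
    ///
    /// ```text
    /// [marker: b"STBL" | b"NONE"][table_size: u32][compressed: u8][string_count: u32][string data...]
    /// ```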
    fn read_string_table_info(
        &self,
        reader: &mut BufReader<File>,
    ) -> Result<StringTableIndex, BinaryExportError> {
        let table_start_offset = reader.stream_position()?;

        // Read string table marker (4 bytes)
        let mut marker = [0u8; 4];
        reader.read_exact(&mut marker)?;

        // Read table size (4 bytes)
        let table_size = primitives::read_u32(reader)?;

        if &marker == b"STBL" && table_size > 0 {
            // Read compression flag
            let uses_compression = primitives::read_u8(reader)? != 0;

            // Read string count
            let string_count = primitives::read_u32(reader)?;

            // Skip the actual string data
            reader.seek(SeekFrom::Current(table_size as i64 - 9))?; // 9 = 4 + 1 + 4 (size + compression + count)

            Ok(StringTableIndex {
                offset: table_start_offset,
                size: table_size as u64 + 8, // Include marker and size fields
                string_count,
                uses_compression,
            })
        } else if &marker == b"NONE" {
            // table_size was already read above, no need to read it again
            Ok(StringTableIndex::default())
        } else {
            Err(BinaryExportError::CorruptedData(
                "Invalid string table marker".to_string(),
            ))
        }
    }

    /// Scan allocation records and build the allocation index
    fn scan_allocation_records(
        &self,
        reader: &mut BufReader<File>,
        header: &FileHeader,
    ) -> Result<CompactAllocationIndex, BinaryExportError> {
        let records_start_offset = reader.stream_position()?;
        let mut allocation_index = CompactAllocationIndex::new(records_start_offset);
        allocation_index.reserve(header.total_count as usize);

        let mut record_metadata = Vec::new();
        let mut quick_filter_builder = if header.total_count >= self.quick_filter_threshold {
            Some(QuickFilterBuilder::new(
                self.quick_filter_batch_size,
                self.bloom_filter_params.clone(),
            ))
        } else {
            None
        };

        // Get file size for bounds checking
        let file_size = reader
            .get_ref()
            .metadata()
            .map_err(|e| {
                BinaryExportError::CorruptedData(format!("Failed to get file metadata: {e:?}"))
            })?
            .len();

        // Scan each allocation record with error recovery
        for i in 0..header.total_count {
            let record_start_offset = reader.stream_position()?;

            // Check if we're near end of file
            if record_start_offset >= file_size {
                return Err(BinaryExportError::CorruptedData(format!(
                    "Reached end of file while reading record {} of {} at offset {}",
                    i + 1,
                    header.total_count,
                    record_start_offset
                )));
            }

            // Read record metadata with error recovery
            let metadata = match self.read_record_metadata(reader) {
                Ok(metadata) => metadata,
                Err(e) => {
                    tracing::warn!(
                        "Failed to read record {} at offset {}: {e}",
                        i + 1,
                        record_start_offset,
                    );

                    // Try to recover by skipping this record
                    // This is better than failing completely
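                    // Caveat: after a partial read the stream position may be
                    // left mid-record, so subsequent iterations can fail and
                    // be skipped as well.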
                    continue;
                }
            };

            let record_end_offset = reader.stream_position()?;

            // Validate record size is reasonable
            let record_size_u64 = record_end_offset - record_start_offset;
            if record_size_u64 > u16::MAX as u64 {
                tracing::warn!(
                    "Record {} has unusually large size: {} bytes, skipping",
                    i + 1,
                    record_size_u64
                );
                continue;
            }

            let record_size = record_size_u64 as u16;

            // Add to allocation index
            if let Err(e) = allocation_index.add_record(record_start_offset, record_size) {
                tracing::warn!("Failed to add record {} to index: {}", i + 1, e);
                continue;
            }

            // Add to quick filter builder if enabled
            if let Some(ref mut builder) = quick_filter_builder {
                builder.add_record(i as usize, &metadata);
            }

            record_metadata.push(metadata);
        }

        // Build quick filter data if we have enough records
        if let Some(builder) = quick_filter_builder {
            allocation_index.quick_filter_data = Some(builder.build());
        }

        tracing::info!(
            "Successfully indexed {} allocation records",
            record_metadata.len()
        );

        Ok(allocation_index)
    }

    /// Read record metadata without fully parsing the allocation record
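    ///
    /// Assumed on-disk layout of each record, reconstructed from the reads
    /// below (`length` counts the payload after the Type and Length fields):
    ///
    /// ```text
    /// [type: u8][length: u32 LE][ptr: u64][size: u64][timestamp: u64][length - 24 trailing bytes]
    /// ```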
    fn read_record_metadata(
        &self,
        reader: &mut BufReader<File>,
    ) -> Result<RecordMetadata, BinaryExportError> {
        let current_position = reader.stream_position().unwrap_or(0);

        // Read Type (1 byte)
        let mut type_byte = [0u8; 1];
        reader.read_exact(&mut type_byte).map_err(|e| {
            BinaryExportError::CorruptedData(format!(
                "Failed to read record type at position {current_position}: {e:?}",
            ))
        })?;

        if type_byte[0] != ALLOCATION_RECORD_TYPE {
            return Err(BinaryExportError::CorruptedData(format!(
                "Invalid record type: {} at position {current_position}",
                type_byte[0]
            )));
        }

        // Read Length (4 bytes)
        let mut length_bytes = [0u8; 4];
        reader.read_exact(&mut length_bytes).map_err(|e| {
            BinaryExportError::CorruptedData(format!(
                "Failed to read record length at position {}: {e:?}",
                current_position + 1,
            ))
        })?;
        let record_length = u32::from_le_bytes(length_bytes);

        // Validate record length is reasonable
        if record_length == 0 || record_length > 1024 * 1024 {
            return Err(BinaryExportError::CorruptedData(format!(
                "Invalid record length: {record_length} at position {current_position}",
            )));
        }

        // Read basic fields that we need for indexing
        let ptr = primitives::read_u64(reader).map_err(|e| {
            BinaryExportError::CorruptedData(format!(
                "Failed to read ptr at position {}: {e:?}",
                current_position + 5,
            ))
        })? as usize;

        let size = primitives::read_u64(reader).map_err(|e| {
            BinaryExportError::CorruptedData(format!(
                "Failed to read size at position {}: {e:?}",
                current_position + 13,
            ))
        })? as usize;

        let timestamp = primitives::read_u64(reader).map_err(|e| {
            BinaryExportError::CorruptedData(format!(
                "Failed to read timestamp at position {}: {e:?}",
                current_position + 21,
            ))
        })?;

        // Calculate remaining bytes correctly
        // record_length includes the data after Type+Length fields
        // We've read: ptr(8) + size(8) + timestamp(8) = 24 bytes of data
        let bytes_read_from_data = 24u32;

        if record_length < bytes_read_from_data {
            return Err(BinaryExportError::CorruptedData(format!(
                "Record length {record_length} is smaller than minimum required {bytes_read_from_data} at position {current_position}",
            )));
        }

        let remaining_bytes = record_length - bytes_read_from_data;

        // Skip the rest of the record safely
        if remaining_bytes > 0 {
            reader
                .seek(SeekFrom::Current(remaining_bytes as i64))
                .map_err(|e| {
                    BinaryExportError::CorruptedData(format!(
                        "Failed to skip {remaining_bytes} remaining bytes at position {}: {e:?}",
                        current_position + 29,
                    ))
                })?;
        }

        Ok(RecordMetadata::new(ptr, size, timestamp))
    }
}

impl Default for BinaryIndexBuilder {
    fn default() -> Self {
        Self::new()
    }
}

/// Builder for quick filter data
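///
/// Accumulates per-batch min/max ranges for pointer, size, and timestamp
/// values, plus the thread IDs and type names that feed the bloom filters.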
struct QuickFilterBuilder {
    batch_size: usize,
    bloom_filter_params: BloomFilterParams,
    current_batch: Vec<RecordMetadata>,
    ptr_ranges: Vec<(usize, usize)>,
    size_ranges: Vec<(usize, usize)>,
    timestamp_ranges: Vec<(u64, u64)>,
    thread_ids: Vec<String>,
    type_names: Vec<String>,
}

impl QuickFilterBuilder {
    fn new(batch_size: usize, bloom_filter_params: BloomFilterParams) -> Self {
        Self {
            batch_size,
            bloom_filter_params,
            current_batch: Vec::with_capacity(batch_size),
            ptr_ranges: Vec::new(),
            size_ranges: Vec::new(),
            timestamp_ranges: Vec::new(),
            thread_ids: Vec::new(),
            type_names: Vec::new(),
        }
    }

    fn add_record(&mut self, _record_index: usize, metadata: &RecordMetadata) {
        self.current_batch.push(metadata.clone());

        // Collect thread IDs and type names for bloom filters
        if let Some(ref thread_id) = metadata.thread_id {
            self.thread_ids.push(thread_id.clone());
        }
        if let Some(ref type_name) = metadata.type_name {
            self.type_names.push(type_name.clone());
        }

        // Process batch when full
        if self.current_batch.len() >= self.batch_size {
            self.process_batch();
        }
    }

    fn process_batch(&mut self) {
        if self.current_batch.is_empty() {
            return;
        }

        // Calculate ranges for this batch
        let ptr_min = self.current_batch.iter().map(|r| r.ptr).min().unwrap_or(0);
        let ptr_max = self.current_batch.iter().map(|r| r.ptr).max().unwrap_or(0);
        self.ptr_ranges.push((ptr_min, ptr_max));

        let size_min = self.current_batch.iter().map(|r| r.size).min().unwrap_or(0);
        let size_max = self.current_batch.iter().map(|r| r.size).max().unwrap_or(0);
        self.size_ranges.push((size_min, size_max));

        let timestamp_min = self
            .current_batch
            .iter()
            .map(|r| r.timestamp)
            .min()
            .unwrap_or(0);
        let timestamp_max = self
            .current_batch
            .iter()
            .map(|r| r.timestamp)
            .max()
            .unwrap_or(0);
        self.timestamp_ranges.push((timestamp_min, timestamp_max));

        self.current_batch.clear();
    }

    fn build(mut self) -> QuickFilterData {
        // Process any remaining records
        self.process_batch();

        // Build bloom filters (simplified implementation)
        let thread_bloom_filter = self.build_simple_bloom_filter(&self.thread_ids);
        let type_bloom_filter = self.build_simple_bloom_filter(&self.type_names);

        QuickFilterData {
            batch_size: self.batch_size,
            ptr_ranges: self.ptr_ranges,
            size_ranges: self.size_ranges,
            timestamp_ranges: self.timestamp_ranges,
            thread_bloom_filter,
            type_bloom_filter,
            bloom_filter_params: self.bloom_filter_params,
        }
    }

    /// Build a simple bloom filter from strings (simplified implementation)
    fn build_simple_bloom_filter(&self, strings: &[String]) -> Vec<u8> {
        // This is a simplified bloom filter implementation
        // In a real implementation, you would use a proper bloom filter library
        let filter_size = (self.bloom_filter_params.filter_size_bits / 8) as usize;
        let mut filter = vec![0u8; filter_size];

        for string in strings {
            let mut hasher = DefaultHasher::new();
            string.hash(&mut hasher);
            let hash = hasher.finish();

            // Set multiple bits based on hash functions
            for i in 0..self.bloom_filter_params.hash_functions {
                let bit_index = ((hash.wrapping_add(i as u64)) % (filter_size * 8) as u64) as usize;
                let byte_index = bit_index / 8;
                let bit_offset = bit_index % 8;

                if byte_index < filter.len() {
                    filter[byte_index] |= 1 << bit_offset;
                }
            }
        }

        filter
    }
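
    /// Sketch of the matching membership test a reader of the index could
    /// apply to the filter bytes produced above. Illustrative only (not called
    /// in this module); it assumes the same `DefaultHasher` bit derivation as
    /// `build_simple_bloom_filter`. All probed bits set means "possibly
    /// present"; any clear bit means "definitely absent".
    #[allow(dead_code)]
    fn bloom_filter_may_contain(&self, filter: &[u8], candidate: &str) -> bool {
        if filter.is_empty() {
            // An empty filter carries no information; treat as "possibly present".
            return true;
        }

        let mut hasher = DefaultHasher::new();
        candidate.hash(&mut hasher);
        let hash = hasher.finish();

        (0..self.bloom_filter_params.hash_functions).all(|i| {
            let bit_index = ((hash.wrapping_add(i as u64)) % (filter.len() * 8) as u64) as usize;
            (filter[bit_index / 8] & (1 << (bit_index % 8))) != 0
        })
    }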
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::types::AllocationInfo;
    use crate::export::binary::writer::BinaryWriter;
    use tempfile::NamedTempFile;

    fn create_test_allocation() -> AllocationInfo {
        AllocationInfo {
            ptr: 0x1000,
            size: 1024,
            var_name: Some("test_var".to_string()),
            type_name: Some("i32".to_string()),
            scope_name: None,
            timestamp_alloc: 1234567890,
            timestamp_dealloc: None,
            thread_id: "main".to_string(),
            borrow_count: 0,
            stack_trace: None,
            is_leaked: false,
            lifetime_ms: None,
            borrow_info: None,
            clone_info: None,
            ownership_history_available: false,
            smart_pointer_info: None,
            memory_layout: None,
            generic_info: None,
            dynamic_type_info: None,
            runtime_state: None,
            stack_allocation: None,
            temporary_object: None,
            fragmentation_analysis: None,
            generic_instantiation: None,
            type_relationships: None,
            type_usage: None,
            function_call_tracking: None,
            lifecycle_tracking: None,
            access_tracking: None,
            drop_chain_analysis: None,
        }
    }

    #[test]
    fn test_index_builder_creation() {
        let builder = BinaryIndexBuilder::new();
        assert_eq!(builder.quick_filter_threshold, 1000);
        assert_eq!(builder.quick_filter_batch_size, 1000);
    }

    #[test]
    fn test_index_builder_configuration() {
        let builder = BinaryIndexBuilder::new()
            .with_quick_filter_threshold(500)
            .with_quick_filter_batch_size(200);

        assert_eq!(builder.quick_filter_threshold, 500);
        assert_eq!(builder.quick_filter_batch_size, 200);
    }

    #[test]
    fn test_build_index_from_binary_file() {
        let temp_file = NamedTempFile::new().expect("Failed to create temp file");
        let test_allocations = vec![create_test_allocation(), {
            let mut alloc = create_test_allocation();
            alloc.ptr = 0x2000;
            alloc.size = 2048;
            alloc.timestamp_alloc = 1234567891;
            alloc
        }];

        // Write test data to binary file
        {
            let mut writer =
                BinaryWriter::new(temp_file.path()).expect("Failed to create writer");
            writer
                .write_header(test_allocations.len() as u32)
                .expect("Failed to write header");
            for alloc in &test_allocations {
                writer
                    .write_allocation(alloc)
                    .expect("Failed to write allocation");
            }
            writer.finish().expect("Failed to finish writing");
        }

        // Build index from the binary file
        let builder = BinaryIndexBuilder::new();
        let index = builder
            .build_index(temp_file.path())
            .expect("Failed to build index");

        // Verify index properties
        assert_eq!(index.record_count(), 2);
        assert_eq!(index.header.total_count, 2);
        assert!(index.file_size > 0);
        assert!(index.file_hash != 0);

        // Verify record offsets
        assert!(index.get_record_offset(0).is_some());
        assert!(index.get_record_offset(1).is_some());
        assert!(index.get_record_offset(2).is_none());

        // Verify record sizes
        assert!(index.get_record_size(0).is_some());
        assert!(index.get_record_size(1).is_some());
        assert!(index.get_record_size(2).is_none());
    }

    #[test]
    fn test_file_hash_computation() {
        let temp_file = NamedTempFile::new().expect("Failed to create temp file");

        // Write some test data
        std::fs::write(temp_file.path(), b"test data for hashing")
            .expect("Failed to write test data");

        let builder = BinaryIndexBuilder::new();
        let hash1 = builder
            .compute_file_hash(temp_file.path())
            .expect("Failed to compute hash");
        let hash2 = builder
            .compute_file_hash(temp_file.path())
            .expect("Failed to compute hash");

        // Same file should produce same hash
        assert_eq!(hash1, hash2);

        // Different content should produce different hash
        std::fs::write(temp_file.path(), b"different test data")
            .expect("Failed to write test data");
        let hash3 = builder
            .compute_file_hash(temp_file.path())
            .expect("Failed to compute hash");
        assert_ne!(hash1, hash3);
    }

    #[test]
    fn test_quick_filter_builder() {
        let mut builder = QuickFilterBuilder::new(2, BloomFilterParams::default());

        let metadata1 = RecordMetadata::new(0x1000, 1024, 1000)
            .with_thread_id("main".to_string())
            .with_type_name("Vec<u8>".to_string());

        let metadata2 = RecordMetadata::new(0x2000, 2048, 2000)
            .with_thread_id("worker".to_string())
            .with_type_name("String".to_string());

        builder.add_record(0, &metadata1);
        builder.add_record(1, &metadata2);

        let quick_filter = builder.build();

        assert_eq!(quick_filter.batch_size, 2);
        assert_eq!(quick_filter.ptr_ranges.len(), 1);
        assert_eq!(quick_filter.size_ranges.len(), 1);
        assert_eq!(quick_filter.timestamp_ranges.len(), 1);

        // Check ranges
        assert_eq!(quick_filter.ptr_ranges[0], (0x1000, 0x2000));
        assert_eq!(quick_filter.size_ranges[0], (1024, 2048));
        assert_eq!(quick_filter.timestamp_ranges[0], (1000, 2000));

        // Check bloom filters are created
        assert!(!quick_filter.thread_bloom_filter.is_empty());
        assert!(!quick_filter.type_bloom_filter.is_empty());
    }
}