memscope_rs/analysis/heap_scanner/reader.rs
1//! Safe heap memory reader with page-wise validation.
2//!
3//! Provides the `HeapScanner` which reads allocation memory content
4//! while preventing segfaults through ValidRegions checks and
5//! atomic system calls for fault tolerance.
6
7use crate::analysis::is_virtual_pointer;
8use crate::analysis::unsafe_inference::is_valid_ptr;
9use crate::snapshot::types::ActiveAllocation;
10
/// Upper bound on bytes read from a single allocation. Type metadata
/// headers sit in the first few dozen bytes, so one page (4 KB) is ample.
const MAX_READ_BYTES: usize = 4096;

/// Page granularity used when validating address ranges before a read.
const PAGE_SIZE: usize = 4096;
17
18/// Result of scanning a single allocation.
/// Outcome of scanning one allocation.
#[derive(Debug)]
pub struct ScanResult {
    /// Address of the allocation that was scanned.
    pub ptr: usize,
    /// Size of the allocation as originally recorded (may exceed the
    /// number of bytes actually read).
    pub size: usize,
    /// Bytes successfully read from the region, capped at `MAX_READ_BYTES`;
    /// `None` when the region could not be read safely.
    pub memory: Option<Vec<u8>>,
}
28
/// Reads heap memory for active allocations during snapshot analysis.
///
/// Every read is gated by `ValidRegions` checks so that scanning can never
/// trigger a segfault. This runs only during offline analysis and adds no
/// overhead to the instrumented program at runtime.
pub struct HeapScanner;
34
35impl HeapScanner {
36    /// Scan a list of active allocations, reading their memory content.
37    ///
38    /// Only scans HeapOwner allocations and performs deduplication
39    /// to avoid redundant scanning of duplicate heap regions.
40    ///
41    /// Returns a `ScanResult` for each unique heap region. Allocations whose
42    /// pointers fall outside valid regions will have `memory: None`.
43    ///
44    /// # Arguments
45    ///
46    /// * `allocations` - List of active allocations to scan.
47    ///
48    /// # Example
49    ///
50    /// ```ignore
51    /// let results = HeapScanner::scan(&allocations);
52    /// for result in results {
53    ///     if let Some(ref memory) = result.memory {
54    ///         let view = MemoryView::new(memory);
55    ///         // ... pass to UTI Engine
56    ///     }
57    /// }
58    /// ```
59    pub fn scan(allocations: &[ActiveAllocation]) -> Vec<ScanResult> {
60        // Step 1: Filter HeapOwner + deduplicate regions
61        let regions = Self::dedup_heap_regions(allocations);
62
63        // Step 2: Scan deduplicated regions
64        regions
65            .iter()
66            .map(|&(ptr, size)| {
67                let memory = safe_read_memory(ptr, size);
68                ScanResult { ptr, size, memory }
69            })
70            .collect()
71    }
72
73    /// Deduplicate heap regions to avoid redundant scanning.
74    ///
75    /// Filters for HeapOwner allocations and removes duplicates
76    /// based on (ptr, size) pairs.
77    ///
78    /// Also skips virtual pointers (>= 0x10000000000) used for Container types.
79    /// This threshold is set high enough to avoid conflicts with real heap addresses
80    /// on all platforms (including macOS which can have addresses > 0x100000000).
81    fn dedup_heap_regions(allocs: &[ActiveAllocation]) -> Vec<(usize, usize)> {
82        use std::collections::HashSet;
83
84        let mut seen = HashSet::new();
85        let mut regions = Vec::new();
86
87        for alloc in allocs {
88            if let crate::core::types::TrackKind::HeapOwner { ptr, size } = alloc.kind {
89                if is_virtual_pointer(ptr) {
90                    continue;
91                }
92
93                let key = (ptr, size);
94
95                if seen.insert(key) {
96                    regions.push(key);
97                }
98            }
99        }
100
101        regions
102    }
103}
104
105/// Safely read memory at `ptr` for up to `size` bytes.
106///
107/// Returns `None` if the address is not in valid regions, or if any
108/// page within the read range is unmapped.
109///
110/// On Linux, uses `process_vm_readv` which is an atomic syscall that cannot
111/// be interrupted by signals mid-read, eliminating TOCTOU issues.
112///
113/// On other platforms, falls back to volatile byte-by-byte reads with
114/// pre-validation of all pages before reading begins.
115fn safe_read_memory(ptr: usize, size: usize) -> Option<Vec<u8>> {
116    if size == 0 || ptr == 0 {
117        return None;
118    }
119
120    if !is_valid_ptr(ptr) {
121        return None;
122    }
123
124    let read_size = size.min(MAX_READ_BYTES);
125    if !are_pages_valid(ptr, read_size) {
126        return None;
127    }
128
129    let mut buf = vec![0u8; read_size];
130
131    #[cfg(target_os = "linux")]
132    {
133        if safe_read_linux(ptr, &mut buf) {
134            Some(buf)
135        } else {
136            None
137        }
138    }
139
140    #[cfg(not(target_os = "linux"))]
141    {
142        if read_bytes_volatile(ptr, &mut buf) {
143            Some(buf)
144        } else {
145            None
146        }
147    }
148}
149
150#[cfg(target_os = "linux")]
151mod linux_read {
152    use libc::{iovec, process_vm_readv};
153
154    /// Read memory from the current process using process_vm_readv.
155    ///
156    /// This uses pid=0 which refers to the calling process itself.
157    /// According to Linux man page: "The caller must have the CAP_SYS_PTRACE
158    /// capability, OR the real, effective, and saved-set user ID of the caller
159    /// must match the real user ID of the target process."
160    ///
161    /// For pid=0 (current process), the user IDs always match, so no special
162    /// privileges are required. This works in most environments including
163    /// containers (unless seccomp filters explicitly block process_vm_readv).
164    ///
165    /// The function is named `_local` to clarify it's for self-reading,
166    /// not for reading remote processes.
167    pub fn safe_read_linux_local(
168        remote_ptr: *const libc::c_void,
169        local_ptr: *mut libc::c_void,
170        len: usize,
171    ) -> isize {
172        let local_iov = iovec {
173            iov_base: local_ptr,
174            iov_len: len,
175        };
176        let remote_iov = iovec {
177            iov_base: remote_ptr as *mut libc::c_void,
178            iov_len: len,
179        };
180
181        // pid=0 means the calling process reads its own memory
182        // No CAP_SYS_PTRACE required for self-reading
183        unsafe { process_vm_readv(0, &local_iov, 1, &remote_iov, 1, 0) }
184    }
185}
186
187#[cfg(target_os = "linux")]
188fn safe_read_linux(ptr: usize, buf: &mut [u8]) -> bool {
189    use linux_read::safe_read_linux_local;
190
191    let len = buf.len();
192
193    let result = safe_read_linux_local(
194        ptr as *const libc::c_void,
195        buf.as_mut_ptr() as *mut libc::c_void,
196        len,
197    );
198
199    result == len as isize
200}
201
/// Non-Linux stub: `process_vm_readv` does not exist off Linux, so this
/// always reports failure; callers use the volatile-read fallback instead.
#[cfg(not(target_os = "linux"))]
#[allow(dead_code)] // Kept so both platform variants share one symbol name
fn safe_read_linux(_ptr: usize, _buf: &mut [u8]) -> bool {
    false
}
207
#[cfg(not(target_os = "linux"))]
fn read_bytes_volatile(ptr: usize, buf: &mut [u8]) -> bool {
    // Defence in depth: re-validate the whole range even though the caller
    // already checked it, since this path has no atomic syscall backstop.
    if !are_pages_valid(ptr, buf.len()) {
        return false;
    }

    // Rust has no signal-based try/catch, so correctness rests entirely on
    // the page validation above. Volatile reads keep the compiler from
    // eliding or reordering the accesses; on macOS a direct read is fine
    // once the pages are known to be mapped.
    let src = ptr as *const u8;
    for (offset, slot) in buf.iter_mut().enumerate() {
        // SAFETY: every page covering [ptr, ptr + buf.len()) was just
        // confirmed valid, so each byte address is mapped and readable.
        *slot = unsafe { std::ptr::read_volatile(src.add(offset)) };
    }
    true
}
226
227/// Check that every page in [ptr, ptr + size) is in a valid region.
228fn are_pages_valid(ptr: usize, size: usize) -> bool {
229    let page_start = ptr & !(PAGE_SIZE - 1);
230    let page_end = (ptr + size + PAGE_SIZE - 1) & !(PAGE_SIZE - 1);
231
232    let mut p = page_start;
233    while p < page_end {
234        if !is_valid_ptr(p) {
235            return false;
236        }
237        p += PAGE_SIZE;
238    }
239    true
240}
241
#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::types::TrackKind;

    #[test]
    fn test_safe_read_memory_zero_size() {
        // A zero-length read has nothing to return.
        assert!(safe_read_memory(0x10000, 0).is_none());
    }

    #[test]
    fn test_safe_read_memory_null_ptr() {
        // Null pointers are rejected before any page validation.
        assert!(safe_read_memory(0, 100).is_none());
    }

    #[test]
    #[cfg(target_os = "macos")]
    fn test_are_pages_valid_single_page() {
        assert!(are_pages_valid(0x10000, 100));
    }

    #[test]
    #[cfg(target_os = "macos")]
    fn test_are_pages_valid_cross_page() {
        // A heap-like address that should be valid on all platforms.
        // NOTE(review): with a page-aligned base and a span < PAGE_SIZE this
        // actually stays within one page — consider an offset base (e.g.
        // 0x10F80) if a genuine page-boundary crossing is intended.
        let base = 0x10000;
        let span = 200;
        assert!(are_pages_valid(base, span));
    }

    #[test]
    fn test_scan_result_creation() {
        let scan = ScanResult {
            memory: None,
            size: 64,
            ptr: 0x1000,
        };
        assert_eq!(scan.ptr, 0x1000);
        assert_eq!(scan.size, 64);
        assert!(scan.memory.is_none());
    }

    #[test]
    #[cfg(not(target_os = "linux"))]
    fn test_heap_scanner_scan_real_allocations() {
        let first = vec![42u8; 64];
        let second = vec![99u8; 128];
        let first_addr = first.as_ptr() as usize;
        let second_addr = second.as_ptr() as usize;

        let allocations = vec![
            ActiveAllocation {
                kind: TrackKind::HeapOwner {
                    ptr: first_addr,
                    size: 64,
                },
                ptr: Some(first_addr),
                size: 64,
                allocated_at: 1000,
                var_name: None,
                type_name: None,
                thread_id: 0,
                call_stack_hash: None,
                module_path: None,
                stack_ptr: None,
            },
            ActiveAllocation {
                kind: TrackKind::HeapOwner {
                    ptr: second_addr,
                    size: 128,
                },
                ptr: Some(second_addr),
                size: 128,
                allocated_at: 2000,
                var_name: None,
                type_name: None,
                thread_id: 0,
                call_stack_hash: None,
                module_path: None,
                stack_ptr: None,
            },
        ];

        let results = HeapScanner::scan(&allocations);
        assert_eq!(results.len(), 2);

        assert!(results[0].memory.is_some(), "Should read memory at ptr1");
        assert!(results[1].memory.is_some(), "Should read memory at ptr2");

        // Keep the source buffers alive until after the scan completes.
        drop(first);
        drop(second);
    }

    #[test]
    fn test_heap_scanner_scan_empty_allocations() {
        let none: Vec<ActiveAllocation> = Vec::new();
        assert!(HeapScanner::scan(&none).is_empty());
    }

    #[test]
    fn test_heap_scanner_scan_zero_size_allocation() {
        let allocations = vec![ActiveAllocation {
            kind: TrackKind::HeapOwner {
                ptr: 0x10000,
                size: 0,
            },
            ptr: Some(0x10000),
            size: 0,
            allocated_at: 1000,
            var_name: None,
            type_name: None,
            thread_id: 0,
            call_stack_hash: None,
            module_path: None,
            stack_ptr: None,
        }];

        let results = HeapScanner::scan(&allocations);
        assert_eq!(results.len(), 1);
        // Nothing to read for a zero-size region.
        assert!(results[0].memory.is_none());
    }

    #[test]
    #[cfg(not(target_os = "linux"))]
    #[ignore = "Heap pointer addresses may exceed VIRTUAL_PTR_BASE in some CI environments"]
    fn test_heap_scanner_content_preserved_after_scan() {
        let payload = vec![0xDE, 0xAD, 0xBE, 0xEF, 0xCA, 0xFE, 0xBA, 0xBE];
        let addr = payload.as_ptr() as usize;
        let len = payload.len();

        let alloc = ActiveAllocation {
            kind: TrackKind::HeapOwner {
                ptr: addr,
                size: len,
            },
            ptr: Some(addr),
            size: len,
            allocated_at: 1000,
            var_name: None,
            type_name: None,
            thread_id: 0,
            call_stack_hash: None,
            module_path: None,
            stack_ptr: None,
        };

        let results = HeapScanner::scan(&[alloc]);
        assert_eq!(results.len(), 1);

        let mem = results[0]
            .memory
            .as_ref()
            .expect("Should read memory at allocated address");
        assert_eq!(mem.len(), len, "Should read expected number of bytes");

        // Keep the source buffer alive until after the scan completes.
        drop(payload);
    }
}