// File: memscope_rs/analysis/heap_scanner/reader.rs
1//! Safe heap memory reader with page-wise validation.
2//!
3//! Provides the `HeapScanner` which reads allocation memory content
4//! while preventing segfaults through ValidRegions checks and
5//! atomic system calls for fault tolerance.
6
7use crate::analysis::is_virtual_pointer;
8use crate::analysis::unsafe_inference::is_valid_ptr;
9use crate::snapshot::types::ActiveAllocation;
10
11/// Maximum bytes to read per allocation. Metadata headers are always
12/// within the first few dozen bytes; 4KB is more than sufficient.
13const MAX_READ_BYTES: usize = 4096;
14
15/// Page size for memory validation checks.
16const PAGE_SIZE: usize = 4096;
17
/// Result of scanning a single allocation.
#[derive(Debug)]
pub struct ScanResult {
    /// Pointer address of the allocation.
    pub ptr: usize,
    /// Original allocation size.
    pub size: usize,
    /// Memory content that was successfully read (capped at `MAX_READ_BYTES`),
    /// or `None` when the region could not be read safely.
    pub memory: Option<Vec<u8>>,
}

/// HeapScanner reads heap memory for active allocations during snapshot analysis.
///
/// All memory reads are validated through `ValidRegions` to prevent segfaults.
/// This module only operates during offline analysis and has zero runtime overhead.
pub struct HeapScanner;

35impl HeapScanner {
36    /// Scan a list of active allocations, reading their memory content.
37    ///
38    /// Only scans HeapOwner allocations and performs deduplication
39    /// to avoid redundant scanning of duplicate heap regions.
40    ///
41    /// Returns a `ScanResult` for each unique heap region. Allocations whose
42    /// pointers fall outside valid regions will have `memory: None`.
43    ///
44    /// # Arguments
45    ///
46    /// * `allocations` - List of active allocations to scan.
47    ///
48    /// # Example
49    ///
50    /// ```ignore
51    /// let results = HeapScanner::scan(&allocations);
52    /// for result in results {
53    ///     if let Some(ref memory) = result.memory {
54    ///         let view = MemoryView::new(memory);
55    ///         // ... pass to UTI Engine
56    ///     }
57    /// }
58    /// ```
59    pub fn scan(allocations: &[ActiveAllocation]) -> Vec<ScanResult> {
60        // Step 1: Filter HeapOwner + deduplicate regions
61        let regions = Self::dedup_heap_regions(allocations);
62
63        // Step 2: Scan deduplicated regions
64        regions
65            .iter()
66            .map(|&(ptr, size)| {
67                let memory = safe_read_memory(ptr, size);
68                ScanResult { ptr, size, memory }
69            })
70            .collect()
71    }
72
73    /// Deduplicate heap regions to avoid redundant scanning.
74    ///
75    /// Filters for HeapOwner allocations and removes duplicates
76    /// based on (ptr, size) pairs.
77    ///
78    /// Also skips virtual pointers (>= 0x10000000000) used for Container types.
79    /// This threshold is set high enough to avoid conflicts with real heap addresses
80    /// on all platforms (including macOS which can have addresses > 0x100000000).
81    fn dedup_heap_regions(allocs: &[ActiveAllocation]) -> Vec<(usize, usize)> {
82        use std::collections::HashSet;
83
84        let mut seen = HashSet::new();
85        let mut regions = Vec::new();
86
87        for alloc in allocs {
88            if let crate::core::types::TrackKind::HeapOwner { ptr, size } = alloc.kind {
89                if is_virtual_pointer(ptr) {
90                    continue;
91                }
92
93                let key = (ptr, size);
94
95                if seen.insert(key) {
96                    regions.push(key);
97                }
98            }
99        }
100
101        regions
102    }
103}
104
105/// Safely read memory at `ptr` for up to `size` bytes.
106///
107/// Returns `None` if the address is not in valid regions, or if any
108/// page within the read range is unmapped.
109///
110/// On Linux, uses `process_vm_readv` which is an atomic syscall that cannot
111/// be interrupted by signals mid-read, eliminating TOCTOU issues.
112///
113/// On other platforms, falls back to volatile byte-by-byte reads with
114/// pre-validation of all pages before reading begins.
115fn safe_read_memory(ptr: usize, size: usize) -> Option<Vec<u8>> {
116    if size == 0 || ptr == 0 {
117        return None;
118    }
119
120    if !is_valid_ptr(ptr) {
121        return None;
122    }
123
124    let read_size = size.min(MAX_READ_BYTES);
125    if !are_pages_valid(ptr, read_size) {
126        return None;
127    }
128
129    let mut buf = vec![0u8; read_size];
130
131    #[cfg(target_os = "linux")]
132    {
133        if safe_read_linux(ptr, &mut buf) {
134            Some(buf)
135        } else {
136            None
137        }
138    }
139
140    #[cfg(not(target_os = "linux"))]
141    {
142        if read_bytes_volatile(ptr, &mut buf) {
143            Some(buf)
144        } else {
145            None
146        }
147    }
148}
149
150#[cfg(target_os = "linux")]
151mod linux_read {
152    use libc::{iovec, process_vm_readv};
153
154    /// Read memory from the current process using process_vm_readv.
155    ///
156    /// This uses pid=0 which refers to the calling process itself.
157    /// According to Linux man page: "The caller must have the CAP_SYS_PTRACE
158    /// capability, OR the real, effective, and saved-set user ID of the caller
159    /// must match the real user ID of the target process."
160    ///
161    /// For pid=0 (current process), the user IDs always match, so no special
162    /// privileges are required. This works in most environments including
163    /// containers (unless seccomp filters explicitly block process_vm_readv).
164    ///
165    /// The function is named `_local` to clarify it's for self-reading,
166    /// not for reading remote processes.
167    pub fn safe_read_linux_local(
168        remote_ptr: *const libc::c_void,
169        local_ptr: *mut libc::c_void,
170        len: usize,
171    ) -> isize {
172        let local_iov = iovec {
173            iov_base: local_ptr,
174            iov_len: len,
175        };
176        let remote_iov = iovec {
177            iov_base: remote_ptr as *mut libc::c_void,
178            iov_len: len,
179        };
180
181        // pid=0 means the calling process reads its own memory
182        // No CAP_SYS_PTRACE required for self-reading
183        unsafe { process_vm_readv(0, &local_iov, 1, &remote_iov, 1, 0) }
184    }
185}
186
187#[cfg(target_os = "linux")]
188fn safe_read_linux(ptr: usize, buf: &mut [u8]) -> bool {
189    use linux_read::safe_read_linux_local;
190
191    let len = buf.len();
192
193    let result = safe_read_linux_local(
194        ptr as *const libc::c_void,
195        buf.as_mut_ptr() as *mut libc::c_void,
196        len,
197    );
198
199    result == len as isize
200}
201
/// Non-Linux stub: process_vm_readv is unavailable, so always report failure.
/// Callers fall back to `read_bytes_volatile` on these platforms.
#[cfg(not(target_os = "linux"))]
#[allow(dead_code)] // Stub for non-Linux platforms; used when building on macOS/Windows
fn safe_read_linux(_ptr: usize, _buf: &mut [u8]) -> bool {
    false
}

/// Fallback reader for non-Linux platforms: volatile byte-by-byte copy.
///
/// Relies entirely on `are_pages_valid` pre-validation; there is no atomic
/// read syscall here, so an unmap between validation and read is a residual
/// (accepted) TOCTOU risk on these platforms.
#[cfg(not(target_os = "linux"))]
fn read_bytes_volatile(ptr: usize, buf: &mut [u8]) -> bool {
    // Pre-check: verify the entire range is valid before reading.
    // (Defensive double-check even though safe_read_memory already validates.)
    if !are_pages_valid(ptr, buf.len()) {
        return false;
    }

    // Use a safer approach: try-catch with signal handling would be ideal,
    // but Rust doesn't have that. Instead, we rely on pre-validation.
    // On macOS, direct volatile reads should work if pages are valid.
    // SAFETY: every page in [ptr, ptr + buf.len()) was validated above;
    // read_volatile prevents the compiler from eliding or reordering reads.
    unsafe {
        let src = ptr as *const u8;
        for (i, byte) in buf.iter_mut().enumerate() {
            *byte = std::ptr::read_volatile(src.add(i));
        }
    }
    true
}

227/// Check that every page in [ptr, ptr + size) is in a valid region.
228fn are_pages_valid(ptr: usize, size: usize) -> bool {
229    let page_start = ptr & !(PAGE_SIZE - 1);
230    let page_end = (ptr + size + PAGE_SIZE - 1) & !(PAGE_SIZE - 1);
231
232    let mut p = page_start;
233    while p < page_end {
234        if !is_valid_ptr(p) {
235            return false;
236        }
237        p += PAGE_SIZE;
238    }
239    true
240}
241
#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::types::TrackKind;

    #[test]
    fn test_safe_read_memory_zero_size() {
        assert!(safe_read_memory(0x10000, 0).is_none());
    }

    #[test]
    fn test_safe_read_memory_null_ptr() {
        assert!(safe_read_memory(0, 100).is_none());
    }

    #[test]
    #[cfg(target_os = "macos")]
    fn test_are_pages_valid_single_page() {
        assert!(are_pages_valid(0x10000, 100));
    }

    #[test]
    #[cfg(target_os = "macos")]
    fn test_are_pages_valid_cross_page() {
        // Use a heap-like address that should be valid on all platforms
        let ptr = 0x10000;
        let size = 200;
        assert!(are_pages_valid(ptr, size));
    }

    #[test]
    fn test_scan_result_creation() {
        let result = ScanResult {
            ptr: 0x1000,
            size: 64,
            memory: None,
        };
        assert_eq!(result.ptr, 0x1000);
        assert_eq!(result.size, 64);
        assert!(result.memory.is_none());
    }

    #[test]
    #[cfg(not(target_os = "linux"))]
    fn test_heap_scanner_scan_real_allocations() {
        let data1 = vec![42u8; 64];
        let data2 = vec![99u8; 128];
        let ptr1 = data1.as_ptr() as usize;
        let ptr2 = data2.as_ptr() as usize;

        let allocations = vec![
            ActiveAllocation {
                ptr: Some(ptr1),
                size: 64,
                kind: TrackKind::HeapOwner {
                    ptr: ptr1,
                    size: 64,
                },
                allocated_at: 1000,
                var_name: None,
                type_name: None,
                thread_id: 0,
                call_stack_hash: None,
            },
            ActiveAllocation {
                ptr: Some(ptr2),
                size: 128,
                kind: TrackKind::HeapOwner {
                    ptr: ptr2,
                    size: 128,
                },
                allocated_at: 2000,
                var_name: None,
                type_name: None,
                thread_id: 0,
                call_stack_hash: None,
            },
        ];

        let results = HeapScanner::scan(&allocations);
        assert_eq!(results.len(), 2);

        assert!(results[0].memory.is_some(), "Should read memory at ptr1");
        assert!(results[1].memory.is_some(), "Should read memory at ptr2");

        drop(data1);
        drop(data2);
    }

    #[test]
    fn test_heap_scanner_scan_empty_allocations() {
        let allocations: Vec<ActiveAllocation> = vec![];
        let results = HeapScanner::scan(&allocations);
        assert!(results.is_empty());
    }

    #[test]
    fn test_heap_scanner_scan_zero_size_allocation() {
        let allocations = vec![ActiveAllocation {
            ptr: Some(0x10000),
            size: 0,
            kind: TrackKind::HeapOwner {
                ptr: 0x10000,
                size: 0,
            },
            allocated_at: 1000,
            var_name: None,
            type_name: None,
            thread_id: 0,
            call_stack_hash: None,
        }];

        let results = HeapScanner::scan(&allocations);
        assert_eq!(results.len(), 1);
        // Zero-size allocation should return None for memory (nothing to read).
        assert!(results[0].memory.is_none());
    }

    #[test]
    #[cfg(not(target_os = "linux"))]
    fn test_heap_scanner_content_preserved_after_scan() {
        let data = vec![0xDE, 0xAD, 0xBE, 0xEF, 0xCA, 0xFE, 0xBA, 0xBE];
        let ptr = data.as_ptr() as usize;
        let size = data.len();

        let alloc = ActiveAllocation {
            ptr: Some(ptr),
            size,
            kind: TrackKind::HeapOwner { ptr, size },
            allocated_at: 1000,
            var_name: None,
            type_name: None,
            thread_id: 0,
            call_stack_hash: None,
        };

        let results = HeapScanner::scan(&[alloc]);
        assert_eq!(results.len(), 1);

        let mem = results[0]
            .memory
            .as_ref()
            .expect("Should read memory at allocated address");
        assert_eq!(mem.len(), size, "Should read expected number of bytes");

        drop(data);
    }
}
389}