use core::sync::atomic::{AtomicBool, AtomicU32, AtomicU64, Ordering};
// Maximum number of return addresses captured per stack trace.
const MAX_STACK_DEPTH: usize = 64;
// Slot count of the open-addressed callsite hash table.
const CALLSITE_CAPACITY: usize = 8192;
// Slot count of the live-allocation pointer table.
const ALLOC_TABLE_CAPACITY: usize = 256 * 1024;
// Marker left in AllocEntry::ptr after a free so linear-probe chains stay intact.
const TOMBSTONE: u64 = u64::MAX;
// NUL-terminated POSIX shared-memory object name passed to shm_open.
const SHM_PATH: &[u8] = b"/rsprof-trace\0";
// Header magic ("RSPROFS3" in ASCII) used by readers to validate the mapping.
const MAGIC: u64 = 0x5253_5052_4F46_5333;
// Shared-memory layout version; bump when any struct or capacity changes.
const VERSION: u32 = 3;
/// Aggregated profiling counters for one unique (hashed) call stack.
///
/// Lives in shared memory inside an open-addressed table, so every field
/// mutated after publication is an atomic; the profiled process and an
/// external reader may access slots concurrently without locks.
#[repr(C)]
pub struct CallsiteStats {
/// Stack hash key; 0 marks an empty slot (stack_key_* never return 0).
/// Claimed via CAS in find_or_create_callsite.
pub hash: AtomicU64,
/// Number of allocations attributed to this callsite.
pub alloc_count: AtomicU64,
/// Total bytes allocated by this callsite.
pub alloc_bytes: AtomicU64,
/// Number of frees whose allocation traced back to this callsite.
pub free_count: AtomicU64,
/// Total bytes freed.
pub free_bytes: AtomicU64,
/// SIGPROF samples attributed to this stack (cpu feature).
pub cpu_samples: AtomicU64,
/// Number of valid entries in `stack`.
pub stack_depth: AtomicU32,
/// Explicit padding so `stack` stays 8-byte aligned in the C layout.
pub _reserved: u32,
/// Captured return addresses, innermost frame first.
pub stack: [AtomicU64; MAX_STACK_DEPTH],
}
/// One slot of the live-allocation table, mapping a heap pointer to the
/// callsite that produced it.
#[repr(C)]
pub struct AllocEntry {
/// Allocation address; 0 = empty slot, u64::MAX (TOMBSTONE) = freed slot.
pub ptr: AtomicU64,
/// Allocation size in bytes.
pub size: AtomicU64,
/// Key into the callsite table (matches CallsiteStats::hash).
pub callsite_hash: AtomicU64,
}
/// Fixed header at offset 0 of the shared-memory region; written once by
/// init() and read by external consumers to validate and size the tables.
#[repr(C)]
pub struct StatsHeader {
/// Must equal MAGIC.
pub magic: u64,
/// Layout version (VERSION).
pub version: u32,
/// Number of CallsiteStats slots following the header.
pub callsite_capacity: u32,
/// Number of AllocEntry slots following the callsite table.
pub alloc_table_capacity: u32,
/// PID of the profiled process.
pub pid: u32,
}
// Set by the first init() caller (swap); guards double-initialization.
static INITIALIZED: AtomicBool = AtomicBool::new(false);
// Re-entrancy flag: set while the SIGPROF handler runs so the heap hooks
// do not recurse into profiling code from inside the handler.
static IN_SIGNAL_HANDLER: AtomicBool = AtomicBool::new(false);
// Base of the mmap'd shared-memory region; null until init() succeeds.
// NOTE(review): plain `static mut` with no synchronization beyond the
// INITIALIZED flag — assumes init() completes before other threads read it.
static mut SHM_BASE: *mut u8 = core::ptr::null_mut();
/// Raw pointer to the StatsHeader at offset 0 of the shared mapping.
/// Null (and must not be dereferenced) before init() succeeds.
#[inline]
fn get_header() -> *mut StatsHeader {
    unsafe { SHM_BASE.cast::<StatsHeader>() }
}
/// Raw pointer to the first CallsiteStats slot, which starts immediately
/// after the header in the shared mapping.
#[inline]
fn get_callsites() -> *mut CallsiteStats {
    let offset = core::mem::size_of::<StatsHeader>();
    unsafe { SHM_BASE.add(offset).cast::<CallsiteStats>() }
}
/// Raw pointer to the first AllocEntry slot, located after the header and
/// the full callsite table in the shared mapping.
#[inline]
fn get_alloc_table() -> *mut AllocEntry {
    let offset = core::mem::size_of::<StatsHeader>()
        + CALLSITE_CAPACITY * core::mem::size_of::<CallsiteStats>();
    unsafe { SHM_BASE.add(offset).cast::<AllocEntry>() }
}
/// True once init() has successfully mapped the shared-memory region.
#[inline]
fn shm_ready() -> bool {
    let base = unsafe { SHM_BASE };
    !base.is_null()
}
/// FNV-style hash key for a heap-allocation stack.
///
/// Skips the first 4 frames (the profiler's own hook/allocator wrappers)
/// and mixes at most the next 8 return addresses. Returns 1 instead of 0
/// so the key never collides with the table's empty-slot sentinel.
#[inline]
fn stack_key_heap(stack: &[u64], depth: u32) -> u64 {
    const FNV_PRIME: u64 = 0x100000001b3;
    let depth = depth as usize;
    let skip = depth.min(4);
    let take = depth.saturating_sub(skip).min(8);
    let mixed = stack[skip..skip + take]
        .iter()
        .fold(0u64, |acc, &addr| (acc ^ addr).wrapping_mul(FNV_PRIME));
    if mixed == 0 {
        1
    } else {
        mixed
    }
}
/// FNV-style hash key for a CPU-sample stack: mixes at most the first 6
/// return addresses (no skip — frame 0 is the interrupted PC).
/// Returns 1 instead of 0 so the key never collides with the empty-slot
/// sentinel in the callsite table.
#[inline]
fn stack_key_cpu(stack: &[u64], depth: u32) -> u64 {
    const FNV_PRIME: u64 = 0x100000001b3;
    let take = (depth as usize).min(6);
    let mixed = stack[..take]
        .iter()
        .fold(0u64, |acc, &addr| (acc ^ addr).wrapping_mul(FNV_PRIME));
    if mixed == 0 {
        1
    } else {
        mixed
    }
}
/// Find the table slot for `hash`, claiming an empty slot (and recording the
/// stack) when the callsite is new. Lock-free: slots are claimed by CAS-ing
/// the `hash` field from 0; losers of the race re-check whether the winner
/// claimed the same hash, otherwise keep linear-probing.
///
/// NOTE(review): when the table is completely full the trailing `callsites`
/// expression returns a pointer to slot 0, silently merging overflow samples
/// into whatever callsite occupies that slot.
/// NOTE(review): `stack_depth`/`stack` are written *after* the hash CAS
/// publishes the slot, so a concurrent reader can briefly observe a claimed
/// slot with an empty stack.
#[inline]
fn find_or_create_callsite(
hash: u64,
stack: &[u64; MAX_STACK_DEPTH],
depth: u32,
) -> *mut CallsiteStats {
let callsites = get_callsites();
let mut idx = (hash as usize) % CALLSITE_CAPACITY;
for _ in 0..CALLSITE_CAPACITY {
let entry = unsafe { callsites.add(idx) };
let stored_hash = unsafe { (*entry).hash.load(Ordering::Acquire) };
// Fast path: the callsite already exists.
if stored_hash == hash {
return entry;
}
if stored_hash == 0 {
// Empty slot — try to claim it by installing our hash.
if unsafe {
(*entry)
.hash
.compare_exchange(0, hash, Ordering::AcqRel, Ordering::Relaxed)
.is_ok()
} {
// We own the slot: record the captured stack (truncated to the
// fixed array size).
unsafe {
(*entry).stack_depth.store(depth, Ordering::Relaxed);
for i in 0..(depth as usize).min(MAX_STACK_DEPTH) {
(*entry).stack[i].store(stack[i], Ordering::Relaxed);
}
}
return entry;
}
// Lost the CAS race: if the winner installed our hash, share the
// slot; otherwise fall through and probe the next one.
let new_hash = unsafe { (*entry).hash.load(Ordering::Acquire) };
if new_hash == hash {
return entry;
}
}
idx = (idx + 1) % CALLSITE_CAPACITY;
}
// Table full — fall back to slot 0 (see review note above).
callsites
}
/// Read-only lookup of an existing callsite by hash.
/// Linear-probes from the hash's home slot; an empty (hash == 0) slot ends
/// the chain, meaning the callsite was never created. Returns null when the
/// hash is absent or the whole table was scanned.
#[inline]
fn find_callsite(hash: u64) -> *mut CallsiteStats {
    let table = get_callsites();
    let mut slot = (hash as usize) % CALLSITE_CAPACITY;
    let mut probes = 0usize;
    while probes < CALLSITE_CAPACITY {
        let entry = unsafe { table.add(slot) };
        match unsafe { (*entry).hash.load(Ordering::Acquire) } {
            h if h == hash => return entry,
            0 => return core::ptr::null_mut(),
            _ => {
                slot = (slot + 1) % CALLSITE_CAPACITY;
                probes += 1;
            }
        }
    }
    core::ptr::null_mut()
}
/// Insert `ptr` into the live-allocation table so the owning callsite can be
/// credited when the pointer is freed. Lock-free open addressing: a slot is
/// claimed by CAS-ing its `ptr` field from empty (0) or TOMBSTONE.
/// Probing is capped at 1024 slots; on a full or clustered table the record
/// is silently dropped so the allocation hot path stays bounded.
#[inline]
fn track_alloc(ptr: u64, size: u64, callsite_hash: u64) {
let alloc_table = get_alloc_table();
// >>4 discards low bits that are zero for typically-aligned heap pointers,
// spreading neighboring allocations across slots.
let mut idx = ((ptr >> 4) as usize) % ALLOC_TABLE_CAPACITY;
for _ in 0..1024 {
let entry = unsafe { alloc_table.add(idx) };
let stored_ptr = unsafe { (*entry).ptr.load(Ordering::Acquire) };
if stored_ptr == 0 || stored_ptr == TOMBSTONE {
// Claimable slot — CAS against the exact value we observed so a
// racing writer cannot be overwritten.
if unsafe {
(*entry)
.ptr
.compare_exchange(stored_ptr, ptr, Ordering::AcqRel, Ordering::Relaxed)
.is_ok()
} {
// Slot claimed; publish the payload. The Release store of
// callsite_hash pairs with the Acquire load in untrack_alloc.
unsafe {
(*entry).size.store(size, Ordering::Relaxed);
(*entry)
.callsite_hash
.store(callsite_hash, Ordering::Release);
}
return;
}
}
// Occupied slot or lost CAS race: linear-probe the next slot.
idx = (idx + 1) % ALLOC_TABLE_CAPACITY;
}
}
/// Look up and remove `ptr` from the live-allocation table, returning
/// `Some((size, callsite_hash))` when the pointer was tracked.
/// The slot is overwritten with TOMBSTONE (not 0) so probe chains passing
/// through it keep working. Probing mirrors track_alloc: same home slot and
/// the same 1024-slot cap; an empty (0) slot proves the pointer was never
/// recorded (or its insert was dropped by track_alloc's probe cap).
#[inline]
fn untrack_alloc(ptr: u64) -> Option<(u64, u64)> {
let alloc_table = get_alloc_table();
let mut idx = ((ptr >> 4) as usize) % ALLOC_TABLE_CAPACITY;
for _ in 0..1024 {
let entry = unsafe { alloc_table.add(idx) };
let stored_ptr = unsafe { (*entry).ptr.load(Ordering::Acquire) };
if stored_ptr == ptr {
// NOTE(review): this read-payload-then-tombstone sequence is not
// atomic as a unit; a concurrent re-allocation landing at the same
// address could interleave with it.
let size = unsafe { (*entry).size.load(Ordering::Relaxed) };
let callsite_hash = unsafe { (*entry).callsite_hash.load(Ordering::Acquire) };
unsafe { (*entry).ptr.store(TOMBSTONE, Ordering::Release) };
return Some((size, callsite_hash));
}
if stored_ptr == 0 {
return None;
}
idx = (idx + 1) % ALLOC_TABLE_CAPACITY;
}
None
}
/// One-shot initializer: creates the POSIX shared-memory object, maps it,
/// zeroes it, and writes the StatsHeader. Safe to call repeatedly; losers of
/// the INITIALIZED race return immediately. On any libc failure the flag is
/// reset so a later call may retry, and SHM_BASE stays null (shm_ready()
/// keeps the hooks inert).
pub fn init() {
if INITIALIZED.swap(true, Ordering::SeqCst) {
return;
}
unsafe {
// Total mapping = header + callsite table + live-allocation table.
let header_size = core::mem::size_of::<StatsHeader>();
let callsites_size = CALLSITE_CAPACITY * core::mem::size_of::<CallsiteStats>();
let alloc_table_size = ALLOC_TABLE_CAPACITY * core::mem::size_of::<AllocEntry>();
let total_size = header_size + callsites_size + alloc_table_size;
// Drop any stale object from a previous run, then create fresh with
// O_EXCL. NOTE(review): unlink+create is not atomic across processes.
libc::shm_unlink(SHM_PATH.as_ptr() as *const libc::c_char);
let fd = libc::shm_open(
SHM_PATH.as_ptr() as *const libc::c_char,
libc::O_CREAT | libc::O_RDWR | libc::O_EXCL,
0o666,
);
if fd < 0 {
INITIALIZED.store(false, Ordering::SeqCst);
return;
}
// Size the object before mapping it.
if libc::ftruncate(fd, total_size as libc::off_t) < 0 {
libc::close(fd);
INITIALIZED.store(false, Ordering::SeqCst);
return;
}
let ptr = libc::mmap(
core::ptr::null_mut(),
total_size,
libc::PROT_READ | libc::PROT_WRITE,
libc::MAP_SHARED,
fd,
0,
);
// The mapping keeps the object alive; the descriptor is no longer needed.
libc::close(fd);
if ptr == libc::MAP_FAILED {
INITIALIZED.store(false, Ordering::SeqCst);
return;
}
SHM_BASE = ptr as *mut u8;
// Explicit zeroing; presumably ftruncate-extended shm memory is already
// zero-filled on POSIX, so this is belt-and-braces — TODO confirm.
core::ptr::write_bytes(SHM_BASE, 0, total_size);
let header = get_header();
(*header).magic = MAGIC;
(*header).version = VERSION;
(*header).callsite_capacity = CALLSITE_CAPACITY as u32;
(*header).alloc_table_capacity = ALLOC_TABLE_CAPACITY as u32;
(*header).pid = libc::getpid() as u32;
}
}
/// Capture the current thread's call stack by walking frame pointers,
/// starting from this function's own frame (null tells the walker to read
/// RBP itself). Returns the number of frames written into `stack`.
#[inline(never)]
fn capture_stack(stack: &mut [u64; MAX_STACK_DEPTH]) -> u32 {
capture_stack_from_fp(stack, core::ptr::null())
}
/// Walk the frame-pointer chain starting at `start_fp` (or this function's
/// own RBP when null), storing return addresses into `stack` and returning
/// the number of frames captured.
///
/// x86_64-only: reads RBP via inline asm and assumes the conventional
/// [saved-rbp, return-addr] frame layout, so the profiled code must be built
/// with frame pointers enabled. NOTE(review): there is no
/// #[cfg(target_arch = "x86_64")] guard — this will not compile on other
/// architectures.
#[inline(never)]
fn capture_stack_from_fp(stack: &mut [u64; MAX_STACK_DEPTH], start_fp: *const usize) -> u32 {
let mut depth = 0u32;
unsafe {
let mut fp: *const usize = if start_fp.is_null() {
let current_fp: *const usize;
core::arch::asm!(
"mov {}, rbp",
out(reg) current_fp,
options(nomem, nostack, preserves_flags)
);
current_fp
} else {
start_fp
};
while !fp.is_null() && depth < MAX_STACK_DEPTH as u32 {
// Sanity check: frame pointers must be 8-byte aligned...
if (fp as usize) & 0x7 != 0 {
break;
}
// ...and fall inside the canonical x86_64 user-space address range.
let fp_val = fp as usize;
if !(0x1000..=0x7fff_ffff_ffff).contains(&fp_val) {
break;
}
// The return address sits one word above the saved frame pointer.
let ret_addr = *fp.add(1);
if ret_addr == 0 {
break;
}
stack[depth as usize] = ret_addr as u64;
depth += 1;
// Stacks grow downward, so each caller's frame must be at a strictly
// higher address; anything else means the chain is corrupt or done.
let next_fp = *fp as *const usize;
if next_fp <= fp {
break;
}
fp = next_fp;
}
}
depth
}
/// Heap-allocation hook: attributes `size` bytes to the calling stack's
/// callsite and records `ptr` in the live-allocation table so a later
/// record_dealloc can credit the free back to the same callsite.
///
/// Re-entrant calls from the SIGPROF handler are suppressed, the shared
/// memory region is lazily initialized, and null pointers (failed
/// allocations) are ignored: address 0 is the alloc table's empty-slot
/// sentinel, so tracking it would corrupt the table and inflate counters
/// with bytes that were never allocated.
#[cfg(feature = "heap")]
#[inline(never)]
pub fn record_alloc(ptr: *mut u8, size: usize) {
    if IN_SIGNAL_HANDLER.load(Ordering::Relaxed) {
        return;
    }
    // A failed allocation returns null; don't count or track it (0 marks an
    // empty slot in the alloc table).
    if ptr.is_null() {
        return;
    }
    if !INITIALIZED.load(Ordering::Relaxed) {
        init();
    }
    if !shm_ready() {
        return;
    }
    let mut stack = [0u64; MAX_STACK_DEPTH];
    let depth = capture_stack(&mut stack);
    // stack_key_heap skips the first 4 frames, dropping the profiler's own
    // wrapper frames from the key.
    let hash = stack_key_heap(&stack, depth);
    let callsite = find_or_create_callsite(hash, &stack, depth);
    unsafe {
        (*callsite).alloc_count.fetch_add(1, Ordering::Relaxed);
        (*callsite)
            .alloc_bytes
            .fetch_add(size as u64, Ordering::Relaxed);
    }
    track_alloc(ptr as u64, size as u64, hash);
}
/// Heap-deallocation hook: looks `ptr` up in the live-allocation table and,
/// when found, credits the freed bytes back to the callsite that allocated
/// them. Pointers that were never tracked (allocated before init, or whose
/// insert was dropped) are ignored.
#[cfg(feature = "heap")]
#[inline(never)]
pub fn record_dealloc(ptr: *mut u8, _size: usize) {
    // Never recurse from inside the signal handler, and do nothing until the
    // shared-memory region exists.
    if IN_SIGNAL_HANDLER.load(Ordering::Relaxed) {
        return;
    }
    if !INITIALIZED.load(Ordering::Relaxed) || !shm_ready() {
        return;
    }
    let (size, callsite_hash) = match untrack_alloc(ptr as u64) {
        Some(found) => found,
        None => return,
    };
    let callsite = find_callsite(callsite_hash);
    if callsite.is_null() {
        return;
    }
    unsafe {
        (*callsite).free_count.fetch_add(1, Ordering::Relaxed);
        (*callsite).free_bytes.fetch_add(size, Ordering::Relaxed);
    }
}
// No-op stubs so call sites compile unchanged when the "heap" feature is off.
#[cfg(not(feature = "heap"))]
#[inline]
pub fn record_alloc(_ptr: *mut u8, _size: usize) {}
#[cfg(not(feature = "heap"))]
#[inline]
pub fn record_dealloc(_ptr: *mut u8, _size: usize) {}
/// SIGPROF-based CPU sampling: an ITIMER_PROF timer delivers SIGPROF at a
/// fixed frequency; the handler unwinds the interrupted thread and bumps the
/// matching callsite's cpu_samples counter.
#[cfg(feature = "cpu")]
mod cpu_profiling {
use super::*;
// Default sampling frequency when the caller passes 0 (99 Hz avoids
// lockstep with 100 Hz kernel timers).
const DEFAULT_FREQ_HZ: u32 = 99;
// SIGPROF handler. Touches only atomics and raw pointer reads, keeping it
// async-signal-safe.
extern "C" fn cpu_sample_handler(
_sig: libc::c_int,
_info: *mut libc::siginfo_t,
ucontext: *mut libc::c_void,
) {
// Mutual-exclusion/re-entrancy guard shared with the heap hooks.
// NOTE(review): process-global, not per-thread — concurrent SIGPROF
// deliveries on other threads drop their samples; confirm intended.
if IN_SIGNAL_HANDLER.swap(true, Ordering::SeqCst) {
return;
}
if !shm_ready() {
IN_SIGNAL_HANDLER.store(false, Ordering::SeqCst);
return;
}
// Pull the interrupted instruction pointer and frame pointer out of the
// saved user context.
let (rip, start_fp) = if !ucontext.is_null() {
unsafe {
let uc = ucontext as *const libc::ucontext_t;
// gregs indices for x86_64 glibc — TODO confirm for other libcs.
const REG_RIP: usize = 16;
const REG_RBP: usize = 10;
let rip = (*uc).uc_mcontext.gregs[REG_RIP] as u64;
let rbp = (*uc).uc_mcontext.gregs[REG_RBP] as usize;
(rip, rbp as *const usize)
}
} else {
(0, core::ptr::null())
};
let mut stack = [0u64; MAX_STACK_DEPTH];
let mut depth = 0u32;
// Frame 0 is the interrupted PC itself; the FP walk fills the callers.
if rip != 0 {
stack[0] = rip;
depth = 1;
}
if !start_fp.is_null() {
// Frame-pointer walk, duplicating capture_stack_from_fp's validity
// heuristics: aligned, canonical-range, strictly increasing frames.
let mut fp = start_fp;
while !fp.is_null() && (depth as usize) < MAX_STACK_DEPTH {
if (fp as usize) & 0x7 != 0 {
break;
}
let fp_val = fp as usize;
if !(0x1000..=0x7fff_ffff_ffff).contains(&fp_val) {
break;
}
let ret_addr = unsafe { *fp.add(1) };
if ret_addr == 0 {
break;
}
stack[depth as usize] = ret_addr as u64;
depth += 1;
let next_fp = unsafe { *fp as *const usize };
if next_fp <= fp {
break;
}
fp = next_fp;
}
}
let hash = stack_key_cpu(&stack, depth);
let callsite = find_or_create_callsite(hash, &stack, depth);
unsafe { (*callsite).cpu_samples.fetch_add(1, Ordering::Relaxed) };
IN_SIGNAL_HANDLER.store(false, Ordering::SeqCst);
}
/// Install the SIGPROF handler and arm ITIMER_PROF at `freq_hz` samples per
/// second (0 selects DEFAULT_FREQ_HZ). A failing sigaction aborts silently.
pub fn start_cpu_profiling(freq_hz: u32) {
if !INITIALIZED.load(Ordering::Relaxed) {
init();
}
unsafe {
let mut sa: libc::sigaction = core::mem::zeroed();
sa.sa_sigaction = cpu_sample_handler as *const () as usize;
// SA_SIGINFO selects the 3-argument handler; SA_RESTART keeps most
// syscalls from failing with EINTR under sampling.
sa.sa_flags = libc::SA_RESTART | libc::SA_SIGINFO;
libc::sigemptyset(&mut sa.sa_mask);
if libc::sigaction(libc::SIGPROF, &sa, core::ptr::null_mut()) < 0 {
return;
}
let freq = if freq_hz == 0 {
DEFAULT_FREQ_HZ
} else {
freq_hz
};
// NOTE(review): freq > 1_000_000 yields a zero interval, which
// disarms the timer instead of sampling — consider clamping.
let interval_usec = 1_000_000 / freq as i64;
let timer = libc::itimerval {
it_interval: libc::timeval {
tv_sec: 0,
tv_usec: interval_usec,
},
it_value: libc::timeval {
tv_sec: 0,
tv_usec: interval_usec,
},
};
libc::setitimer(libc::ITIMER_PROF, &timer, core::ptr::null_mut());
}
}
/// Disarm ITIMER_PROF (all-zero itimerval) and restore the default SIGPROF
/// disposition.
pub fn stop_cpu_profiling() {
unsafe {
let timer = libc::itimerval {
it_interval: libc::timeval {
tv_sec: 0,
tv_usec: 0,
},
it_value: libc::timeval {
tv_sec: 0,
tv_usec: 0,
},
};
libc::setitimer(libc::ITIMER_PROF, &timer, core::ptr::null_mut());
let mut sa: libc::sigaction = core::mem::zeroed();
sa.sa_sigaction = libc::SIG_DFL;
libc::sigaction(libc::SIGPROF, &sa, core::ptr::null_mut());
}
}
}
#[cfg(feature = "cpu")]
pub use cpu_profiling::{start_cpu_profiling, stop_cpu_profiling};
// No-op stubs so call sites compile unchanged when the "cpu" feature is off.
#[cfg(not(feature = "cpu"))]
pub fn start_cpu_profiling(_freq_hz: u32) {}
#[cfg(not(feature = "cpu"))]
pub fn stop_cpu_profiling() {}